blob: afec8bd84fe056914930c157e65d2272baba01f9 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
/*
 * Size constants for this file.  Their exact use is in code further down;
 * presumably name-length and scratch-buffer limits -- confirm at the users.
 */
#define HTML_MAX_NAMELEN                1000
#define HTML_PARSER_BIG_BUFFER_SIZE     1000
#define HTML_PARSER_BUFFER_SIZE         100

/* Uncomment to enable the debugging code paths. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * File-local flag, initialised to 1.
 * NOTE(review): looks like it selects whether omitted elements get a
 * default value -- confirm against the code that reads it.
 */
static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000150 * Parser stacks related functions and macros *
151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000195 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000198 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000204 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * All of them expect a variable named "ctxt" (the parser context) to be in
 * scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them; they work on raw bytes and should only be used to compare against
 * or skip over ASCII:
 *
 *   CUR_PTR   the current pointer to the xmlChar to be parsed.
 *   CUR       the current xmlChar value, as an int.
 *   CURRENT   same value as CUR.
 *   RAW       the current xmlChar, or -1 when ctxt->token is set.
 *   UPPER     the current xmlChar converted to uppercase.
 *   NXT(n)    the n'th next xmlChar.
 *   UPP(n)    the n'th next xmlChar converted to uppercase.
 *   SKIP(n)   skip n xmlChars, advancing nbChars and the column by n; the
 *             line counter is not touched, so it must only be used on
 *             strings without newlines.
 *
 * Input buffer management:
 *
 *   SHRINK    call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *             bytes precede the cursor and fewer than 2 * INPUT_CHUNK
 *             remain after it.
 *   GROW      in non progressive mode, call xmlParserInputGrow() when
 *             fewer than INPUT_CHUNK bytes remain after the cursor.
 *
 *   SHRINK, GROW and COPY_BUF expand to a bare "if" statement: always use
 *   them as a full statement, and never as the body of an if/else without
 *   braces.
 *
 * Character level macros:
 *
 *   NEXT          skip to the next character using xmlNextChar().
 *   NEXTL(l)      skip the current character of l xmlChars long, updating
 *                 line/column and resetting ctxt->token.
 *   CUR_CHAR(l)   the current character through htmlCurrentChar(); its
 *                 length is stored in l.
 *   CUR_SCHAR(s, l)  the current character of string s through
 *                 xmlStringCurrentChar(); its length is stored in l.
 *   COPY_BUF(l,b,i,v)  append character v of encoded length l to buffer b
 *                 at index i, advancing i.
 *   SKIP_BLANKS   skip blanks using htmlSkipBlankChars().
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), \
                   ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
289
290/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200291 * htmlFindEncoding:
292 * @the HTML parser context
293 *
294 * Ty to find and encoding in the current data available in the input
295 * buffer this is needed to try to switch to the proper encoding when
296 * one face a character error.
297 * That's an heuristic, since it's operating outside of parsing it could
298 * try to use a meta which had been commented out, that's the reason it
299 * should only be used in case of error, not as a default.
300 *
301 * Returns an encoding string or NULL if not found, the string need to
302 * be freed
303 */
304static xmlChar *
305htmlFindEncoding(xmlParserCtxtPtr ctxt) {
306 const xmlChar *start, *cur, *end;
307
308 if ((ctxt == NULL) || (ctxt->input == NULL) ||
309 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
310 (ctxt->input->buf->encoder != NULL))
311 return(NULL);
312 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
313 return(NULL);
314
315 start = ctxt->input->cur;
316 end = ctxt->input->end;
317 /* we also expect the input buffer to be zero terminated */
318 if (*end != 0)
319 return(NULL);
320
321 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
322 if (cur == NULL)
323 return(NULL);
324 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
325 if (cur == NULL)
326 return(NULL);
327 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
328 if (cur == NULL)
329 return(NULL);
330 cur += 8;
331 start = cur;
332 while (((*cur >= 'A') && (*cur <= 'Z')) ||
333 ((*cur >= 'a') && (*cur <= 'z')) ||
334 ((*cur >= '0') && (*cur <= '9')) ||
335 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
336 cur++;
337 if (cur == start)
338 return(NULL);
339 return(xmlStrndup(start, cur - start));
340}
341
342/**
Owen Taylor3473f882001-02-23 17:55:21 +0000343 * htmlCurrentChar:
344 * @ctxt: the HTML parser context
345 * @len: pointer to the length of the char read
346 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000347 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000348 * bytes in the input buffer. Implement the end of line normalization:
349 * 2.11 End-of-Line Handling
350 * If the encoding is unspecified, in the case we find an ISO-Latin-1
351 * char, then the encoding converter is plugged in automatically.
352 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000353 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000354 */
355
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000356static int
Owen Taylor3473f882001-02-23 17:55:21 +0000357htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
358 if (ctxt->instate == XML_PARSER_EOF)
359 return(0);
360
361 if (ctxt->token != 0) {
362 *len = 0;
363 return(ctxt->token);
364 }
365 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
366 /*
367 * We are supposed to handle UTF8, check it's valid
368 * From rfc2044: encoding of the Unicode values on UTF-8:
369 *
370 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
371 * 0000 0000-0000 007F 0xxxxxxx
372 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
373 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
374 *
375 * Check for the 0x110000 limit too
376 */
377 const unsigned char *cur = ctxt->input->cur;
378 unsigned char c;
379 unsigned int val;
380
381 c = *cur;
382 if (c & 0x80) {
383 if (cur[1] == 0)
384 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
385 if ((cur[1] & 0xc0) != 0x80)
386 goto encoding_error;
387 if ((c & 0xe0) == 0xe0) {
388
389 if (cur[2] == 0)
390 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
391 if ((cur[2] & 0xc0) != 0x80)
392 goto encoding_error;
393 if ((c & 0xf0) == 0xf0) {
394 if (cur[3] == 0)
395 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
396 if (((c & 0xf8) != 0xf0) ||
397 ((cur[3] & 0xc0) != 0x80))
398 goto encoding_error;
399 /* 4-byte code */
400 *len = 4;
401 val = (cur[0] & 0x7) << 18;
402 val |= (cur[1] & 0x3f) << 12;
403 val |= (cur[2] & 0x3f) << 6;
404 val |= cur[3] & 0x3f;
405 } else {
406 /* 3-byte code */
407 *len = 3;
408 val = (cur[0] & 0xf) << 12;
409 val |= (cur[1] & 0x3f) << 6;
410 val |= cur[2] & 0x3f;
411 }
412 } else {
413 /* 2-byte code */
414 *len = 2;
415 val = (cur[0] & 0x1f) << 6;
416 val |= cur[1] & 0x3f;
417 }
418 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000419 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
420 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000421 }
422 return(val);
423 } else {
424 /* 1-byte code */
425 *len = 1;
426 return((int) *ctxt->input->cur);
427 }
428 }
429 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000430 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000431 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000432 * XML constructs only use < 128 chars
433 */
434 *len = 1;
435 if ((int) *ctxt->input->cur < 0x80)
436 return((int) *ctxt->input->cur);
437
438 /*
439 * Humm this is bad, do an automatic flow conversion
440 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200441 {
442 xmlChar * guess;
443 xmlCharEncodingHandlerPtr handler;
444
445 guess = htmlFindEncoding(ctxt);
446 if (guess == NULL) {
447 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
448 } else {
449 if (ctxt->input->encoding != NULL)
450 xmlFree((xmlChar *) ctxt->input->encoding);
451 ctxt->input->encoding = guess;
452 handler = xmlFindCharEncodingHandler((const char *) guess);
453 if (handler != NULL) {
454 xmlSwitchToEncoding(ctxt, handler);
455 } else {
456 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
457 "Unsupported encoding %s", guess, NULL);
458 }
459 }
460 ctxt->charset = XML_CHAR_ENCODING_UTF8;
461 }
462
Owen Taylor3473f882001-02-23 17:55:21 +0000463 return(xmlCurrentChar(ctxt, len));
464
465encoding_error:
466 /*
467 * If we detect an UTF8 error that probably mean that the
468 * input encoding didn't get properly advertized in the
469 * declaration header. Report the error and switch the encoding
470 * to ISO-Latin-1 (if you don't like this policy, just declare the
471 * encoding !)
472 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000473 {
474 char buffer[150];
475
Daniel Veillard861101d2007-06-12 08:38:57 +0000476 if (ctxt->input->end - ctxt->input->cur >= 4) {
477 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
478 ctxt->input->cur[0], ctxt->input->cur[1],
479 ctxt->input->cur[2], ctxt->input->cur[3]);
480 } else {
481 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
482 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000483 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
484 "Input is not proper UTF-8, indicate encoding !\n",
485 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000486 }
487
488 ctxt->charset = XML_CHAR_ENCODING_8859_1;
489 *len = 1;
490 return((int) *ctxt->input->cur);
491}
492
493/**
Owen Taylor3473f882001-02-23 17:55:21 +0000494 * htmlSkipBlankChars:
495 * @ctxt: the HTML parser context
496 *
497 * skip all blanks character found at that point in the input streams.
498 *
499 * Returns the number of space chars skipped
500 */
501
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000502static int
Owen Taylor3473f882001-02-23 17:55:21 +0000503htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
504 int res = 0;
505
William M. Brack76e95df2003-10-18 16:20:14 +0000506 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000507 if ((*ctxt->input->cur == 0) &&
508 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
509 xmlPopInput(ctxt);
510 } else {
511 if (*(ctxt->input->cur) == '\n') {
512 ctxt->input->line++; ctxt->input->col = 1;
513 } else ctxt->input->col++;
514 ctxt->input->cur++;
515 ctxt->nbChars++;
516 if (*ctxt->input->cur == 0)
517 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
518 }
519 res++;
520 }
521 return(res);
522}
523
524
525
526/************************************************************************
527 * *
528 * The list of HTML elements and their properties *
529 * *
530 ************************************************************************/
531
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */

/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each group macro expands to a comma separated list of element names with
 * no trailing comma, and each NB_xxx macro is the number of names in the
 * matching group.  Groups built from other groups must separate them with
 * commas: two adjacent string literals would otherwise be pasted together
 * into a single bogus name.  PCDATA and MODIFIER are empty.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow (block + inline) and inline elements */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
578
579
/*
 * ... and for HTML Attributes
 *
 * Each group macro expands to a comma separated list of attribute names
 * and each NB_xxx macro is the number of names in the matching group.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated attribute name lists built from the groups above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000599
600
601/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000602static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000603 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
604 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000605static const char* const target_attr[] = { "target", NULL } ;
606static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
607static const char* const alt_attr[] = { "alt", NULL } ;
608static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
609static const char* const href_attrs[] = { "href", NULL } ;
610static const char* const clear_attrs[] = { "clear", NULL } ;
611static const char* const inline_p[] = { INLINE, "p", NULL } ;
612
613static const char* const flow_param[] = { FLOW, "param", NULL } ;
614static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000615 "archive", "alt", "name", "height", "width", "align",
616 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000617static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000618 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000619static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000620 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000621static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
622static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
623static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
624static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000625 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000626static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000627 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
628
629
Daniel Veillard065abe82006-07-03 08:55:04 +0000630static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
631static const char* const col_elt[] = { "col", NULL } ;
632static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
633static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
634static const char* const dl_contents[] = { "dt", "dd", NULL } ;
635static const char* const compact_attr[] = { "compact", NULL } ;
636static const char* const label_attr[] = { "label", NULL } ;
637static const char* const fieldset_contents[] = { FLOW, "legend" } ;
638static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
639static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
640static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
641static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
642static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
643static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
644static const char* const head_attrs[] = { I18N, "profile", NULL } ;
645static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
646static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
647static const char* const version_attr[] = { "version", NULL } ;
648static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
649static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
650static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000651static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
/*
 * Each array below is a NULL-terminated list of names. The entries of
 * html40ElementTable (further down) reference these lists through DECL.
 * NOTE(review): the upper-case identifiers used inside the initializers
 * (ATTRS, COREATTRS, I18N, BLOCK, FLOW, PHRASE, MODIFIER, CELLHALIGN,
 * CELLVALIGN) are not defined in this part of the file; presumably they are
 * macros expanding to further list entries - confirm against their
 * definitions.
 */
Daniel Veillard065abe82006-07-03 08:55:04 +0000652static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
653static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
654static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
655static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
656static const char* const align_attr[] = { "align", NULL } ;
657static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
658static const char* const map_contents[] = { BLOCK, "area", NULL } ;
659static const char* const name_attr[] = { "name", NULL } ;
660static const char* const action_attr[] = { "action", NULL } ;
661static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
662static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
663static const char* const content_attr[] = { "content", NULL } ;
664static const char* const type_attr[] = { "type", NULL } ;
665static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
666static const char* const object_contents[] = { FLOW, "param", NULL } ;
667static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
668static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
669static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
670static const char* const option_elt[] = { "option", NULL } ;
671static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
672static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
673static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
674static const char* const width_attr[] = { "width", NULL } ;
675static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
676static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
677static const char* const language_attr[] = { "language", NULL } ;
678static const char* const select_content[] = { "optgroup", "option", NULL } ;
679static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
680static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200681static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000682static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
683static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
684static const char* const tr_elt[] = { "tr", NULL } ;
685static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
686static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
687static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
688static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
689static const char* const tr_contents[] = { "th", "td", NULL } ;
690static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
691static const char* const li_elt[] = { "li", NULL } ;
692static const char* const ul_depr[] = { "type", "compact", NULL} ;
693static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000694
/*
 * DECL is the cast applied to the `const char* const []` name lists when
 * they are stored in the table below; it drops the const qualifier of the
 * array elements.
 * NOTE(review): presumably required because the list fields of htmlElemDesc
 * are declared `const char **` - confirm against the struct declaration,
 * which is not part of this file section.
 */
695#define DECL (const char**)
696
/*
 * Element description table, one entry per known element, kept in
 * alphabetical order of the element name (first field of each entry).
 * htmlTagLookup() scans the table linearly and compares names without
 * regard to case.
 * The order and meaning of the remaining fields are fixed by the
 * htmlElemDesc declaration, which is outside this file section; the only
 * fields read by the code visible here are `name` and `endTag`
 * (htmlAutoCloseOnClose() reports a mismatch when endTag == 3).
 * NOTE(review): EMPTY is defined elsewhere - presumably a NULL list; confirm.
 */
Daniel Veillard22090732001-07-16 00:06:07 +0000697static const htmlElemDesc
698html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000699{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
700 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
701},
702{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
703 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
704},
705{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
706 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
707},
708{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
709 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
710},
711{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
712 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
713},
714{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
715 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
716},
717{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
718 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
719},
720{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
721 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
722},
723{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
724 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
725},
726{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
727 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
728},
729{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
730 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
731},
732{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
733 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
734},
735{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
736 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
737},
738{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
739 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
740},
741{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
742 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
743},
744{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
745 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
746},
747{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
748 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
749},
750{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
751 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
752},
753{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
754 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
755},
756{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
757 EMPTY , NULL , DECL col_attrs , NULL, NULL
758},
759{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
760 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
761},
762{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
763 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
764},
765{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
766 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
767},
768{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
769 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
770},
771{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
772 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
773},
774{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
775 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
776},
777{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000778 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000779},
780{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
781 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
782},
783{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
784 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
785},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000786{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000787 EMPTY, NULL, DECL embed_attrs, NULL, NULL
788},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000789{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
790 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
791},
792{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
793 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
794},
795{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
796 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
797},
798{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
799 EMPTY, NULL, NULL, DECL frame_attrs, NULL
800},
801{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
802 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
803},
804{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
805 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
806},
807{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
808 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
809},
810{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
811 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
812},
813{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
814 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
815},
816{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
817 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
818},
819{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
820 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
821},
822{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
823 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
824},
825{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
826 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
827},
828{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
829 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
830},
831{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
832 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
833},
834{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
835 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
836},
837{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000838 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000839},
840{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
841 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
842},
843{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
844 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
845},
846{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
847 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
848},
849{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
850 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
851},
852{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
853 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
854},
855{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
856 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
857},
858{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
859 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
860},
861{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
862 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
863},
864{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000865 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000866},
867{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
868 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
869},
870{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
871 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
872},
873{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
874 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
875},
876{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
877 DECL html_flow, "div", DECL html_attrs, NULL, NULL
878},
879{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
880 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
881},
882{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
883 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
884},
885{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000886 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000887},
888{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
889 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
890},
891{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
892 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
893},
894{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000895 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000896},
897{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
898 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
899},
900{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
901 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
902},
903{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
904 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
905},
906{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
907 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
908},
909{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
910 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
911},
912{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
913 DECL select_content, NULL, DECL select_attrs, NULL, NULL
914},
915{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
916 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
917},
918{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
919 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
920},
921{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
922 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
923},
924{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
925 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
926},
927{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
928 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
929},
930{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
931 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
932},
933{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
934 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
935},
936{ "table", 0, 0, 0, 0, 0, 0, 0, "",
937 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
938},
939{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
940 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
941},
942{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
943 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
944},
945{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
946 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
947},
948{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
949 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
950},
951{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
952 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
953},
954{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
955 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
956},
957{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
958 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
959},
960{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
961 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
962},
963{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
964 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
965},
966{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
967 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
968},
969{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
970 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
971},
972{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
973 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
974}
Owen Taylor3473f882001-02-23 17:55:21 +0000975};
976
977/*
Owen Taylor3473f882001-02-23 17:55:21 +0000978 * start tags that imply the end of current element
979 */
/*
 * Layout: a sequence of NULL-terminated groups, followed by one extra NULL
 * that ends the whole table. The first string of a group is a start tag;
 * the remaining strings of the group are the open elements which
 * htmlCheckAutoClose() reports as closed by that start tag.
 * htmlInitAutoClose() records the beginning of each group in
 * htmlStartCloseIndex; that index has 100 slots of which only the first 99
 * are ever filled, so any group after the 99th would never be consulted.
 */
Daniel Veillard065abe82006-07-03 08:55:04 +0000980static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000981"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
982 "dl", "ul", "ol", "menu", "dir", "address", "pre",
983 "listing", "xmp", "head", NULL,
984"head", "p", NULL,
985"title", "p", NULL,
986"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +0000987"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +0000988"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
989 "pre", "listing", "xmp", "head", "li", NULL,
990"hr", "p", "head", NULL,
991"h1", "p", "head", NULL,
992"h2", "p", "head", NULL,
993"h3", "p", "head", NULL,
994"h4", "p", "head", NULL,
995"h5", "p", "head", NULL,
996"h6", "p", "head", NULL,
997"dir", "p", "head", NULL,
998"address", "p", "head", "ul", NULL,
999"pre", "p", "head", "ul", NULL,
1000"listing", "p", "head", NULL,
1001"xmp", "p", "head", NULL,
1002"blockquote", "p", "head", NULL,
1003"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1004 "xmp", "head", NULL,
1005"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1006 "head", "dd", NULL,
1007"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1008 "head", "dt", NULL,
1009"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1010 "listing", "xmp", NULL,
1011"ol", "p", "head", "ul", NULL,
1012"menu", "p", "head", "ul", NULL,
1013"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
1014"div", "p", "head", NULL,
1015"noscript", "p", "head", NULL,
1016"center", "font", "b", "i", "p", "head", NULL,
1017"a", "a", NULL,
1018"caption", "p", NULL,
1019"colgroup", "caption", "colgroup", "col", "p", NULL,
1020"col", "caption", "col", "p", NULL,
1021"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1022 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001023"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1024"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001025"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1026"thead", "caption", "col", "colgroup", NULL,
1027"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1028 "tbody", "p", NULL,
1029"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1030 "tfoot", "tbody", "p", NULL,
1031"optgroup", "option", NULL,
1032"option", "option", NULL,
1033"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1034 "pre", "listing", "xmp", "a", NULL,
1035NULL
1036};
1037
1038/*
1039 * The list of HTML elements which are supposed not to have
1040 * CDATA content and where a p element will be implied
1041 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001042 * TODO: extend that list by reading the HTML SGML DTD on
Owen Taylor3473f882001-02-23 17:55:21 +00001043 * implied paragraph
1044 */
/*
 * NULL-terminated list of element names.
 * NOTE(review): the code consuming this list is not part of this file
 * section; presumably the paragraph-implication check walks it up to the
 * NULL terminator - confirm before reordering or extending it.
 */
Daniel Veillard065abe82006-07-03 08:55:04 +00001045static const char *const htmlNoContentElements[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001046 "html",
1047 "head",
Owen Taylor3473f882001-02-23 17:55:21 +00001048 NULL
1049};
1050
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 * The array has no NULL terminator: its length must be derived with
 * sizeof(htmlScriptAttributes) / sizeof(htmlScriptAttributes[0]).
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
1076
Daniel Veillarda2bc3682001-05-03 08:27:20 +00001077/*
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001078 * This table is used by the htmlparser to know what to do with
1079 * broken html pages. By assigning different priorities to different
1080 * elements the parser can decide how to handle extra endtags.
1081 * Endtags are only allowed to close elements with lower or equal
1082 * priority.
1083 */
Daniel Veillarda2bc3682001-05-03 08:27:20 +00001084
/* One row of the end-tag priority table: an element name and its priority. */
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001085typedef struct {
1086 const char *name; /* element name; NULL marks the final, default row */
1087 int priority; /* value returned by htmlGetEndPriority() for that name */
1088} elementPriority;
1089
/*
 * Priority table searched by htmlGetEndPriority() with an exact string
 * comparison. The search stops on the row whose name is NULL, so that row
 * must stay last; its priority (100) is what every element not listed here
 * gets.
 */
Daniel Veillard22090732001-07-16 00:06:07 +00001090static const elementPriority htmlEndPriority[] = {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001091 {"div", 150},
1092 {"td", 160},
1093 {"th", 160},
1094 {"tr", 170},
1095 {"thead", 180},
1096 {"tbody", 180},
1097 {"tfoot", 180},
1098 {"table", 190},
1099 {"head", 200},
1100 {"body", 200},
1101 {"html", 220},
1102 {NULL, 100} /* Default priority */
1103};
Owen Taylor3473f882001-02-23 17:55:21 +00001104
/*
 * Index over htmlStartClose built by htmlInitAutoClose(): each non-NULL
 * entry points at the first string of one group of that table; the first
 * NULL entry marks the end of the index.
 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001105static const char** htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index above has been filled. */
Owen Taylor3473f882001-02-23 17:55:21 +00001106static int htmlStartCloseIndexinitialized = 0;
1107
1108/************************************************************************
1109 * *
1110 * functions to handle HTML specific data *
1111 * *
1112 ************************************************************************/
1113
1114/**
1115 * htmlInitAutoClose:
1116 *
1117 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1118 * This is not reentrant. Call xmlInitParser() once before processing in
1119 * case of use in multithreaded programs.
1120 */
1121void
1122htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001123 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001124
1125 if (htmlStartCloseIndexinitialized) return;
1126
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001127 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1128 indx = 0;
1129 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001130 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001131 while (htmlStartClose[i] != NULL) i++;
1132 i++;
1133 }
1134 htmlStartCloseIndexinitialized = 1;
1135}
1136
1137/**
1138 * htmlTagLookup:
1139 * @tag: The tag name in lowercase
1140 *
1141 * Lookup the HTML tag in the ElementTable
1142 *
1143 * Returns the related htmlElemDescPtr or NULL if not found.
1144 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001145const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001146htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001147 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001148
1149 for (i = 0; i < (sizeof(html40ElementTable) /
1150 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001151 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001152 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001153 }
1154 return(NULL);
1155}
1156
1157/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001158 * htmlGetEndPriority:
1159 * @name: The name of the element to look up the priority for.
1160 *
1161 * Return value: The "endtag" priority.
1162 **/
1163static int
1164htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001165 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001166
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001167 while ((htmlEndPriority[i].name != NULL) &&
1168 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1169 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001170
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001171 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001172}
1173
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001174
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001175/**
Owen Taylor3473f882001-02-23 17:55:21 +00001176 * htmlCheckAutoClose:
1177 * @newtag: The new tag name
1178 * @oldtag: The old tag name
1179 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001180 * Checks whether the new tag is one of the registered valid tags for
1181 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001182 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1183 *
1184 * Returns 0 if no, 1 if yes.
1185 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001186static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1188{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001189 int i, indx;
1190 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001191
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001192 if (htmlStartCloseIndexinitialized == 0)
1193 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001194
1195 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001196 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001197 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001198 if (closed == NULL)
1199 return (0);
1200 if (xmlStrEqual(BAD_CAST * closed, newtag))
1201 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001202 }
1203
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001204 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001205 i++;
1206 while (htmlStartClose[i] != NULL) {
1207 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001208 return (1);
1209 }
1210 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001211 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001212 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001213}
1214
1215/**
1216 * htmlAutoCloseOnClose:
1217 * @ctxt: an HTML parser context
1218 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001219 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001220 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001221 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001222 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001223static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001224htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1225{
1226 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001227 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001228
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001229 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001230
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001231 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001232
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001233 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1234 break;
1235 /*
1236 * A missplaced endtag can only close elements with lower
1237 * or equal priority, so if we find an element with higher
1238 * priority before we find an element with
1239 * matching name, we just ignore this endtag
1240 */
1241 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1242 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001243 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001244 if (i < 0)
1245 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001246
1247 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001248 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001249 if ((info != NULL) && (info->endTag == 3)) {
1250 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1251 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001252 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001253 }
1254 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1255 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001256 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001257 }
1258}
1259
1260/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001261 * htmlAutoCloseOnEnd:
1262 * @ctxt: an HTML parser context
1263 *
1264 * Close all remaining tags at the end of the stream
1265 */
1266static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001267htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1268{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001269 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001270
William M. Brack899e64a2003-09-26 18:03:42 +00001271 if (ctxt->nameNr == 0)
1272 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001273 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001274 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1275 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001276 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001277 }
1278}
1279
1280/**
Owen Taylor3473f882001-02-23 17:55:21 +00001281 * htmlAutoClose:
1282 * @ctxt: an HTML parser context
1283 * @newtag: The new tag name or NULL
1284 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001285 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001286 * The list is kept in htmlStartClose array. This function is
1287 * called when a new tag has been detected and generates the
1288 * appropriates closes if possible/needed.
1289 * If newtag is NULL this mean we are at the end of the resource
1290 * and we should check
1291 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001292static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001293htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1294{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001295 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001296 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001297 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1298 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001299 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001300 }
1301 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001302 htmlAutoCloseOnEnd(ctxt);
1303 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001304 }
1305 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001306 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1307 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1308 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001309 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1310 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001311 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001312 }
Owen Taylor3473f882001-02-23 17:55:21 +00001313}
1314
1315/**
1316 * htmlAutoCloseTag:
1317 * @doc: the HTML document
1318 * @name: The tag name
1319 * @elem: the HTML element
1320 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001321 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001322 * The list is kept in htmlStartClose array. This function checks
1323 * if the element or one of it's children would autoclose the
1324 * given tag.
1325 *
1326 * Returns 1 if autoclose, 0 otherwise
1327 */
1328int
1329htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1330 htmlNodePtr child;
1331
1332 if (elem == NULL) return(1);
1333 if (xmlStrEqual(name, elem->name)) return(0);
1334 if (htmlCheckAutoClose(elem->name, name)) return(1);
1335 child = elem->children;
1336 while (child != NULL) {
1337 if (htmlAutoCloseTag(doc, name, child)) return(1);
1338 child = child->next;
1339 }
1340 return(0);
1341}
1342
1343/**
1344 * htmlIsAutoClosed:
1345 * @doc: the HTML document
1346 * @elem: the HTML element
1347 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001348 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001349 * The list is kept in htmlStartClose array. This function checks
1350 * if a tag is autoclosed by one of it's child
1351 *
1352 * Returns 1 if autoclosed, 0 otherwise
1353 */
1354int
1355htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1356 htmlNodePtr child;
1357
1358 if (elem == NULL) return(1);
1359 child = elem->children;
1360 while (child != NULL) {
1361 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1362 child = child->next;
1363 }
1364 return(0);
1365}
1366
1367/**
1368 * htmlCheckImplied:
1369 * @ctxt: an HTML parser context
1370 * @newtag: The new tag name
1371 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001372 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001373 * called when a new tag has been detected and generates the
1374 * appropriates implicit tags if missing
1375 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001376static void
Owen Taylor3473f882001-02-23 17:55:21 +00001377htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1378 if (!htmlOmittedDefaultValue)
1379 return;
1380 if (xmlStrEqual(newtag, BAD_CAST"html"))
1381 return;
1382 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001383 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001384 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1385 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1386 }
1387 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1388 return;
1389 if ((ctxt->nameNr <= 1) &&
1390 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1391 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1392 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1393 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1394 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1395 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1396 /*
1397 * dropped OBJECT ... i you put it first BODY will be
1398 * assumed !
1399 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001400 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001401 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1402 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1403 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1404 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1405 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1406 int i;
1407 for (i = 0;i < ctxt->nameNr;i++) {
1408 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1409 return;
1410 }
1411 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1412 return;
1413 }
1414 }
1415
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001416 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001417 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1418 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1419 }
1420}
1421
1422/**
1423 * htmlCheckParagraph
1424 * @ctxt: an HTML parser context
1425 *
1426 * Check whether a p element need to be implied before inserting
1427 * characters in the current element.
1428 *
1429 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1430 * in case of error.
1431 */
1432
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001433static int
Owen Taylor3473f882001-02-23 17:55:21 +00001434htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1435 const xmlChar *tag;
1436 int i;
1437
1438 if (ctxt == NULL)
1439 return(-1);
1440 tag = ctxt->name;
1441 if (tag == NULL) {
1442 htmlAutoClose(ctxt, BAD_CAST"p");
1443 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001444 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001445 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1446 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1447 return(1);
1448 }
1449 if (!htmlOmittedDefaultValue)
1450 return(0);
1451 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1452 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001453 htmlAutoClose(ctxt, BAD_CAST"p");
1454 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001455 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001456 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1458 return(1);
1459 }
1460 }
1461 return(0);
1462}
1463
1464/**
1465 * htmlIsScriptAttribute:
1466 * @name: an attribute name
1467 *
1468 * Check if an attribute is of content type Script
1469 *
1470 * Returns 1 is the attribute is a script 0 otherwise
1471 */
1472int
1473htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001474 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001475
1476 if (name == NULL)
1477 return(0);
1478 /*
1479 * all script attributes start with 'on'
1480 */
1481 if ((name[0] != 'o') || (name[1] != 'n'))
1482 return(0);
1483 for (i = 0;
1484 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1485 i++) {
1486 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1487 return(1);
1488 }
1489 return(0);
1490}
1491
1492/************************************************************************
1493 * *
1494 * The list of HTML predefined entities *
1495 * *
1496 ************************************************************************/
1497
1498
Daniel Veillard22090732001-07-16 00:06:07 +00001499static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001500/*
1501 * the 4 absolute ones, plus apostrophe.
1502 */
1503{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1504{ 38, "amp", "ampersand, U+0026 ISOnum" },
1505{ 39, "apos", "single quote" },
1506{ 60, "lt", "less-than sign, U+003C ISOnum" },
1507{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1508
1509/*
1510 * A bunch still in the 128-255 range
1511 * Replacing them depend really on the charset used.
1512 */
1513{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1514{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1515{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1516{ 163, "pound","pound sign, U+00A3 ISOnum" },
1517{ 164, "curren","currency sign, U+00A4 ISOnum" },
1518{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1519{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1520{ 167, "sect", "section sign, U+00A7 ISOnum" },
1521{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1522{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1523{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1524{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1525{ 172, "not", "not sign, U+00AC ISOnum" },
1526{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1527{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1528{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1529{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1530{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1531{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1532{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1533{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1534{ 181, "micro","micro sign, U+00B5 ISOnum" },
1535{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1536{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1537{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1538{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1539{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1540{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1541{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1542{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1543{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1544{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1545{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1546{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1547{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1548{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1549{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1550{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1551{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1552{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1553{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1554{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1555{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1556{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1557{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1558{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1559{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1560{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1561{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1562{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1563{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1564{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1565{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1566{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1567{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1568{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1569{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1570{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1571{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1572{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1573{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1574{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1575{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1576{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1577{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1578{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1579{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1580{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1581{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1582{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1583{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1584{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1585{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1586{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1587{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1588{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1589{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1590{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1591{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1592{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1593{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1594{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1595{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1596{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1597{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1598{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1599{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1600{ 247, "divide","division sign, U+00F7 ISOnum" },
1601{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1602{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1603{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1604{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1605{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1606{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1607{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1608{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1609
1610{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1611{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1612{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1613{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1614{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1615
1616/*
1617 * Anything below should really be kept as entities references
1618 */
1619{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1620
1621{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1622{ 732, "tilde","small tilde, U+02DC ISOdia" },
1623
1624{ 913, "Alpha","greek capital letter alpha, U+0391" },
1625{ 914, "Beta", "greek capital letter beta, U+0392" },
1626{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1627{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1628{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1629{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1630{ 919, "Eta", "greek capital letter eta, U+0397" },
1631{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1632{ 921, "Iota", "greek capital letter iota, U+0399" },
1633{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001634{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001635{ 924, "Mu", "greek capital letter mu, U+039C" },
1636{ 925, "Nu", "greek capital letter nu, U+039D" },
1637{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1638{ 927, "Omicron","greek capital letter omicron, U+039F" },
1639{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1640{ 929, "Rho", "greek capital letter rho, U+03A1" },
1641{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1642{ 932, "Tau", "greek capital letter tau, U+03A4" },
1643{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1644{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1645{ 935, "Chi", "greek capital letter chi, U+03A7" },
1646{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1647{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1648
1649{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1650{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1651{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1652{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1653{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1654{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1655{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1656{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1657{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1658{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1659{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1660{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1661{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1662{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1663{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1664{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1665{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1666{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1667{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1668{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1669{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1670{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1671{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1672{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1673{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1674{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1675{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1676{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1677
1678{ 8194, "ensp", "en space, U+2002 ISOpub" },
1679{ 8195, "emsp", "em space, U+2003 ISOpub" },
1680{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1681{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1682{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1683{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1684{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1685{ 8211, "ndash","en dash, U+2013 ISOpub" },
1686{ 8212, "mdash","em dash, U+2014 ISOpub" },
1687{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1688{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1689{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1690{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1691{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1692{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1693{ 8224, "dagger","dagger, U+2020 ISOpub" },
1694{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1695
1696{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1697{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1698
1699{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1700
1701{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1702{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1703
1704{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1705{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1706
1707{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1708{ 8260, "frasl","fraction slash, U+2044 NEW" },
1709
1710{ 8364, "euro", "euro sign, U+20AC NEW" },
1711
1712{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1713{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1714{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1715{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1716{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1717{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1718{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1719{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1720{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1721{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1722{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1723{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1724{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1725{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1726{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1727{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1728
1729{ 8704, "forall","for all, U+2200 ISOtech" },
1730{ 8706, "part", "partial differential, U+2202 ISOtech" },
1731{ 8707, "exist","there exists, U+2203 ISOtech" },
1732{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1733{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1734{ 8712, "isin", "element of, U+2208 ISOtech" },
1735{ 8713, "notin","not an element of, U+2209 ISOtech" },
1736{ 8715, "ni", "contains as member, U+220B ISOtech" },
1737{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001738{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001739{ 8722, "minus","minus sign, U+2212 ISOtech" },
1740{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1741{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1742{ 8733, "prop", "proportional to, U+221D ISOtech" },
1743{ 8734, "infin","infinity, U+221E ISOtech" },
1744{ 8736, "ang", "angle, U+2220 ISOamso" },
1745{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1746{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1747{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1748{ 8746, "cup", "union = cup, U+222A ISOtech" },
1749{ 8747, "int", "integral, U+222B ISOtech" },
1750{ 8756, "there4","therefore, U+2234 ISOtech" },
1751{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1752{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1753{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1754{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1755{ 8801, "equiv","identical to, U+2261 ISOtech" },
1756{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1757{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1758{ 8834, "sub", "subset of, U+2282 ISOtech" },
1759{ 8835, "sup", "superset of, U+2283 ISOtech" },
1760{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1761{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1762{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1763{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1764{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1765{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1766{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1767{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1768{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1769{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1770{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1771{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1772{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1773{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1774
1775{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1776{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1777{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1778{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1779
1780};
1781
1782/************************************************************************
1783 * *
1784 * Commodity functions to handle entities *
1785 * *
1786 ************************************************************************/
1787
/*
 * Macro used to grow the current buffer: doubles <buffer>_size and
 * reallocates <buffer> accordingly.
 *
 * The expansion site must have both a variable named <buffer>_size and
 * a parser context named ctxt in scope. When the reallocation fails the
 * error is reported through htmlErrMemory(), the old buffer is freed
 * and the ENCLOSING function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {							\
	buffer = tmp;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1802
1803/**
1804 * htmlEntityLookup:
1805 * @name: the entity name
1806 *
1807 * Lookup the given entity in EntitiesTable
1808 *
1809 * TODO: the linear scan is really ugly, an hash table is really needed.
1810 *
1811 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1812 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001813const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001814htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001815 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001816
1817 for (i = 0;i < (sizeof(html40EntitiesTable)/
1818 sizeof(html40EntitiesTable[0]));i++) {
1819 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001820 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001821 }
1822 }
1823 return(NULL);
1824}
1825
1826/**
1827 * htmlEntityValueLookup:
1828 * @value: the entity's unicode value
1829 *
1830 * Lookup the given entity in EntitiesTable
1831 *
1832 * TODO: the linear scan is really ugly, an hash table is really needed.
1833 *
1834 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1835 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001836const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001837htmlEntityValueLookup(unsigned int value) {
1838 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001839
1840 for (i = 0;i < (sizeof(html40EntitiesTable)/
1841 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001842 if (html40EntitiesTable[i].value >= value) {
1843 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001844 break;
William M. Brack78637da2003-07-31 14:47:38 +00001845 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001846 }
Owen Taylor3473f882001-02-23 17:55:21 +00001847 }
1848 return(NULL);
1849}
1850
1851/**
1852 * UTF8ToHtml:
1853 * @out: a pointer to an array of bytes to store the result
1854 * @outlen: the length of @out
1855 * @in: a pointer to an array of UTF-8 chars
1856 * @inlen: the length of @in
1857 *
1858 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1859 * plus HTML entities block of chars out.
1860 *
1861 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1862 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001863 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001864 * The value of @outlen after return is the number of octets consumed.
1865 */
1866int
1867UTF8ToHtml(unsigned char* out, int *outlen,
1868 const unsigned char* in, int *inlen) {
1869 const unsigned char* processed = in;
1870 const unsigned char* outend;
1871 const unsigned char* outstart = out;
1872 const unsigned char* instart = in;
1873 const unsigned char* inend;
1874 unsigned int c, d;
1875 int trailing;
1876
Daniel Veillardce682bc2004-11-05 17:22:25 +00001877 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001878 if (in == NULL) {
1879 /*
1880 * initialization nothing to do
1881 */
1882 *outlen = 0;
1883 *inlen = 0;
1884 return(0);
1885 }
1886 inend = in + (*inlen);
1887 outend = out + (*outlen);
1888 while (in < inend) {
1889 d = *in++;
1890 if (d < 0x80) { c= d; trailing= 0; }
1891 else if (d < 0xC0) {
1892 /* trailing byte in leading position */
1893 *outlen = out - outstart;
1894 *inlen = processed - instart;
1895 return(-2);
1896 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1897 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1898 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1899 else {
1900 /* no chance for this in Ascii */
1901 *outlen = out - outstart;
1902 *inlen = processed - instart;
1903 return(-2);
1904 }
1905
1906 if (inend - in < trailing) {
1907 break;
1908 }
1909
1910 for ( ; trailing; trailing--) {
1911 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1912 break;
1913 c <<= 6;
1914 c |= d & 0x3F;
1915 }
1916
1917 /* assertion: c is a single UTF-4 value */
1918 if (c < 0x80) {
1919 if (out + 1 >= outend)
1920 break;
1921 *out++ = c;
1922 } else {
1923 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001924 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001925 const char *cp;
1926 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001927
1928 /*
1929 * Try to lookup a predefined HTML entity for it
1930 */
1931
1932 ent = htmlEntityValueLookup(c);
1933 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001934 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1935 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001936 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001937 else
1938 cp = ent->name;
1939 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001940 if (out + 2 + len >= outend)
1941 break;
1942 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001943 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001944 out += len;
1945 *out++ = ';';
1946 }
1947 processed = in;
1948 }
1949 *outlen = out - outstart;
1950 *inlen = processed - instart;
1951 return(0);
1952}
1953
1954/**
1955 * htmlEncodeEntities:
1956 * @out: a pointer to an array of bytes to store the result
1957 * @outlen: the length of @out
1958 * @in: a pointer to an array of UTF-8 chars
1959 * @inlen: the length of @in
1960 * @quoteChar: the quote character to escape (' or ") or zero.
1961 *
1962 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1963 * plus HTML entities block of chars out.
1964 *
1965 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1966 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001967 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001968 * The value of @outlen after return is the number of octets consumed.
1969 */
1970int
1971htmlEncodeEntities(unsigned char* out, int *outlen,
1972 const unsigned char* in, int *inlen, int quoteChar) {
1973 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001974 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001975 const unsigned char* outstart = out;
1976 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001977 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001978 unsigned int c, d;
1979 int trailing;
1980
Daniel Veillardce682bc2004-11-05 17:22:25 +00001981 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1982 return(-1);
1983 outend = out + (*outlen);
1984 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001985 while (in < inend) {
1986 d = *in++;
1987 if (d < 0x80) { c= d; trailing= 0; }
1988 else if (d < 0xC0) {
1989 /* trailing byte in leading position */
1990 *outlen = out - outstart;
1991 *inlen = processed - instart;
1992 return(-2);
1993 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1994 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1995 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1996 else {
1997 /* no chance for this in Ascii */
1998 *outlen = out - outstart;
1999 *inlen = processed - instart;
2000 return(-2);
2001 }
2002
2003 if (inend - in < trailing)
2004 break;
2005
2006 while (trailing--) {
2007 if (((d= *in++) & 0xC0) != 0x80) {
2008 *outlen = out - outstart;
2009 *inlen = processed - instart;
2010 return(-2);
2011 }
2012 c <<= 6;
2013 c |= d & 0x3F;
2014 }
2015
2016 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002017 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2018 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002019 if (out >= outend)
2020 break;
2021 *out++ = c;
2022 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002023 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002024 const char *cp;
2025 char nbuf[16];
2026 int len;
2027
2028 /*
2029 * Try to lookup a predefined HTML entity for it
2030 */
2031 ent = htmlEntityValueLookup(c);
2032 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002033 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002034 cp = nbuf;
2035 }
2036 else
2037 cp = ent->name;
2038 len = strlen(cp);
2039 if (out + 2 + len > outend)
2040 break;
2041 *out++ = '&';
2042 memcpy(out, cp, len);
2043 out += len;
2044 *out++ = ';';
2045 }
2046 processed = in;
2047 }
2048 *outlen = out - outstart;
2049 *inlen = processed - instart;
2050 return(0);
2051}
2052
Owen Taylor3473f882001-02-23 17:55:21 +00002053/************************************************************************
2054 * *
2055 * Commodity functions to handle streams *
2056 * *
2057 ************************************************************************/
2058
2059/**
Owen Taylor3473f882001-02-23 17:55:21 +00002060 * htmlNewInputStream:
2061 * @ctxt: an HTML parser context
2062 *
2063 * Create a new input stream structure
2064 * Returns the new input stream or NULL
2065 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002066static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002067htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2068 htmlParserInputPtr input;
2069
2070 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2071 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002072 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002073 return(NULL);
2074 }
2075 memset(input, 0, sizeof(htmlParserInput));
2076 input->filename = NULL;
2077 input->directory = NULL;
2078 input->base = NULL;
2079 input->cur = NULL;
2080 input->buf = NULL;
2081 input->line = 1;
2082 input->col = 1;
2083 input->buf = NULL;
2084 input->free = NULL;
2085 input->version = NULL;
2086 input->consumed = 0;
2087 input->length = 0;
2088 return(input);
2089}
2090
2091
2092/************************************************************************
2093 * *
2094 * Commodity functions, cleanup needed ? *
2095 * *
2096 ************************************************************************/
/*
 * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002111
2112/**
2113 * areBlanks:
2114 * @ctxt: an HTML parser context
2115 * @str: a xmlChar *
2116 * @len: the size of @str
2117 *
2118 * Is this a sequence of blank chars that one can ignore ?
2119 *
2120 * Returns 1 if ignorable 0 otherwise.
2121 */
2122
2123static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002124 unsigned int i;
2125 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002126 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002127 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002128
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002129 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002130 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002131
2132 if (CUR == 0) return(1);
2133 if (CUR != '<') return(0);
2134 if (ctxt->name == NULL)
2135 return(1);
2136 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2137 return(1);
2138 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2139 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002140
2141 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2142 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2143 dtd = xmlGetIntSubset(ctxt->myDoc);
2144 if (dtd != NULL && dtd->ExternalID != NULL) {
2145 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2146 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2147 return(1);
2148 }
2149 }
2150
Owen Taylor3473f882001-02-23 17:55:21 +00002151 if (ctxt->node == NULL) return(0);
2152 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002153 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2154 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002155 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002156 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2157 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002158 /* keep ws in constructs like ...<b> </b>...
2159 for all tags "b" allowing PCDATA */
2160 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2161 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2162 return(0);
2163 }
2164 }
Owen Taylor3473f882001-02-23 17:55:21 +00002165 } else if (xmlNodeIsText(lastChild)) {
2166 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002167 } else {
2168 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2169 for all tags "p" allowing PCDATA */
2170 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2171 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2172 return(0);
2173 }
2174 }
Owen Taylor3473f882001-02-23 17:55:21 +00002175 }
2176 return(1);
2177}
2178
2179/**
Owen Taylor3473f882001-02-23 17:55:21 +00002180 * htmlNewDocNoDtD:
2181 * @URI: URI for the dtd, or NULL
2182 * @ExternalID: the external ID of the DTD, or NULL
2183 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002184 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2185 * are NULL
2186 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002187 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002188 */
2189htmlDocPtr
2190htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2191 xmlDocPtr cur;
2192
2193 /*
2194 * Allocate a new document and fill the fields.
2195 */
2196 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2197 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002198 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002199 return(NULL);
2200 }
2201 memset(cur, 0, sizeof(xmlDoc));
2202
2203 cur->type = XML_HTML_DOCUMENT_NODE;
2204 cur->version = NULL;
2205 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002206 cur->doc = cur;
2207 cur->name = NULL;
2208 cur->children = NULL;
2209 cur->extSubset = NULL;
2210 cur->oldNs = NULL;
2211 cur->encoding = NULL;
2212 cur->standalone = 1;
2213 cur->compression = 0;
2214 cur->ids = NULL;
2215 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002216 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002217 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002218 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002219 if ((ExternalID != NULL) ||
2220 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002221 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002222 return(cur);
2223}
2224
2225/**
2226 * htmlNewDoc:
2227 * @URI: URI for the dtd, or NULL
2228 * @ExternalID: the external ID of the DTD, or NULL
2229 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002230 * Creates a new HTML document
2231 *
Owen Taylor3473f882001-02-23 17:55:21 +00002232 * Returns a new document
2233 */
2234htmlDocPtr
2235htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2236 if ((URI == NULL) && (ExternalID == NULL))
2237 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002238 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2239 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002240
2241 return(htmlNewDocNoDtD(URI, ExternalID));
2242}
2243
2244
2245/************************************************************************
2246 * *
2247 * The parser itself *
2248 * Relates to http://www.w3.org/TR/html40 *
2249 * *
2250 ************************************************************************/
2251
2252/************************************************************************
2253 * *
2254 * The parser itself *
2255 * *
2256 ************************************************************************/
2257
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002258static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002259
Owen Taylor3473f882001-02-23 17:55:21 +00002260/**
2261 * htmlParseHTMLName:
2262 * @ctxt: an HTML parser context
2263 *
2264 * parse an HTML tag or attribute name, note that we convert it to lowercase
2265 * since HTML names are not case-sensitive.
2266 *
2267 * Returns the Tag Name parsed or NULL
2268 */
2269
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002270static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002271htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002272 int i = 0;
2273 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2274
William M. Brackd1757ab2004-10-02 22:07:48 +00002275 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002276 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002277
2278 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002279 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002280 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2281 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002282 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2283 else loc[i] = CUR;
2284 i++;
2285
2286 NEXT;
2287 }
2288
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002289 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002290}
2291
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002292
2293/**
2294 * htmlParseHTMLName_nonInvasive:
2295 * @ctxt: an HTML parser context
2296 *
2297 * parse an HTML tag or attribute name, note that we convert it to lowercase
2298 * since HTML names are not case-sensitive, this doesn't consume the data
2299 * from the stream, it's a look-ahead
2300 *
2301 * Returns the Tag Name parsed or NULL
2302 */
2303
2304static const xmlChar *
2305htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2306 int i = 0;
2307 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2308
2309 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2310 (NXT(1) != ':')) return(NULL);
2311
2312 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2313 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2314 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2315 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2316 else loc[i] = NXT(1+i);
2317 i++;
2318 }
2319
2320 return(xmlDictLookup(ctxt->dict, loc, i));
2321}
2322
2323
Owen Taylor3473f882001-02-23 17:55:21 +00002324/**
2325 * htmlParseName:
2326 * @ctxt: an HTML parser context
2327 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002328 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002329 *
2330 * Returns the Name parsed or NULL
2331 */
2332
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002333static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002334htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002335 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002336 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002337 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002338
2339 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002340
2341 /*
2342 * Accelerator for simple ASCII names
2343 */
2344 in = ctxt->input->cur;
2345 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2346 ((*in >= 0x41) && (*in <= 0x5A)) ||
2347 (*in == '_') || (*in == ':')) {
2348 in++;
2349 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2350 ((*in >= 0x41) && (*in <= 0x5A)) ||
2351 ((*in >= 0x30) && (*in <= 0x39)) ||
2352 (*in == '_') || (*in == '-') ||
2353 (*in == ':') || (*in == '.'))
2354 in++;
2355 if ((*in > 0) && (*in < 0x80)) {
2356 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002357 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002358 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002359 ctxt->nbChars += count;
2360 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002361 return(ret);
2362 }
2363 }
2364 return(htmlParseNameComplex(ctxt));
2365}
2366
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002367static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002368htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002369 int len = 0, l;
2370 int c;
2371 int count = 0;
2372
2373 /*
2374 * Handler for more complex cases
2375 */
2376 GROW;
2377 c = CUR_CHAR(l);
2378 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2379 (!IS_LETTER(c) && (c != '_') &&
2380 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002381 return(NULL);
2382 }
2383
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002384 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2385 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2386 (c == '.') || (c == '-') ||
2387 (c == '_') || (c == ':') ||
2388 (IS_COMBINING(c)) ||
2389 (IS_EXTENDER(c)))) {
2390 if (count++ > 100) {
2391 count = 0;
2392 GROW;
2393 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002394 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002395 NEXTL(l);
2396 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002397 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002398 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002399}
2400
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002401
Owen Taylor3473f882001-02-23 17:55:21 +00002402/**
2403 * htmlParseHTMLAttribute:
2404 * @ctxt: an HTML parser context
2405 * @stop: a char stop value
2406 *
2407 * parse an HTML attribute value till the stop (quote), if
2408 * stop is 0 then it stops at the first space
2409 *
2410 * Returns the attribute parsed or NULL
2411 */
2412
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002413static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002414htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2415 xmlChar *buffer = NULL;
2416 int buffer_size = 0;
2417 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002418 const xmlChar *name = NULL;
2419 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002420 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002421
2422 /*
2423 * allocate a translation buffer.
2424 */
2425 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002426 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002427 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002428 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002429 return(NULL);
2430 }
2431 out = buffer;
2432
2433 /*
2434 * Ok loop until we reach one of the ending chars
2435 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002436 while ((CUR != 0) && (CUR != stop)) {
2437 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002438 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002439 if (CUR == '&') {
2440 if (NXT(1) == '#') {
2441 unsigned int c;
2442 int bits;
2443
2444 c = htmlParseCharRef(ctxt);
2445 if (c < 0x80)
2446 { *out++ = c; bits= -6; }
2447 else if (c < 0x800)
2448 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2449 else if (c < 0x10000)
2450 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2451 else
2452 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2453
2454 for ( ; bits >= 0; bits-= 6) {
2455 *out++ = ((c >> bits) & 0x3F) | 0x80;
2456 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002457
2458 if (out - buffer > buffer_size - 100) {
2459 int indx = out - buffer;
2460
2461 growBuffer(buffer);
2462 out = &buffer[indx];
2463 }
Owen Taylor3473f882001-02-23 17:55:21 +00002464 } else {
2465 ent = htmlParseEntityRef(ctxt, &name);
2466 if (name == NULL) {
2467 *out++ = '&';
2468 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002469 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002470
2471 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002472 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002473 }
2474 } else if (ent == NULL) {
2475 *out++ = '&';
2476 cur = name;
2477 while (*cur != 0) {
2478 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002479 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002480
2481 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002482 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002483 }
2484 *out++ = *cur++;
2485 }
Owen Taylor3473f882001-02-23 17:55:21 +00002486 } else {
2487 unsigned int c;
2488 int bits;
2489
2490 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002491 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002492
2493 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002494 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002495 }
Daniel Veillard48519092006-10-17 15:56:35 +00002496 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002497 if (c < 0x80)
2498 { *out++ = c; bits= -6; }
2499 else if (c < 0x800)
2500 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2501 else if (c < 0x10000)
2502 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2503 else
2504 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2505
2506 for ( ; bits >= 0; bits-= 6) {
2507 *out++ = ((c >> bits) & 0x3F) | 0x80;
2508 }
Owen Taylor3473f882001-02-23 17:55:21 +00002509 }
2510 }
2511 } else {
2512 unsigned int c;
2513 int bits, l;
2514
2515 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002516 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002517
2518 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002519 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002520 }
2521 c = CUR_CHAR(l);
2522 if (c < 0x80)
2523 { *out++ = c; bits= -6; }
2524 else if (c < 0x800)
2525 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2526 else if (c < 0x10000)
2527 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2528 else
2529 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2530
2531 for ( ; bits >= 0; bits-= 6) {
2532 *out++ = ((c >> bits) & 0x3F) | 0x80;
2533 }
2534 NEXT;
2535 }
2536 }
2537 *out++ = 0;
2538 return(buffer);
2539}
2540
2541/**
Owen Taylor3473f882001-02-23 17:55:21 +00002542 * htmlParseEntityRef:
2543 * @ctxt: an HTML parser context
2544 * @str: location to store the entity name
2545 *
2546 * parse an HTML ENTITY references
2547 *
2548 * [68] EntityRef ::= '&' Name ';'
2549 *
2550 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2551 * if non-NULL *str will have to be freed by the caller.
2552 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002553const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002554htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2555 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002556 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002557
2558 if (str != NULL) *str = NULL;
2559 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002560
2561 if (CUR == '&') {
2562 NEXT;
2563 name = htmlParseName(ctxt);
2564 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002565 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2566 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002567 } else {
2568 GROW;
2569 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002570 if (str != NULL)
2571 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002572
2573 /*
2574 * Lookup the entity in the table.
2575 */
2576 ent = htmlEntityLookup(name);
2577 if (ent != NULL) /* OK that's ugly !!! */
2578 NEXT;
2579 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002580 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2581 "htmlParseEntityRef: expecting ';'\n",
2582 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002583 if (str != NULL)
2584 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002585 }
2586 }
2587 }
2588 return(ent);
2589}
2590
2591/**
2592 * htmlParseAttValue:
2593 * @ctxt: an HTML parser context
2594 *
2595 * parse a value for an attribute
2596 * Note: the parser won't do substitution of entities here, this
2597 * will be handled later in xmlStringGetNodeList, unless it was
2598 * asked for ctxt->replaceEntities != 0
2599 *
2600 * Returns the AttValue parsed or NULL.
2601 */
2602
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002603static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002604htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2605 xmlChar *ret = NULL;
2606
2607 if (CUR == '"') {
2608 NEXT;
2609 ret = htmlParseHTMLAttribute(ctxt, '"');
2610 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002611 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2612 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002613 } else
2614 NEXT;
2615 } else if (CUR == '\'') {
2616 NEXT;
2617 ret = htmlParseHTMLAttribute(ctxt, '\'');
2618 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002619 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2620 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002621 } else
2622 NEXT;
2623 } else {
2624 /*
2625 * That's an HTMLism, the attribute value may not be quoted
2626 */
2627 ret = htmlParseHTMLAttribute(ctxt, 0);
2628 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002629 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2630 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002631 }
2632 }
2633 return(ret);
2634}
2635
2636/**
2637 * htmlParseSystemLiteral:
2638 * @ctxt: an HTML parser context
2639 *
2640 * parse an HTML Literal
2641 *
2642 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2643 *
2644 * Returns the SystemLiteral parsed or NULL
2645 */
2646
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002647static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002648htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2649 const xmlChar *q;
2650 xmlChar *ret = NULL;
2651
2652 if (CUR == '"') {
2653 NEXT;
2654 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002655 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002656 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002657 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002658 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2659 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002660 } else {
2661 ret = xmlStrndup(q, CUR_PTR - q);
2662 NEXT;
2663 }
2664 } else if (CUR == '\'') {
2665 NEXT;
2666 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002667 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002668 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002669 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002670 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2671 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002672 } else {
2673 ret = xmlStrndup(q, CUR_PTR - q);
2674 NEXT;
2675 }
2676 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002677 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2678 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002679 }
2680
2681 return(ret);
2682}
2683
2684/**
2685 * htmlParsePubidLiteral:
2686 * @ctxt: an HTML parser context
2687 *
2688 * parse an HTML public literal
2689 *
2690 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2691 *
2692 * Returns the PubidLiteral parsed or NULL.
2693 */
2694
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002695static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002696htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2697 const xmlChar *q;
2698 xmlChar *ret = NULL;
2699 /*
2700 * Name ::= (Letter | '_') (NameChar)*
2701 */
2702 if (CUR == '"') {
2703 NEXT;
2704 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002705 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002706 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002707 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2708 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002709 } else {
2710 ret = xmlStrndup(q, CUR_PTR - q);
2711 NEXT;
2712 }
2713 } else if (CUR == '\'') {
2714 NEXT;
2715 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002716 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002717 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002718 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002719 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2720 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002721 } else {
2722 ret = xmlStrndup(q, CUR_PTR - q);
2723 NEXT;
2724 }
2725 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002726 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2727 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002728 }
2729
2730 return(ret);
2731}
2732
2733/**
2734 * htmlParseScript:
2735 * @ctxt: an HTML parser context
2736 *
2737 * parse the content of an HTML SCRIPT or STYLE element
2738 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2739 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2740 * http://www.w3.org/TR/html4/types.html#type-script
2741 * http://www.w3.org/TR/html4/types.html#h-6.15
2742 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2743 *
2744 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2745 * element and the value of intrinsic event attributes. User agents must
2746 * not evaluate script data as HTML markup but instead must pass it on as
2747 * data to a script engine.
2748 * NOTES:
2749 * - The content is passed like CDATA
2750 * - the attributes for style and scripting "onXXX" are also described
2751 * as CDATA but SGML allows entities references in attributes so their
2752 * processing is identical as other attributes
2753 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002754static void
Owen Taylor3473f882001-02-23 17:55:21 +00002755htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002756 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002757 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002758 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002759
2760 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002761 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002762 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002763 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002764 /*
2765 * One should break here, the specification is clear:
2766 * Authors should therefore escape "</" within the content.
2767 * Escape mechanisms are specific to each scripting or
2768 * style sheet language.
2769 *
2770 * In recovery mode, only break if end tag match the
2771 * current tag, effectively ignoring all tags inside the
2772 * script/style block and treating the entire block as
2773 * CDATA.
2774 */
2775 if (ctxt->recovery) {
2776 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2777 xmlStrlen(ctxt->name)) == 0)
2778 {
2779 break; /* while */
2780 } else {
2781 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002782 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002783 ctxt->name, NULL);
2784 }
2785 } else {
2786 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2787 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2788 {
2789 break; /* while */
2790 }
2791 }
Owen Taylor3473f882001-02-23 17:55:21 +00002792 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002793 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002794 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2795 if (ctxt->sax->cdataBlock!= NULL) {
2796 /*
2797 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2798 */
2799 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002800 } else if (ctxt->sax->characters != NULL) {
2801 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002802 }
2803 nbchar = 0;
2804 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002805 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002806 NEXTL(l);
2807 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002808 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002809
Daniel Veillard68716a72006-10-16 09:32:17 +00002810 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002811 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2812 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002813 NEXT;
2814 }
2815
2816 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2817 if (ctxt->sax->cdataBlock!= NULL) {
2818 /*
2819 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2820 */
2821 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002822 } else if (ctxt->sax->characters != NULL) {
2823 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002824 }
2825 }
2826}
2827
2828
2829/**
2830 * htmlParseCharData:
2831 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002832 *
2833 * parse a CharData section.
2834 * if we are within a CDATA section ']]>' marks an end of section.
2835 *
2836 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2837 */
2838
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002839static void
2840htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002841 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2842 int nbchar = 0;
2843 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002844 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002845
2846 SHRINK;
2847 cur = CUR_CHAR(l);
2848 while (((cur != '<') || (ctxt->token == '<')) &&
2849 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002850 (cur != 0)) {
2851 if (!(IS_CHAR(cur))) {
2852 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2853 "Invalid char in CDATA 0x%X\n", cur);
2854 } else {
2855 COPY_BUF(l,buf,nbchar,cur);
2856 }
Owen Taylor3473f882001-02-23 17:55:21 +00002857 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2858 /*
2859 * Ok the segment is to be consumed as chars.
2860 */
2861 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2862 if (areBlanks(ctxt, buf, nbchar)) {
2863 if (ctxt->sax->ignorableWhitespace != NULL)
2864 ctxt->sax->ignorableWhitespace(ctxt->userData,
2865 buf, nbchar);
2866 } else {
2867 htmlCheckParagraph(ctxt);
2868 if (ctxt->sax->characters != NULL)
2869 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2870 }
2871 }
2872 nbchar = 0;
2873 }
2874 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002875 chunk++;
2876 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2877 chunk = 0;
2878 SHRINK;
2879 GROW;
2880 }
Owen Taylor3473f882001-02-23 17:55:21 +00002881 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002882 if (cur == 0) {
2883 SHRINK;
2884 GROW;
2885 cur = CUR_CHAR(l);
2886 }
Owen Taylor3473f882001-02-23 17:55:21 +00002887 }
2888 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002889 buf[nbchar] = 0;
2890
Owen Taylor3473f882001-02-23 17:55:21 +00002891 /*
2892 * Ok the segment is to be consumed as chars.
2893 */
2894 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2895 if (areBlanks(ctxt, buf, nbchar)) {
2896 if (ctxt->sax->ignorableWhitespace != NULL)
2897 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2898 } else {
2899 htmlCheckParagraph(ctxt);
2900 if (ctxt->sax->characters != NULL)
2901 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2902 }
2903 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002904 } else {
2905 /*
2906 * Loop detection
2907 */
2908 if (cur == 0)
2909 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002910 }
2911}
2912
2913/**
2914 * htmlParseExternalID:
2915 * @ctxt: an HTML parser context
2916 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002917 *
2918 * Parse an External ID or a Public ID
2919 *
Owen Taylor3473f882001-02-23 17:55:21 +00002920 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2921 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2922 *
2923 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2924 *
2925 * Returns the function returns SystemLiteral and in the second
2926 * case publicID receives PubidLiteral, is strict is off
2927 * it is possible to return NULL and have publicID set.
2928 */
2929
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002930static xmlChar *
2931htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002932 xmlChar *URI = NULL;
2933
2934 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2935 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2936 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2937 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002938 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002939 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2940 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002941 }
2942 SKIP_BLANKS;
2943 URI = htmlParseSystemLiteral(ctxt);
2944 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002945 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2946 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002947 }
2948 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2949 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2950 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2951 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002952 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002953 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2954 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002955 }
2956 SKIP_BLANKS;
2957 *publicID = htmlParsePubidLiteral(ctxt);
2958 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002959 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2960 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2961 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002962 }
2963 SKIP_BLANKS;
2964 if ((CUR == '"') || (CUR == '\'')) {
2965 URI = htmlParseSystemLiteral(ctxt);
2966 }
2967 }
2968 return(URI);
2969}
2970
2971/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002972 * xmlParsePI:
2973 * @ctxt: an XML parser context
2974 *
2975 * parse an XML Processing Instruction.
2976 *
2977 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2978 */
2979static void
2980htmlParsePI(htmlParserCtxtPtr ctxt) {
2981 xmlChar *buf = NULL;
2982 int len = 0;
2983 int size = HTML_PARSER_BUFFER_SIZE;
2984 int cur, l;
2985 const xmlChar *target;
2986 xmlParserInputState state;
2987 int count = 0;
2988
2989 if ((RAW == '<') && (NXT(1) == '?')) {
2990 state = ctxt->instate;
2991 ctxt->instate = XML_PARSER_PI;
2992 /*
2993 * this is a Processing Instruction.
2994 */
2995 SKIP(2);
2996 SHRINK;
2997
2998 /*
2999 * Parse the target name and check for special support like
3000 * namespace.
3001 */
3002 target = htmlParseName(ctxt);
3003 if (target != NULL) {
3004 if (RAW == '>') {
3005 SKIP(1);
3006
3007 /*
3008 * SAX: PI detected.
3009 */
3010 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3011 (ctxt->sax->processingInstruction != NULL))
3012 ctxt->sax->processingInstruction(ctxt->userData,
3013 target, NULL);
3014 ctxt->instate = state;
3015 return;
3016 }
3017 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3018 if (buf == NULL) {
3019 htmlErrMemory(ctxt, NULL);
3020 ctxt->instate = state;
3021 return;
3022 }
3023 cur = CUR;
3024 if (!IS_BLANK(cur)) {
3025 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3026 "ParsePI: PI %s space expected\n", target, NULL);
3027 }
3028 SKIP_BLANKS;
3029 cur = CUR_CHAR(l);
3030 while (IS_CHAR(cur) && (cur != '>')) {
3031 if (len + 5 >= size) {
3032 xmlChar *tmp;
3033
3034 size *= 2;
3035 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3036 if (tmp == NULL) {
3037 htmlErrMemory(ctxt, NULL);
3038 xmlFree(buf);
3039 ctxt->instate = state;
3040 return;
3041 }
3042 buf = tmp;
3043 }
3044 count++;
3045 if (count > 50) {
3046 GROW;
3047 count = 0;
3048 }
3049 COPY_BUF(l,buf,len,cur);
3050 NEXTL(l);
3051 cur = CUR_CHAR(l);
3052 if (cur == 0) {
3053 SHRINK;
3054 GROW;
3055 cur = CUR_CHAR(l);
3056 }
3057 }
3058 buf[len] = 0;
3059 if (cur != '>') {
3060 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3061 "ParsePI: PI %s never end ...\n", target, NULL);
3062 } else {
3063 SKIP(1);
3064
3065 /*
3066 * SAX: PI detected.
3067 */
3068 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3069 (ctxt->sax->processingInstruction != NULL))
3070 ctxt->sax->processingInstruction(ctxt->userData,
3071 target, buf);
3072 }
3073 xmlFree(buf);
3074 } else {
3075 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3076 "PI is not started correctly", NULL, NULL);
3077 }
3078 ctxt->instate = state;
3079 }
3080}
3081
3082/**
Owen Taylor3473f882001-02-23 17:55:21 +00003083 * htmlParseComment:
3084 * @ctxt: an HTML parser context
3085 *
3086 * Parse an XML (SGML) comment <!-- .... -->
3087 *
3088 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3089 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003090static void
Owen Taylor3473f882001-02-23 17:55:21 +00003091htmlParseComment(htmlParserCtxtPtr ctxt) {
3092 xmlChar *buf = NULL;
3093 int len;
3094 int size = HTML_PARSER_BUFFER_SIZE;
3095 int q, ql;
3096 int r, rl;
3097 int cur, l;
3098 xmlParserInputState state;
3099
3100 /*
3101 * Check that there is a comment right here.
3102 */
3103 if ((RAW != '<') || (NXT(1) != '!') ||
3104 (NXT(2) != '-') || (NXT(3) != '-')) return;
3105
3106 state = ctxt->instate;
3107 ctxt->instate = XML_PARSER_COMMENT;
3108 SHRINK;
3109 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003110 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003111 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003112 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003113 ctxt->instate = state;
3114 return;
3115 }
3116 q = CUR_CHAR(ql);
3117 NEXTL(ql);
3118 r = CUR_CHAR(rl);
3119 NEXTL(rl);
3120 cur = CUR_CHAR(l);
3121 len = 0;
3122 while (IS_CHAR(cur) &&
3123 ((cur != '>') ||
3124 (r != '-') || (q != '-'))) {
3125 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003126 xmlChar *tmp;
3127
Owen Taylor3473f882001-02-23 17:55:21 +00003128 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003129 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3130 if (tmp == NULL) {
3131 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003132 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003133 ctxt->instate = state;
3134 return;
3135 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003136 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003137 }
3138 COPY_BUF(ql,buf,len,q);
3139 q = r;
3140 ql = rl;
3141 r = cur;
3142 rl = l;
3143 NEXTL(l);
3144 cur = CUR_CHAR(l);
3145 if (cur == 0) {
3146 SHRINK;
3147 GROW;
3148 cur = CUR_CHAR(l);
3149 }
3150 }
3151 buf[len] = 0;
3152 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003153 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3154 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003155 xmlFree(buf);
3156 } else {
3157 NEXT;
3158 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3159 (!ctxt->disableSAX))
3160 ctxt->sax->comment(ctxt->userData, buf);
3161 xmlFree(buf);
3162 }
3163 ctxt->instate = state;
3164}
3165
3166/**
3167 * htmlParseCharRef:
3168 * @ctxt: an HTML parser context
3169 *
3170 * parse Reference declarations
3171 *
3172 * [66] CharRef ::= '&#' [0-9]+ ';' |
3173 * '&#x' [0-9a-fA-F]+ ';'
3174 *
3175 * Returns the value parsed (as an int)
3176 */
3177int
3178htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3179 int val = 0;
3180
Daniel Veillarda03e3652004-11-02 18:45:30 +00003181 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3182 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3183 "htmlParseCharRef: context error\n",
3184 NULL, NULL);
3185 return(0);
3186 }
Owen Taylor3473f882001-02-23 17:55:21 +00003187 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003188 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003189 SKIP(3);
3190 while (CUR != ';') {
3191 if ((CUR >= '0') && (CUR <= '9'))
3192 val = val * 16 + (CUR - '0');
3193 else if ((CUR >= 'a') && (CUR <= 'f'))
3194 val = val * 16 + (CUR - 'a') + 10;
3195 else if ((CUR >= 'A') && (CUR <= 'F'))
3196 val = val * 16 + (CUR - 'A') + 10;
3197 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003198 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003199 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003200 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003201 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003202 }
3203 NEXT;
3204 }
3205 if (CUR == ';')
3206 NEXT;
3207 } else if ((CUR == '&') && (NXT(1) == '#')) {
3208 SKIP(2);
3209 while (CUR != ';') {
3210 if ((CUR >= '0') && (CUR <= '9'))
3211 val = val * 10 + (CUR - '0');
3212 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003213 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003214 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003215 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003216 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003217 }
3218 NEXT;
3219 }
3220 if (CUR == ';')
3221 NEXT;
3222 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003223 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3224 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003225 }
3226 /*
3227 * Check the value IS_CHAR ...
3228 */
3229 if (IS_CHAR(val)) {
3230 return(val);
3231 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003232 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3233 "htmlParseCharRef: invalid xmlChar value %d\n",
3234 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003235 }
3236 return(0);
3237}
3238
3239
3240/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003241 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003242 * @ctxt: an HTML parser context
3243 *
3244 * parse a DOCTYPE declaration
3245 *
3246 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3247 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3248 */
3249
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003250static void
Owen Taylor3473f882001-02-23 17:55:21 +00003251htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003252 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003253 xmlChar *ExternalID = NULL;
3254 xmlChar *URI = NULL;
3255
3256 /*
3257 * We know that '<!DOCTYPE' has been detected.
3258 */
3259 SKIP(9);
3260
3261 SKIP_BLANKS;
3262
3263 /*
3264 * Parse the DOCTYPE name.
3265 */
3266 name = htmlParseName(ctxt);
3267 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003268 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3269 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3270 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003271 }
3272 /*
3273 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3274 */
3275
3276 SKIP_BLANKS;
3277
3278 /*
3279 * Check for SystemID and ExternalID
3280 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003281 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003282 SKIP_BLANKS;
3283
3284 /*
3285 * We should be at the end of the DOCTYPE declaration.
3286 */
3287 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003288 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3289 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003290 /* We shouldn't try to resynchronize ... */
3291 }
3292 NEXT;
3293
3294 /*
3295 * Create or update the document accordingly to the DOCTYPE
3296 */
3297 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3298 (!ctxt->disableSAX))
3299 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3300
3301 /*
3302 * Cleanup, since we don't use all those identifiers
3303 */
3304 if (URI != NULL) xmlFree(URI);
3305 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003306}
3307
3308/**
3309 * htmlParseAttribute:
3310 * @ctxt: an HTML parser context
3311 * @value: a xmlChar ** used to store the value of the attribute
3312 *
3313 * parse an attribute
3314 *
3315 * [41] Attribute ::= Name Eq AttValue
3316 *
3317 * [25] Eq ::= S? '=' S?
3318 *
3319 * With namespace:
3320 *
3321 * [NS 11] Attribute ::= QName Eq AttValue
3322 *
3323 * Also the case QName == xmlns:??? is handled independently as a namespace
3324 * definition.
3325 *
3326 * Returns the attribute name, and the value in *value.
3327 */
3328
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003329static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003330htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003331 const xmlChar *name;
3332 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003333
3334 *value = NULL;
3335 name = htmlParseHTMLName(ctxt);
3336 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003337 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3338 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003339 return(NULL);
3340 }
3341
3342 /*
3343 * read the value
3344 */
3345 SKIP_BLANKS;
3346 if (CUR == '=') {
3347 NEXT;
3348 SKIP_BLANKS;
3349 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003350 } else if (htmlIsBooleanAttr(name)) {
3351 /*
3352 * assume a minimized attribute
3353 */
3354 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003355 }
3356
3357 *value = val;
3358 return(name);
3359}
3360
3361/**
3362 * htmlCheckEncoding:
3363 * @ctxt: an HTML parser context
3364 * @attvalue: the attribute value
3365 *
3366 * Checks an http-equiv attribute from a Meta tag to detect
3367 * the encoding
3368 * If a new encoding is detected the parser is switched to decode
3369 * it and pass UTF8
3370 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003371static void
Owen Taylor3473f882001-02-23 17:55:21 +00003372htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3373 const xmlChar *encoding;
3374
3375 if ((ctxt == NULL) || (attvalue == NULL))
3376 return;
3377
3378 /* do not change encoding */
3379 if (ctxt->input->encoding != NULL)
3380 return;
3381
3382 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3383 if (encoding != NULL) {
3384 encoding += 8;
3385 } else {
3386 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3387 if (encoding != NULL)
3388 encoding += 9;
3389 }
3390 if (encoding != NULL) {
3391 xmlCharEncoding enc;
3392 xmlCharEncodingHandlerPtr handler;
3393
3394 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3395
3396 if (ctxt->input->encoding != NULL)
3397 xmlFree((xmlChar *) ctxt->input->encoding);
3398 ctxt->input->encoding = xmlStrdup(encoding);
3399
3400 enc = xmlParseCharEncoding((const char *) encoding);
3401 /*
3402 * registered set of known encodings
3403 */
3404 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillard7e303562006-10-16 13:14:55 +00003405 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3406 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3407 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3408 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3409 (ctxt->input->buf != NULL) &&
3410 (ctxt->input->buf->encoder == NULL)) {
3411 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3412 "htmlCheckEncoding: wrong encoding meta\n",
3413 NULL, NULL);
3414 } else {
3415 xmlSwitchEncoding(ctxt, enc);
3416 }
Owen Taylor3473f882001-02-23 17:55:21 +00003417 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3418 } else {
3419 /*
3420 * fallback for unknown encodings
3421 */
3422 handler = xmlFindCharEncodingHandler((const char *) encoding);
3423 if (handler != NULL) {
3424 xmlSwitchToEncoding(ctxt, handler);
3425 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3426 } else {
3427 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3428 }
3429 }
3430
3431 if ((ctxt->input->buf != NULL) &&
3432 (ctxt->input->buf->encoder != NULL) &&
3433 (ctxt->input->buf->raw != NULL) &&
3434 (ctxt->input->buf->buffer != NULL)) {
3435 int nbchars;
3436 int processed;
3437
3438 /*
3439 * convert as much as possible to the parser reading buffer.
3440 */
3441 processed = ctxt->input->cur - ctxt->input->base;
3442 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3443 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3444 ctxt->input->buf->buffer,
3445 ctxt->input->buf->raw);
3446 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003447 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3448 "htmlCheckEncoding: encoder error\n",
3449 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003450 }
3451 ctxt->input->base =
3452 ctxt->input->cur = ctxt->input->buf->buffer->content;
3453 }
3454 }
3455}
3456
3457/**
3458 * htmlCheckMeta:
3459 * @ctxt: an HTML parser context
3460 * @atts: the attributes values
3461 *
3462 * Checks an attributes from a Meta tag
3463 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003464static void
Owen Taylor3473f882001-02-23 17:55:21 +00003465htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3466 int i;
3467 const xmlChar *att, *value;
3468 int http = 0;
3469 const xmlChar *content = NULL;
3470
3471 if ((ctxt == NULL) || (atts == NULL))
3472 return;
3473
3474 i = 0;
3475 att = atts[i++];
3476 while (att != NULL) {
3477 value = atts[i++];
3478 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3479 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3480 http = 1;
3481 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3482 content = value;
3483 att = atts[i++];
3484 }
3485 if ((http) && (content != NULL))
3486 htmlCheckEncoding(ctxt, content);
3487
3488}
3489
3490/**
3491 * htmlParseStartTag:
3492 * @ctxt: an HTML parser context
3493 *
3494 * parse a start of tag either for rule element or
3495 * EmptyElement. In both case we don't parse the tag closing chars.
3496 *
3497 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3498 *
3499 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3500 *
3501 * With namespace:
3502 *
3503 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3504 *
3505 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3506 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003507 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003508 */
3509
Daniel Veillard597f1c12005-07-03 23:00:18 +00003510static int
Owen Taylor3473f882001-02-23 17:55:21 +00003511htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003512 const xmlChar *name;
3513 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003514 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003515 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003516 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003517 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003518 int meta = 0;
3519 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003520 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003521
Daniel Veillarda03e3652004-11-02 18:45:30 +00003522 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3523 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3524 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003525 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003526 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003527 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003528 NEXT;
3529
Daniel Veillard30e76072006-03-09 14:13:55 +00003530 atts = ctxt->atts;
3531 maxatts = ctxt->maxatts;
3532
Owen Taylor3473f882001-02-23 17:55:21 +00003533 GROW;
3534 name = htmlParseHTMLName(ctxt);
3535 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003536 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3537 "htmlParseStartTag: invalid element name\n",
3538 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003539 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003540 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003541 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003542 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003543 }
3544 if (xmlStrEqual(name, BAD_CAST"meta"))
3545 meta = 1;
3546
3547 /*
3548 * Check for auto-closure of HTML elements.
3549 */
3550 htmlAutoClose(ctxt, name);
3551
3552 /*
3553 * Check for implied HTML elements.
3554 */
3555 htmlCheckImplied(ctxt, name);
3556
3557 /*
3558 * Avoid html at any level > 0, head at any level != 1
3559 * or any attempt to recurse body
3560 */
3561 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003562 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3563 "htmlParseStartTag: misplaced <html> tag\n",
3564 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003565 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003566 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003567 }
3568 if ((ctxt->nameNr != 1) &&
3569 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003570 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3571 "htmlParseStartTag: misplaced <head> tag\n",
3572 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003573 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003574 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003575 }
3576 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003577 int indx;
3578 for (indx = 0;indx < ctxt->nameNr;indx++) {
3579 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003580 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3581 "htmlParseStartTag: misplaced <body> tag\n",
3582 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003583 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003584 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003585 }
3586 }
3587 }
3588
3589 /*
3590 * Now parse the attributes, it ends up with the ending
3591 *
3592 * (S Attribute)* S?
3593 */
3594 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003595 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003596 (CUR != '>') &&
3597 ((CUR != '/') || (NXT(1) != '>'))) {
3598 long cons = ctxt->nbChars;
3599
3600 GROW;
3601 attname = htmlParseAttribute(ctxt, &attvalue);
3602 if (attname != NULL) {
3603
3604 /*
3605 * Well formedness requires at most one declaration of an attribute
3606 */
3607 for (i = 0; i < nbatts;i += 2) {
3608 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003609 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3610 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003611 if (attvalue != NULL)
3612 xmlFree(attvalue);
3613 goto failed;
3614 }
3615 }
3616
3617 /*
3618 * Add the pair to atts
3619 */
3620 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003621 maxatts = 22; /* allow for 10 attrs by default */
3622 atts = (const xmlChar **)
3623 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003624 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003625 htmlErrMemory(ctxt, NULL);
3626 if (attvalue != NULL)
3627 xmlFree(attvalue);
3628 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003629 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003630 ctxt->atts = atts;
3631 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003632 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003633 const xmlChar **n;
3634
Owen Taylor3473f882001-02-23 17:55:21 +00003635 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003636 n = (const xmlChar **) xmlRealloc((void *) atts,
3637 maxatts * sizeof(const xmlChar *));
3638 if (n == NULL) {
3639 htmlErrMemory(ctxt, NULL);
3640 if (attvalue != NULL)
3641 xmlFree(attvalue);
3642 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003643 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003644 atts = n;
3645 ctxt->atts = atts;
3646 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003647 }
3648 atts[nbatts++] = attname;
3649 atts[nbatts++] = attvalue;
3650 atts[nbatts] = NULL;
3651 atts[nbatts + 1] = NULL;
3652 }
3653 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003654 if (attvalue != NULL)
3655 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003656 /* Dump the bogus attribute string up to the next blank or
3657 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003658 while ((IS_CHAR_CH(CUR)) &&
3659 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003660 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003661 NEXT;
3662 }
3663
3664failed:
3665 SKIP_BLANKS;
3666 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003667 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3668 "htmlParseStartTag: problem parsing attributes\n",
3669 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003670 break;
3671 }
3672 }
3673
3674 /*
3675 * Handle specific association to the META tag
3676 */
William M. Bracke978ae22007-03-21 06:16:02 +00003677 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003678 htmlCheckMeta(ctxt, atts);
3679
3680 /*
3681 * SAX: Start of Element !
3682 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003683 if (!discardtag) {
3684 htmlnamePush(ctxt, name);
3685 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3686 if (nbatts != 0)
3687 ctxt->sax->startElement(ctxt->userData, name, atts);
3688 else
3689 ctxt->sax->startElement(ctxt->userData, name, NULL);
3690 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003691 }
Owen Taylor3473f882001-02-23 17:55:21 +00003692
3693 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003694 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003695 if (atts[i] != NULL)
3696 xmlFree((xmlChar *) atts[i]);
3697 }
Owen Taylor3473f882001-02-23 17:55:21 +00003698 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003699
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003700 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003701}
3702
3703/**
3704 * htmlParseEndTag:
3705 * @ctxt: an HTML parser context
3706 *
3707 * parse an end of tag
3708 *
3709 * [42] ETag ::= '</' Name S? '>'
3710 *
3711 * With namespace
3712 *
3713 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003714 *
3715 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003716 */
3717
Daniel Veillardf420ac52001-07-04 16:04:09 +00003718static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003719htmlParseEndTag(htmlParserCtxtPtr ctxt)
3720{
3721 const xmlChar *name;
3722 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003723 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003724
3725 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003726 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3727 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003728 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003729 }
3730 SKIP(2);
3731
3732 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003733 if (name == NULL)
3734 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003735 /*
3736 * We should definitely be at the ending "S? '>'" part
3737 */
3738 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003739 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003740 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3741 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003742 if (ctxt->recovery) {
3743 /*
3744 * We're not at the ending > !!
3745 * Error, unless in recover mode where we search forwards
3746 * until we find a >
3747 */
3748 while (CUR != '\0' && CUR != '>') NEXT;
3749 NEXT;
3750 }
Owen Taylor3473f882001-02-23 17:55:21 +00003751 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003752 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003753
3754 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003755 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3756 * out now.
3757 */
3758 if ((ctxt->depth > 0) &&
3759 (xmlStrEqual(name, BAD_CAST "html") ||
3760 xmlStrEqual(name, BAD_CAST "body") ||
3761 xmlStrEqual(name, BAD_CAST "head"))) {
3762 ctxt->depth--;
3763 return (0);
3764 }
3765
3766 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003767 * If the name read is not one of the element in the parsing stack
3768 * then return, it's just an error.
3769 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003770 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3771 if (xmlStrEqual(name, ctxt->nameTab[i]))
3772 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003773 }
3774 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003775 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3776 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003777 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003778 }
3779
3780
3781 /*
3782 * Check for auto-closure of HTML elements.
3783 */
3784
3785 htmlAutoCloseOnClose(ctxt, name);
3786
3787 /*
3788 * Well formedness constraints, opening and closing must match.
3789 * With the exception that the autoclose may have popped stuff out
3790 * of the stack.
3791 */
3792 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003793 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003794 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3795 "Opening and ending tag mismatch: %s and %s\n",
3796 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003797 }
3798 }
3799
3800 /*
3801 * SAX: End of Tag
3802 */
3803 oldname = ctxt->name;
3804 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003805 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3806 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003807 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003808 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003809 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003810 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003811 }
3812
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003813 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003814}
3815
3816
3817/**
3818 * htmlParseReference:
3819 * @ctxt: an HTML parser context
3820 *
3821 * parse and handle entity references in content,
3822 * this will end-up in a call to character() since this is either a
3823 * CharRef, or a predefined entity.
3824 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003825static void
Owen Taylor3473f882001-02-23 17:55:21 +00003826htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003827 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003828 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003829 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003830 if (CUR != '&') return;
3831
3832 if (NXT(1) == '#') {
3833 unsigned int c;
3834 int bits, i = 0;
3835
3836 c = htmlParseCharRef(ctxt);
3837 if (c == 0)
3838 return;
3839
3840 if (c < 0x80) { out[i++]= c; bits= -6; }
3841 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3842 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3843 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3844
3845 for ( ; bits >= 0; bits-= 6) {
3846 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3847 }
3848 out[i] = 0;
3849
3850 htmlCheckParagraph(ctxt);
3851 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3852 ctxt->sax->characters(ctxt->userData, out, i);
3853 } else {
3854 ent = htmlParseEntityRef(ctxt, &name);
3855 if (name == NULL) {
3856 htmlCheckParagraph(ctxt);
3857 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3858 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3859 return;
3860 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003861 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003862 htmlCheckParagraph(ctxt);
3863 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3864 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3865 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3866 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3867 }
3868 } else {
3869 unsigned int c;
3870 int bits, i = 0;
3871
3872 c = ent->value;
3873 if (c < 0x80)
3874 { out[i++]= c; bits= -6; }
3875 else if (c < 0x800)
3876 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3877 else if (c < 0x10000)
3878 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3879 else
3880 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3881
3882 for ( ; bits >= 0; bits-= 6) {
3883 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3884 }
3885 out[i] = 0;
3886
3887 htmlCheckParagraph(ctxt);
3888 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3889 ctxt->sax->characters(ctxt->userData, out, i);
3890 }
Owen Taylor3473f882001-02-23 17:55:21 +00003891 }
3892}
3893
3894/**
3895 * htmlParseContent:
3896 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003897 *
3898 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003899 */
3900
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003901static void
Owen Taylor3473f882001-02-23 17:55:21 +00003902htmlParseContent(htmlParserCtxtPtr ctxt) {
3903 xmlChar *currentNode;
3904 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003905 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003906
3907 currentNode = xmlStrdup(ctxt->name);
3908 depth = ctxt->nameNr;
3909 while (1) {
3910 long cons = ctxt->nbChars;
3911
3912 GROW;
3913 /*
3914 * Our tag or one of it's parent or children is ending.
3915 */
3916 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003917 if (htmlParseEndTag(ctxt) &&
3918 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3919 if (currentNode != NULL)
3920 xmlFree(currentNode);
3921 return;
3922 }
3923 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003924 }
3925
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003926 else if ((CUR == '<') &&
3927 ((IS_ASCII_LETTER(NXT(1))) ||
3928 (NXT(1) == '_') || (NXT(1) == ':'))) {
3929 name = htmlParseHTMLName_nonInvasive(ctxt);
3930 if (name == NULL) {
3931 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3932 "htmlParseStartTag: invalid element name\n",
3933 NULL, NULL);
3934 /* Dump the bogus tag like browsers do */
3935 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3936 NEXT;
3937
3938 if (currentNode != NULL)
3939 xmlFree(currentNode);
3940 return;
3941 }
3942
3943 if (ctxt->name != NULL) {
3944 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3945 htmlAutoClose(ctxt, name);
3946 continue;
3947 }
3948 }
3949 }
3950
Owen Taylor3473f882001-02-23 17:55:21 +00003951 /*
3952 * Has this node been popped out during parsing of
3953 * the next element
3954 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003955 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3956 (!xmlStrEqual(currentNode, ctxt->name)))
3957 {
Owen Taylor3473f882001-02-23 17:55:21 +00003958 if (currentNode != NULL) xmlFree(currentNode);
3959 return;
3960 }
3961
Daniel Veillardf9533d12001-03-03 10:04:57 +00003962 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3963 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003964 /*
3965 * Handle SCRIPT/STYLE separately
3966 */
3967 htmlParseScript(ctxt);
3968 } else {
3969 /*
3970 * Sometimes DOCTYPE arrives in the middle of the document
3971 */
3972 if ((CUR == '<') && (NXT(1) == '!') &&
3973 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3974 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3975 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3976 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003977 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3978 "Misplaced DOCTYPE declaration\n",
3979 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003980 htmlParseDocTypeDecl(ctxt);
3981 }
3982
3983 /*
3984 * First case : a comment
3985 */
3986 if ((CUR == '<') && (NXT(1) == '!') &&
3987 (NXT(2) == '-') && (NXT(3) == '-')) {
3988 htmlParseComment(ctxt);
3989 }
3990
3991 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003992 * Second case : a Processing Instruction.
3993 */
3994 else if ((CUR == '<') && (NXT(1) == '?')) {
3995 htmlParsePI(ctxt);
3996 }
3997
3998 /*
3999 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004000 */
4001 else if (CUR == '<') {
4002 htmlParseElement(ctxt);
4003 }
4004
4005 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004006 * Fourth case : a reference. If if has not been resolved,
Owen Taylor3473f882001-02-23 17:55:21 +00004007 * parsing returns it's Name, create the node
4008 */
4009 else if (CUR == '&') {
4010 htmlParseReference(ctxt);
4011 }
4012
4013 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004014 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004015 */
4016 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004017 htmlAutoCloseOnEnd(ctxt);
4018 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004019 }
4020
4021 /*
4022 * Last case, text. Note that References are handled directly.
4023 */
4024 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004025 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004026 }
4027
4028 if (cons == ctxt->nbChars) {
4029 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004030 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4031 "detected an error in element content\n",
4032 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004033 }
4034 break;
4035 }
4036 }
4037 GROW;
4038 }
4039 if (currentNode != NULL) xmlFree(currentNode);
4040}
4041
4042/**
Daniel Veillard499cc922006-01-18 17:22:35 +00004043 * htmlParseContent:
4044 * @ctxt: an HTML parser context
4045 *
4046 * Parse a content: comment, sub-element, reference or text.
4047 */
4048
4049void
4050__htmlParseContent(void *ctxt) {
4051 if (ctxt != NULL)
4052 htmlParseContent((htmlParserCtxtPtr) ctxt);
4053}
4054
4055/**
Owen Taylor3473f882001-02-23 17:55:21 +00004056 * htmlParseElement:
4057 * @ctxt: an HTML parser context
4058 *
4059 * parse an HTML element, this is highly recursive
4060 *
4061 * [39] element ::= EmptyElemTag | STag content ETag
4062 *
4063 * [41] Attribute ::= Name Eq AttValue
4064 */
4065
4066void
4067htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004068 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004069 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004070 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004071 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004072 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004073 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004074 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004075
Daniel Veillarda03e3652004-11-02 18:45:30 +00004076 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4077 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004078 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004079 return;
4080 }
Owen Taylor3473f882001-02-23 17:55:21 +00004081 /* Capture start position */
4082 if (ctxt->record_info) {
4083 node_info.begin_pos = ctxt->input->consumed +
4084 (CUR_PTR - ctxt->input->base);
4085 node_info.begin_line = ctxt->input->line;
4086 }
4087
Daniel Veillard597f1c12005-07-03 23:00:18 +00004088 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004089 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004090 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004091 if (CUR == '>')
4092 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004093 return;
4094 }
Owen Taylor3473f882001-02-23 17:55:21 +00004095
4096 /*
4097 * Lookup the info for that element.
4098 */
4099 info = htmlTagLookup(name);
4100 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004101 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4102 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004103 }
4104
4105 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004106 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004107 */
4108 if ((CUR == '/') && (NXT(1) == '>')) {
4109 SKIP(2);
4110 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4111 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004112 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004113 return;
4114 }
4115
4116 if (CUR == '>') {
4117 NEXT;
4118 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004119 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4120 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004121
4122 /*
4123 * end of parsing of this node.
4124 */
4125 if (xmlStrEqual(name, ctxt->name)) {
4126 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004127 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004128 }
4129
4130 /*
4131 * Capture end position and add node
4132 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004133 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004134 node_info.end_pos = ctxt->input->consumed +
4135 (CUR_PTR - ctxt->input->base);
4136 node_info.end_line = ctxt->input->line;
4137 node_info.node = ctxt->node;
4138 xmlParserAddNodeInfo(ctxt, &node_info);
4139 }
4140 return;
4141 }
4142
4143 /*
4144 * Check for an Empty Element from DTD definition
4145 */
4146 if ((info != NULL) && (info->empty)) {
4147 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4148 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004149 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004150 return;
4151 }
4152
4153 /*
4154 * Parse the content of the element:
4155 */
4156 currentNode = xmlStrdup(ctxt->name);
4157 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004158 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004159 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004160 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004161 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00004162 if (ctxt->nameNr < depth) break;
4163 }
4164
Owen Taylor3473f882001-02-23 17:55:21 +00004165 /*
4166 * Capture end position and add node
4167 */
4168 if ( currentNode != NULL && ctxt->record_info ) {
4169 node_info.end_pos = ctxt->input->consumed +
4170 (CUR_PTR - ctxt->input->base);
4171 node_info.end_line = ctxt->input->line;
4172 node_info.node = ctxt->node;
4173 xmlParserAddNodeInfo(ctxt, &node_info);
4174 }
William M. Brack76e95df2003-10-18 16:20:14 +00004175 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004176 htmlAutoCloseOnEnd(ctxt);
4177 }
4178
Owen Taylor3473f882001-02-23 17:55:21 +00004179 if (currentNode != NULL)
4180 xmlFree(currentNode);
4181}
4182
4183/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004184 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004185 * @ctxt: an HTML parser context
4186 *
4187 * parse an HTML document (and build a tree if using the standard SAX
4188 * interface).
4189 *
4190 * Returns 0, -1 in case of error. the parser context is augmented
4191 * as a result of the parsing.
4192 */
4193
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004194int
Owen Taylor3473f882001-02-23 17:55:21 +00004195htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004196 xmlChar start[4];
4197 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004198 xmlDtdPtr dtd;
4199
Daniel Veillardd0463562001-10-13 09:15:48 +00004200 xmlInitParser();
4201
Owen Taylor3473f882001-02-23 17:55:21 +00004202 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004203
Daniel Veillarda03e3652004-11-02 18:45:30 +00004204 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4205 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4206 "htmlParseDocument: context error\n", NULL, NULL);
4207 return(XML_ERR_INTERNAL_ERROR);
4208 }
4209 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004210 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004211 GROW;
4212 /*
4213 * SAX: beginning of the document processing.
4214 */
4215 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4216 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4217
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004218 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4219 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4220 /*
4221 * Get the 4 first bytes and decode the charset
4222 * if enc != XML_CHAR_ENCODING_NONE
4223 * plug some encoding conversion routines.
4224 */
4225 start[0] = RAW;
4226 start[1] = NXT(1);
4227 start[2] = NXT(2);
4228 start[3] = NXT(3);
4229 enc = xmlDetectCharEncoding(&start[0], 4);
4230 if (enc != XML_CHAR_ENCODING_NONE) {
4231 xmlSwitchEncoding(ctxt, enc);
4232 }
4233 }
4234
Owen Taylor3473f882001-02-23 17:55:21 +00004235 /*
4236 * Wipe out everything which is before the first '<'
4237 */
4238 SKIP_BLANKS;
4239 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004240 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4241 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004242 }
4243
4244 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4245 ctxt->sax->startDocument(ctxt->userData);
4246
4247
4248 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004249 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004250 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004251 while (((CUR == '<') && (NXT(1) == '!') &&
4252 (NXT(2) == '-') && (NXT(3) == '-')) ||
4253 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004254 htmlParseComment(ctxt);
4255 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004256 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004257 }
Owen Taylor3473f882001-02-23 17:55:21 +00004258
4259
4260 /*
4261 * Then possibly doc type declaration(s) and more Misc
4262 * (doctypedecl Misc*)?
4263 */
4264 if ((CUR == '<') && (NXT(1) == '!') &&
4265 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4266 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4267 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4268 (UPP(8) == 'E')) {
4269 htmlParseDocTypeDecl(ctxt);
4270 }
4271 SKIP_BLANKS;
4272
4273 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004274 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004275 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004276 while (((CUR == '<') && (NXT(1) == '!') &&
4277 (NXT(2) == '-') && (NXT(3) == '-')) ||
4278 ((CUR == '<') && (NXT(1) == '?'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004279 htmlParseComment(ctxt);
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004280 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004281 SKIP_BLANKS;
4282 }
4283
4284 /*
4285 * Time to start parsing the tree itself
4286 */
4287 htmlParseContent(ctxt);
4288
4289 /*
4290 * autoclose
4291 */
4292 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004293 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004294
4295
4296 /*
4297 * SAX: end of the document processing.
4298 */
4299 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4300 ctxt->sax->endDocument(ctxt->userData);
4301
4302 if (ctxt->myDoc != NULL) {
4303 dtd = xmlGetIntSubset(ctxt->myDoc);
4304 if (dtd == NULL)
4305 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004306 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004307 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4308 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4309 }
4310 if (! ctxt->wellFormed) return(-1);
4311 return(0);
4312}
4313
4314
4315/************************************************************************
4316 * *
4317 * Parser contexts handling *
4318 * *
4319 ************************************************************************/
4320
4321/**
William M. Brackedb65a72004-02-06 07:36:04 +00004322 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004323 * @ctxt: an HTML parser context
4324 *
4325 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004326 *
4327 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004328 */
4329
Daniel Veillardf403d292003-10-05 13:51:35 +00004330static int
Owen Taylor3473f882001-02-23 17:55:21 +00004331htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4332{
4333 htmlSAXHandler *sax;
4334
Daniel Veillardf403d292003-10-05 13:51:35 +00004335 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004336 memset(ctxt, 0, sizeof(htmlParserCtxt));
4337
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004338 ctxt->dict = xmlDictCreate();
4339 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004340 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4341 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004342 }
Owen Taylor3473f882001-02-23 17:55:21 +00004343 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4344 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004345 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4346 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004347 }
4348 else
4349 memset(sax, 0, sizeof(htmlSAXHandler));
4350
4351 /* Allocate the Input stack */
4352 ctxt->inputTab = (htmlParserInputPtr *)
4353 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4354 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004355 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004356 ctxt->inputNr = 0;
4357 ctxt->inputMax = 0;
4358 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004359 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004360 }
4361 ctxt->inputNr = 0;
4362 ctxt->inputMax = 5;
4363 ctxt->input = NULL;
4364 ctxt->version = NULL;
4365 ctxt->encoding = NULL;
4366 ctxt->standalone = -1;
4367 ctxt->instate = XML_PARSER_START;
4368
4369 /* Allocate the Node stack */
4370 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4371 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004372 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004373 ctxt->nodeNr = 0;
4374 ctxt->nodeMax = 0;
4375 ctxt->node = NULL;
4376 ctxt->inputNr = 0;
4377 ctxt->inputMax = 0;
4378 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004379 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004380 }
4381 ctxt->nodeNr = 0;
4382 ctxt->nodeMax = 10;
4383 ctxt->node = NULL;
4384
4385 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004386 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004387 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004388 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004389 ctxt->nameNr = 0;
4390 ctxt->nameMax = 10;
4391 ctxt->name = NULL;
4392 ctxt->nodeNr = 0;
4393 ctxt->nodeMax = 0;
4394 ctxt->node = NULL;
4395 ctxt->inputNr = 0;
4396 ctxt->inputMax = 0;
4397 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004398 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004399 }
4400 ctxt->nameNr = 0;
4401 ctxt->nameMax = 10;
4402 ctxt->name = NULL;
4403
Daniel Veillard092643b2003-09-25 14:29:29 +00004404 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004405 else {
4406 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004407 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004408 }
4409 ctxt->userData = ctxt;
4410 ctxt->myDoc = NULL;
4411 ctxt->wellFormed = 1;
4412 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004413 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004414 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004415 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004416 ctxt->vctxt.userData = ctxt;
4417 ctxt->vctxt.error = xmlParserValidityError;
4418 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004419 ctxt->record_info = 0;
4420 ctxt->validate = 0;
4421 ctxt->nbChars = 0;
4422 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004423 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004424 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004425 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004426}
4427
4428/**
4429 * htmlFreeParserCtxt:
4430 * @ctxt: an HTML parser context
4431 *
4432 * Free all the memory used by a parser context. However the parsed
4433 * document in ctxt->myDoc is not freed.
4434 */
4435
4436void
4437htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4438{
4439 xmlFreeParserCtxt(ctxt);
4440}
4441
4442/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004443 * htmlNewParserCtxt:
4444 *
4445 * Allocate and initialize a new parser context.
4446 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004447 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004448 */
4449
Daniel Veillard34c647c2006-09-21 06:53:59 +00004450htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004451htmlNewParserCtxt(void)
4452{
4453 xmlParserCtxtPtr ctxt;
4454
4455 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4456 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004457 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004458 return(NULL);
4459 }
4460 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004461 if (htmlInitParserCtxt(ctxt) < 0) {
4462 htmlFreeParserCtxt(ctxt);
4463 return(NULL);
4464 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004465 return(ctxt);
4466}
4467
4468/**
4469 * htmlCreateMemoryParserCtxt:
4470 * @buffer: a pointer to a char array
4471 * @size: the size of the array
4472 *
4473 * Create a parser context for an HTML in-memory document.
4474 *
4475 * Returns the new parser context or NULL
4476 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004477htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004478htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4479 xmlParserCtxtPtr ctxt;
4480 xmlParserInputPtr input;
4481 xmlParserInputBufferPtr buf;
4482
4483 if (buffer == NULL)
4484 return(NULL);
4485 if (size <= 0)
4486 return(NULL);
4487
4488 ctxt = htmlNewParserCtxt();
4489 if (ctxt == NULL)
4490 return(NULL);
4491
4492 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4493 if (buf == NULL) return(NULL);
4494
4495 input = xmlNewInputStream(ctxt);
4496 if (input == NULL) {
4497 xmlFreeParserCtxt(ctxt);
4498 return(NULL);
4499 }
4500
4501 input->filename = NULL;
4502 input->buf = buf;
4503 input->base = input->buf->buffer->content;
4504 input->cur = input->buf->buffer->content;
4505 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4506
4507 inputPush(ctxt, input);
4508 return(ctxt);
4509}
4510
4511/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004512 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004513 * @cur: a pointer to an array of xmlChar
4514 * @encoding: a free form C string describing the HTML document encoding, or NULL
4515 *
4516 * Create a parser context for an HTML document.
4517 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004518 * TODO: check the need to add encoding handling there
4519 *
Owen Taylor3473f882001-02-23 17:55:21 +00004520 * Returns the new parser context or NULL
4521 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004522static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004523htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004524 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004525 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004526
Daniel Veillard1d995272002-07-22 16:43:32 +00004527 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004528 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004529 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004530 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004531 if (ctxt == NULL)
4532 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004533
4534 if (encoding != NULL) {
4535 xmlCharEncoding enc;
4536 xmlCharEncodingHandlerPtr handler;
4537
4538 if (ctxt->input->encoding != NULL)
4539 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004540 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004541
4542 enc = xmlParseCharEncoding(encoding);
4543 /*
4544 * registered set of known encodings
4545 */
4546 if (enc != XML_CHAR_ENCODING_ERROR) {
4547 xmlSwitchEncoding(ctxt, enc);
4548 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004549 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4550 "Unsupported encoding %s\n",
4551 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004552 }
4553 } else {
4554 /*
4555 * fallback for unknown encodings
4556 */
4557 handler = xmlFindCharEncodingHandler((const char *) encoding);
4558 if (handler != NULL) {
4559 xmlSwitchToEncoding(ctxt, handler);
4560 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004561 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4562 "Unsupported encoding %s\n",
4563 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004564 }
4565 }
4566 }
4567 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004568}
4569
Daniel Veillard73b013f2003-09-30 12:36:01 +00004570#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004571/************************************************************************
4572 * *
4573 * Progressive parsing interfaces *
4574 * *
4575 ************************************************************************/
4576
4577/**
4578 * htmlParseLookupSequence:
4579 * @ctxt: an HTML parser context
4580 * @first: the first char to lookup
4581 * @next: the next char to lookup or zero
4582 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004583 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004584 *
4585 * Try to find if a sequence (first, next, third) or just (first next) or
4586 * (first) is available in the input stream.
4587 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4588 * to avoid rescanning sequences of bytes, it DOES change the state of the
4589 * parser, do not use liberally.
4590 * This is basically similar to xmlParseLookupSequence()
4591 *
4592 * Returns the index to the current parsing point if the full sequence
4593 * is available, -1 otherwise.
4594 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004595static int
Owen Taylor3473f882001-02-23 17:55:21 +00004596htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02004597 xmlChar next, xmlChar third, int iscomment,
4598 int ignoreattrval) {
Owen Taylor3473f882001-02-23 17:55:21 +00004599 int base, len;
4600 htmlParserInputPtr in;
4601 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004602 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02004603 int invalue = 0;
4604 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00004605
4606 in = ctxt->input;
4607 if (in == NULL) return(-1);
4608 base = in->cur - in->base;
4609 if (base < 0) return(-1);
4610 if (ctxt->checkIndex > base)
4611 base = ctxt->checkIndex;
4612 if (in->buf == NULL) {
4613 buf = in->base;
4614 len = in->length;
4615 } else {
4616 buf = in->buf->buffer->content;
4617 len = in->buf->buffer->use;
4618 }
4619 /* take into account the sequence length */
4620 if (third) len -= 2;
4621 else if (next) len --;
4622 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004623 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004624 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4625 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4626 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004627 /* do not increment past <! - some people use <!--> */
4628 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004629 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004630 }
Jiri Netolicky446e1262009-08-07 17:05:36 +02004631 if (ignoreattrval) {
4632 if (buf[base] == '"' || buf[base] == '\'') {
4633 if (invalue) {
4634 if (buf[base] == valdellim) {
4635 invalue = 0;
4636 continue;
4637 }
4638 } else {
4639 valdellim = buf[base];
4640 invalue = 1;
4641 continue;
4642 }
4643 } else if (invalue) {
4644 continue;
4645 }
4646 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004647 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004648 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004649 return(-1);
4650 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4651 (buf[base + 2] == '>')) {
4652 incomment = 0;
4653 base += 2;
4654 }
4655 continue;
4656 }
Owen Taylor3473f882001-02-23 17:55:21 +00004657 if (buf[base] == first) {
4658 if (third != 0) {
4659 if ((buf[base + 1] != next) ||
4660 (buf[base + 2] != third)) continue;
4661 } else if (next != 0) {
4662 if (buf[base + 1] != next) continue;
4663 }
4664 ctxt->checkIndex = 0;
4665#ifdef DEBUG_PUSH
4666 if (next == 0)
4667 xmlGenericError(xmlGenericErrorContext,
4668 "HPP: lookup '%c' found at %d\n",
4669 first, base);
4670 else if (third == 0)
4671 xmlGenericError(xmlGenericErrorContext,
4672 "HPP: lookup '%c%c' found at %d\n",
4673 first, next, base);
4674 else
4675 xmlGenericError(xmlGenericErrorContext,
4676 "HPP: lookup '%c%c%c' found at %d\n",
4677 first, next, third, base);
4678#endif
4679 return(base - (in->cur - in->base));
4680 }
4681 }
4682 ctxt->checkIndex = base;
4683#ifdef DEBUG_PUSH
4684 if (next == 0)
4685 xmlGenericError(xmlGenericErrorContext,
4686 "HPP: lookup '%c' failed\n", first);
4687 else if (third == 0)
4688 xmlGenericError(xmlGenericErrorContext,
4689 "HPP: lookup '%c%c' failed\n", first, next);
4690 else
4691 xmlGenericError(xmlGenericErrorContext,
4692 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4693#endif
4694 return(-1);
4695}
4696
4697/**
4698 * htmlParseTryOrFinish:
4699 * @ctxt: an HTML parser context
4700 * @terminate: last chunk indicator
4701 *
4702 * Try to progress on parsing
4703 *
4704 * Returns zero if no parsing was possible
4705 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004706static int
Owen Taylor3473f882001-02-23 17:55:21 +00004707htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4708 int ret = 0;
4709 htmlParserInputPtr in;
4710 int avail = 0;
4711 xmlChar cur, next;
4712
4713#ifdef DEBUG_PUSH
4714 switch (ctxt->instate) {
4715 case XML_PARSER_EOF:
4716 xmlGenericError(xmlGenericErrorContext,
4717 "HPP: try EOF\n"); break;
4718 case XML_PARSER_START:
4719 xmlGenericError(xmlGenericErrorContext,
4720 "HPP: try START\n"); break;
4721 case XML_PARSER_MISC:
4722 xmlGenericError(xmlGenericErrorContext,
4723 "HPP: try MISC\n");break;
4724 case XML_PARSER_COMMENT:
4725 xmlGenericError(xmlGenericErrorContext,
4726 "HPP: try COMMENT\n");break;
4727 case XML_PARSER_PROLOG:
4728 xmlGenericError(xmlGenericErrorContext,
4729 "HPP: try PROLOG\n");break;
4730 case XML_PARSER_START_TAG:
4731 xmlGenericError(xmlGenericErrorContext,
4732 "HPP: try START_TAG\n");break;
4733 case XML_PARSER_CONTENT:
4734 xmlGenericError(xmlGenericErrorContext,
4735 "HPP: try CONTENT\n");break;
4736 case XML_PARSER_CDATA_SECTION:
4737 xmlGenericError(xmlGenericErrorContext,
4738 "HPP: try CDATA_SECTION\n");break;
4739 case XML_PARSER_END_TAG:
4740 xmlGenericError(xmlGenericErrorContext,
4741 "HPP: try END_TAG\n");break;
4742 case XML_PARSER_ENTITY_DECL:
4743 xmlGenericError(xmlGenericErrorContext,
4744 "HPP: try ENTITY_DECL\n");break;
4745 case XML_PARSER_ENTITY_VALUE:
4746 xmlGenericError(xmlGenericErrorContext,
4747 "HPP: try ENTITY_VALUE\n");break;
4748 case XML_PARSER_ATTRIBUTE_VALUE:
4749 xmlGenericError(xmlGenericErrorContext,
4750 "HPP: try ATTRIBUTE_VALUE\n");break;
4751 case XML_PARSER_DTD:
4752 xmlGenericError(xmlGenericErrorContext,
4753 "HPP: try DTD\n");break;
4754 case XML_PARSER_EPILOG:
4755 xmlGenericError(xmlGenericErrorContext,
4756 "HPP: try EPILOG\n");break;
4757 case XML_PARSER_PI:
4758 xmlGenericError(xmlGenericErrorContext,
4759 "HPP: try PI\n");break;
4760 case XML_PARSER_SYSTEM_LITERAL:
4761 xmlGenericError(xmlGenericErrorContext,
4762 "HPP: try SYSTEM_LITERAL\n");break;
4763 }
4764#endif
4765
4766 while (1) {
4767
4768 in = ctxt->input;
4769 if (in == NULL) break;
4770 if (in->buf == NULL)
4771 avail = in->length - (in->cur - in->base);
4772 else
4773 avail = in->buf->buffer->use - (in->cur - in->base);
4774 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004775 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004776 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4777 /*
4778 * SAX: end of the document processing.
4779 */
4780 ctxt->instate = XML_PARSER_EOF;
4781 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4782 ctxt->sax->endDocument(ctxt->userData);
4783 }
4784 }
4785 if (avail < 1)
4786 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004787 cur = in->cur[0];
4788 if (cur == 0) {
4789 SKIP(1);
4790 continue;
4791 }
4792
Owen Taylor3473f882001-02-23 17:55:21 +00004793 switch (ctxt->instate) {
4794 case XML_PARSER_EOF:
4795 /*
4796 * Document parsing is done !
4797 */
4798 goto done;
4799 case XML_PARSER_START:
4800 /*
4801 * Very first chars read from the document flow.
4802 */
4803 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004804 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004805 SKIP_BLANKS;
4806 if (in->buf == NULL)
4807 avail = in->length - (in->cur - in->base);
4808 else
4809 avail = in->buf->buffer->use - (in->cur - in->base);
4810 }
4811 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4812 ctxt->sax->setDocumentLocator(ctxt->userData,
4813 &xmlDefaultSAXLocator);
4814 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4815 (!ctxt->disableSAX))
4816 ctxt->sax->startDocument(ctxt->userData);
4817
4818 cur = in->cur[0];
4819 next = in->cur[1];
4820 if ((cur == '<') && (next == '!') &&
4821 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4822 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4823 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4824 (UPP(8) == 'E')) {
4825 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004826 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004827 goto done;
4828#ifdef DEBUG_PUSH
4829 xmlGenericError(xmlGenericErrorContext,
4830 "HPP: Parsing internal subset\n");
4831#endif
4832 htmlParseDocTypeDecl(ctxt);
4833 ctxt->instate = XML_PARSER_PROLOG;
4834#ifdef DEBUG_PUSH
4835 xmlGenericError(xmlGenericErrorContext,
4836 "HPP: entering PROLOG\n");
4837#endif
4838 } else {
4839 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004840#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004841 xmlGenericError(xmlGenericErrorContext,
4842 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004843#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004844 }
Owen Taylor3473f882001-02-23 17:55:21 +00004845 break;
4846 case XML_PARSER_MISC:
4847 SKIP_BLANKS;
4848 if (in->buf == NULL)
4849 avail = in->length - (in->cur - in->base);
4850 else
4851 avail = in->buf->buffer->use - (in->cur - in->base);
4852 if (avail < 2)
4853 goto done;
4854 cur = in->cur[0];
4855 next = in->cur[1];
4856 if ((cur == '<') && (next == '!') &&
4857 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4858 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004859 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004860 goto done;
4861#ifdef DEBUG_PUSH
4862 xmlGenericError(xmlGenericErrorContext,
4863 "HPP: Parsing Comment\n");
4864#endif
4865 htmlParseComment(ctxt);
4866 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004867 } else if ((cur == '<') && (next == '?')) {
4868 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004869 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004870 goto done;
4871#ifdef DEBUG_PUSH
4872 xmlGenericError(xmlGenericErrorContext,
4873 "HPP: Parsing PI\n");
4874#endif
4875 htmlParsePI(ctxt);
4876 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004877 } else if ((cur == '<') && (next == '!') &&
4878 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4879 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4880 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4881 (UPP(8) == 'E')) {
4882 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004883 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004884 goto done;
4885#ifdef DEBUG_PUSH
4886 xmlGenericError(xmlGenericErrorContext,
4887 "HPP: Parsing internal subset\n");
4888#endif
4889 htmlParseDocTypeDecl(ctxt);
4890 ctxt->instate = XML_PARSER_PROLOG;
4891#ifdef DEBUG_PUSH
4892 xmlGenericError(xmlGenericErrorContext,
4893 "HPP: entering PROLOG\n");
4894#endif
4895 } else if ((cur == '<') && (next == '!') &&
4896 (avail < 9)) {
4897 goto done;
4898 } else {
4899 ctxt->instate = XML_PARSER_START_TAG;
4900#ifdef DEBUG_PUSH
4901 xmlGenericError(xmlGenericErrorContext,
4902 "HPP: entering START_TAG\n");
4903#endif
4904 }
4905 break;
4906 case XML_PARSER_PROLOG:
4907 SKIP_BLANKS;
4908 if (in->buf == NULL)
4909 avail = in->length - (in->cur - in->base);
4910 else
4911 avail = in->buf->buffer->use - (in->cur - in->base);
4912 if (avail < 2)
4913 goto done;
4914 cur = in->cur[0];
4915 next = in->cur[1];
4916 if ((cur == '<') && (next == '!') &&
4917 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4918 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004919 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004920 goto done;
4921#ifdef DEBUG_PUSH
4922 xmlGenericError(xmlGenericErrorContext,
4923 "HPP: Parsing Comment\n");
4924#endif
4925 htmlParseComment(ctxt);
4926 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004927 } else if ((cur == '<') && (next == '?')) {
4928 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004929 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004930 goto done;
4931#ifdef DEBUG_PUSH
4932 xmlGenericError(xmlGenericErrorContext,
4933 "HPP: Parsing PI\n");
4934#endif
4935 htmlParsePI(ctxt);
4936 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004937 } else if ((cur == '<') && (next == '!') &&
4938 (avail < 4)) {
4939 goto done;
4940 } else {
4941 ctxt->instate = XML_PARSER_START_TAG;
4942#ifdef DEBUG_PUSH
4943 xmlGenericError(xmlGenericErrorContext,
4944 "HPP: entering START_TAG\n");
4945#endif
4946 }
4947 break;
4948 case XML_PARSER_EPILOG:
4949 if (in->buf == NULL)
4950 avail = in->length - (in->cur - in->base);
4951 else
4952 avail = in->buf->buffer->use - (in->cur - in->base);
4953 if (avail < 1)
4954 goto done;
4955 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004956 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004957 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004958 goto done;
4959 }
4960 if (avail < 2)
4961 goto done;
4962 next = in->cur[1];
4963 if ((cur == '<') && (next == '!') &&
4964 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4965 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004966 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004967 goto done;
4968#ifdef DEBUG_PUSH
4969 xmlGenericError(xmlGenericErrorContext,
4970 "HPP: Parsing Comment\n");
4971#endif
4972 htmlParseComment(ctxt);
4973 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004974 } else if ((cur == '<') && (next == '?')) {
4975 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004976 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004977 goto done;
4978#ifdef DEBUG_PUSH
4979 xmlGenericError(xmlGenericErrorContext,
4980 "HPP: Parsing PI\n");
4981#endif
4982 htmlParsePI(ctxt);
4983 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004984 } else if ((cur == '<') && (next == '!') &&
4985 (avail < 4)) {
4986 goto done;
4987 } else {
4988 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004989 ctxt->wellFormed = 0;
4990 ctxt->instate = XML_PARSER_EOF;
4991#ifdef DEBUG_PUSH
4992 xmlGenericError(xmlGenericErrorContext,
4993 "HPP: entering EOF\n");
4994#endif
4995 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4996 ctxt->sax->endDocument(ctxt->userData);
4997 goto done;
4998 }
4999 break;
5000 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005001 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005002 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005003 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005004
5005 if (avail < 2)
5006 goto done;
5007 cur = in->cur[0];
5008 if (cur != '<') {
5009 ctxt->instate = XML_PARSER_CONTENT;
5010#ifdef DEBUG_PUSH
5011 xmlGenericError(xmlGenericErrorContext,
5012 "HPP: entering CONTENT\n");
5013#endif
5014 break;
5015 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005016 if (in->cur[1] == '/') {
5017 ctxt->instate = XML_PARSER_END_TAG;
5018 ctxt->checkIndex = 0;
5019#ifdef DEBUG_PUSH
5020 xmlGenericError(xmlGenericErrorContext,
5021 "HPP: entering END_TAG\n");
5022#endif
5023 break;
5024 }
Owen Taylor3473f882001-02-23 17:55:21 +00005025 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005026 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005027 goto done;
5028
Daniel Veillard597f1c12005-07-03 23:00:18 +00005029 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005030 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005031 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005032 (name == NULL)) {
5033 if (CUR == '>')
5034 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005035 break;
5036 }
Owen Taylor3473f882001-02-23 17:55:21 +00005037
5038 /*
5039 * Lookup the info for that element.
5040 */
5041 info = htmlTagLookup(name);
5042 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005043 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5044 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005045 }
5046
5047 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005048 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005049 */
5050 if ((CUR == '/') && (NXT(1) == '>')) {
5051 SKIP(2);
5052 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5053 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005054 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005055 ctxt->instate = XML_PARSER_CONTENT;
5056#ifdef DEBUG_PUSH
5057 xmlGenericError(xmlGenericErrorContext,
5058 "HPP: entering CONTENT\n");
5059#endif
5060 break;
5061 }
5062
5063 if (CUR == '>') {
5064 NEXT;
5065 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005066 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5067 "Couldn't find end of Start Tag %s\n",
5068 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005069
5070 /*
5071 * end of parsing of this node.
5072 */
5073 if (xmlStrEqual(name, ctxt->name)) {
5074 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005075 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005076 }
5077
5078 ctxt->instate = XML_PARSER_CONTENT;
5079#ifdef DEBUG_PUSH
5080 xmlGenericError(xmlGenericErrorContext,
5081 "HPP: entering CONTENT\n");
5082#endif
5083 break;
5084 }
5085
5086 /*
5087 * Check for an Empty Element from DTD definition
5088 */
5089 if ((info != NULL) && (info->empty)) {
5090 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5091 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005092 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005093 }
5094 ctxt->instate = XML_PARSER_CONTENT;
5095#ifdef DEBUG_PUSH
5096 xmlGenericError(xmlGenericErrorContext,
5097 "HPP: entering CONTENT\n");
5098#endif
5099 break;
5100 }
5101 case XML_PARSER_CONTENT: {
5102 long cons;
5103 /*
5104 * Handle preparsed entities and charRef
5105 */
5106 if (ctxt->token != 0) {
5107 xmlChar chr[2] = { 0 , 0 } ;
5108
5109 chr[0] = (xmlChar) ctxt->token;
5110 htmlCheckParagraph(ctxt);
5111 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5112 ctxt->sax->characters(ctxt->userData, chr, 1);
5113 ctxt->token = 0;
5114 ctxt->checkIndex = 0;
5115 }
5116 if ((avail == 1) && (terminate)) {
5117 cur = in->cur[0];
5118 if ((cur != '<') && (cur != '&')) {
5119 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005120 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005121 if (ctxt->sax->ignorableWhitespace != NULL)
5122 ctxt->sax->ignorableWhitespace(
5123 ctxt->userData, &cur, 1);
5124 } else {
5125 htmlCheckParagraph(ctxt);
5126 if (ctxt->sax->characters != NULL)
5127 ctxt->sax->characters(
5128 ctxt->userData, &cur, 1);
5129 }
5130 }
5131 ctxt->token = 0;
5132 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005133 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005134 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005135 }
Owen Taylor3473f882001-02-23 17:55:21 +00005136 }
5137 if (avail < 2)
5138 goto done;
5139 cur = in->cur[0];
5140 next = in->cur[1];
5141 cons = ctxt->nbChars;
5142 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5143 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5144 /*
5145 * Handle SCRIPT/STYLE separately
5146 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005147 if (!terminate) {
5148 int idx;
5149 xmlChar val;
5150
Jiri Netolicky446e1262009-08-07 17:05:36 +02005151 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
Daniel Veillard68716a72006-10-16 09:32:17 +00005152 if (idx < 0)
5153 goto done;
5154 val = in->cur[idx + 2];
5155 if (val == 0) /* bad cut of input */
5156 goto done;
5157 }
Owen Taylor3473f882001-02-23 17:55:21 +00005158 htmlParseScript(ctxt);
5159 if ((cur == '<') && (next == '/')) {
5160 ctxt->instate = XML_PARSER_END_TAG;
5161 ctxt->checkIndex = 0;
5162#ifdef DEBUG_PUSH
5163 xmlGenericError(xmlGenericErrorContext,
5164 "HPP: entering END_TAG\n");
5165#endif
5166 break;
5167 }
5168 } else {
5169 /*
5170 * Sometimes DOCTYPE arrives in the middle of the document
5171 */
5172 if ((cur == '<') && (next == '!') &&
5173 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5174 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5175 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5176 (UPP(8) == 'E')) {
5177 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005178 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005179 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005180 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5181 "Misplaced DOCTYPE declaration\n",
5182 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005183 htmlParseDocTypeDecl(ctxt);
5184 } else if ((cur == '<') && (next == '!') &&
5185 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5186 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005187 (htmlParseLookupSequence(
Jiri Netolicky446e1262009-08-07 17:05:36 +02005188 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005189 goto done;
5190#ifdef DEBUG_PUSH
5191 xmlGenericError(xmlGenericErrorContext,
5192 "HPP: Parsing Comment\n");
5193#endif
5194 htmlParseComment(ctxt);
5195 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005196 } else if ((cur == '<') && (next == '?')) {
5197 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005198 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005199 goto done;
5200#ifdef DEBUG_PUSH
5201 xmlGenericError(xmlGenericErrorContext,
5202 "HPP: Parsing PI\n");
5203#endif
5204 htmlParsePI(ctxt);
5205 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005206 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5207 goto done;
5208 } else if ((cur == '<') && (next == '/')) {
5209 ctxt->instate = XML_PARSER_END_TAG;
5210 ctxt->checkIndex = 0;
5211#ifdef DEBUG_PUSH
5212 xmlGenericError(xmlGenericErrorContext,
5213 "HPP: entering END_TAG\n");
5214#endif
5215 break;
5216 } else if (cur == '<') {
5217 ctxt->instate = XML_PARSER_START_TAG;
5218 ctxt->checkIndex = 0;
5219#ifdef DEBUG_PUSH
5220 xmlGenericError(xmlGenericErrorContext,
5221 "HPP: entering START_TAG\n");
5222#endif
5223 break;
5224 } else if (cur == '&') {
5225 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005226 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005227 goto done;
5228#ifdef DEBUG_PUSH
5229 xmlGenericError(xmlGenericErrorContext,
5230 "HPP: Parsing Reference\n");
5231#endif
5232 /* TODO: check generation of subtrees if noent !!! */
5233 htmlParseReference(ctxt);
5234 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005235 /*
5236 * check that the text sequence is complete
5237 * before handing out the data to the parser
5238 * to avoid problems with erroneous end of
5239 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005240 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005241 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005242 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0, 1) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005243 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005244 ctxt->checkIndex = 0;
5245#ifdef DEBUG_PUSH
5246 xmlGenericError(xmlGenericErrorContext,
5247 "HPP: Parsing char data\n");
5248#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005249 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005250 }
5251 }
5252 if (cons == ctxt->nbChars) {
5253 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005254 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5255 "detected an error in element content\n",
5256 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005257 }
5258 NEXT;
5259 break;
5260 }
5261
5262 break;
5263 }
5264 case XML_PARSER_END_TAG:
5265 if (avail < 2)
5266 goto done;
5267 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005268 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005269 goto done;
5270 htmlParseEndTag(ctxt);
5271 if (ctxt->nameNr == 0) {
5272 ctxt->instate = XML_PARSER_EPILOG;
5273 } else {
5274 ctxt->instate = XML_PARSER_CONTENT;
5275 }
5276 ctxt->checkIndex = 0;
5277#ifdef DEBUG_PUSH
5278 xmlGenericError(xmlGenericErrorContext,
5279 "HPP: entering CONTENT\n");
5280#endif
5281 break;
5282 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005283 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5284 "HPP: internal error, state == CDATA\n",
5285 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005286 ctxt->instate = XML_PARSER_CONTENT;
5287 ctxt->checkIndex = 0;
5288#ifdef DEBUG_PUSH
5289 xmlGenericError(xmlGenericErrorContext,
5290 "HPP: entering CONTENT\n");
5291#endif
5292 break;
5293 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005294 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5295 "HPP: internal error, state == DTD\n",
5296 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005297 ctxt->instate = XML_PARSER_CONTENT;
5298 ctxt->checkIndex = 0;
5299#ifdef DEBUG_PUSH
5300 xmlGenericError(xmlGenericErrorContext,
5301 "HPP: entering CONTENT\n");
5302#endif
5303 break;
5304 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005305 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5306 "HPP: internal error, state == COMMENT\n",
5307 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005308 ctxt->instate = XML_PARSER_CONTENT;
5309 ctxt->checkIndex = 0;
5310#ifdef DEBUG_PUSH
5311 xmlGenericError(xmlGenericErrorContext,
5312 "HPP: entering CONTENT\n");
5313#endif
5314 break;
5315 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005316 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5317 "HPP: internal error, state == PI\n",
5318 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005319 ctxt->instate = XML_PARSER_CONTENT;
5320 ctxt->checkIndex = 0;
5321#ifdef DEBUG_PUSH
5322 xmlGenericError(xmlGenericErrorContext,
5323 "HPP: entering CONTENT\n");
5324#endif
5325 break;
5326 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005327 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5328 "HPP: internal error, state == ENTITY_DECL\n",
5329 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005330 ctxt->instate = XML_PARSER_CONTENT;
5331 ctxt->checkIndex = 0;
5332#ifdef DEBUG_PUSH
5333 xmlGenericError(xmlGenericErrorContext,
5334 "HPP: entering CONTENT\n");
5335#endif
5336 break;
5337 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005338 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5339 "HPP: internal error, state == ENTITY_VALUE\n",
5340 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005341 ctxt->instate = XML_PARSER_CONTENT;
5342 ctxt->checkIndex = 0;
5343#ifdef DEBUG_PUSH
5344 xmlGenericError(xmlGenericErrorContext,
5345 "HPP: entering DTD\n");
5346#endif
5347 break;
5348 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005349 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5350 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5351 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005352 ctxt->instate = XML_PARSER_START_TAG;
5353 ctxt->checkIndex = 0;
5354#ifdef DEBUG_PUSH
5355 xmlGenericError(xmlGenericErrorContext,
5356 "HPP: entering START_TAG\n");
5357#endif
5358 break;
5359 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005360 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5361 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5362 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005363 ctxt->instate = XML_PARSER_CONTENT;
5364 ctxt->checkIndex = 0;
5365#ifdef DEBUG_PUSH
5366 xmlGenericError(xmlGenericErrorContext,
5367 "HPP: entering CONTENT\n");
5368#endif
5369 break;
5370 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005371 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5372 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5373 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005374 ctxt->instate = XML_PARSER_CONTENT;
5375 ctxt->checkIndex = 0;
5376#ifdef DEBUG_PUSH
5377 xmlGenericError(xmlGenericErrorContext,
5378 "HPP: entering CONTENT\n");
5379#endif
5380 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005381 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005382 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5383 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5384 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005385 ctxt->instate = XML_PARSER_CONTENT;
5386 ctxt->checkIndex = 0;
5387#ifdef DEBUG_PUSH
5388 xmlGenericError(xmlGenericErrorContext,
5389 "HPP: entering CONTENT\n");
5390#endif
5391 break;
5392
Owen Taylor3473f882001-02-23 17:55:21 +00005393 }
5394 }
5395done:
5396 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005397 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005398 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5399 /*
5400 * SAX: end of the document processing.
5401 */
5402 ctxt->instate = XML_PARSER_EOF;
5403 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5404 ctxt->sax->endDocument(ctxt->userData);
5405 }
5406 }
5407 if ((ctxt->myDoc != NULL) &&
5408 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5409 (ctxt->instate == XML_PARSER_EPILOG))) {
5410 xmlDtdPtr dtd;
5411 dtd = xmlGetIntSubset(ctxt->myDoc);
5412 if (dtd == NULL)
5413 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00005414 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005415 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5416 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5417 }
5418#ifdef DEBUG_PUSH
5419 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5420#endif
5421 return(ret);
5422}
5423
5424/**
Owen Taylor3473f882001-02-23 17:55:21 +00005425 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005426 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005427 * @chunk: an char array
5428 * @size: the size in byte of the chunk
5429 * @terminate: last chunk indicator
5430 *
5431 * Parse a Chunk of memory
5432 *
5433 * Returns zero if no error, the xmlParserErrors otherwise.
5434 */
5435int
5436htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5437 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005438 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5439 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5440 "htmlParseChunk: context error\n", NULL, NULL);
5441 return(XML_ERR_INTERNAL_ERROR);
5442 }
Owen Taylor3473f882001-02-23 17:55:21 +00005443 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5444 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5445 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5446 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005447 int res;
Owen Taylor3473f882001-02-23 17:55:21 +00005448
Daniel Veillardd2755a82005-08-07 23:42:39 +00005449 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5450 if (res < 0) {
5451 ctxt->errNo = XML_PARSER_EOF;
5452 ctxt->disableSAX = 1;
5453 return (XML_PARSER_EOF);
5454 }
Owen Taylor3473f882001-02-23 17:55:21 +00005455 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5456 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005457 ctxt->input->end =
5458 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005459#ifdef DEBUG_PUSH
5460 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5461#endif
5462
Daniel Veillard14f752c2003-08-09 11:44:50 +00005463#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005464 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5465 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005466#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005467 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005468 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5469 xmlParserInputBufferPtr in = ctxt->input->buf;
5470 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5471 (in->raw != NULL)) {
5472 int nbchars;
5473
5474 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5475 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005476 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5477 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005478 return(XML_ERR_INVALID_ENCODING);
5479 }
5480 }
5481 }
Owen Taylor3473f882001-02-23 17:55:21 +00005482 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005483 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005484 if (terminate) {
5485 if ((ctxt->instate != XML_PARSER_EOF) &&
5486 (ctxt->instate != XML_PARSER_EPILOG) &&
5487 (ctxt->instate != XML_PARSER_MISC)) {
5488 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005489 ctxt->wellFormed = 0;
5490 }
5491 if (ctxt->instate != XML_PARSER_EOF) {
5492 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5493 ctxt->sax->endDocument(ctxt->userData);
5494 }
5495 ctxt->instate = XML_PARSER_EOF;
5496 }
5497 return((xmlParserErrors) ctxt->errNo);
5498}
5499
5500/************************************************************************
5501 * *
5502 * User entry points *
5503 * *
5504 ************************************************************************/
5505
5506/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005507 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005508 * @sax: a SAX handler
5509 * @user_data: The user data returned on SAX callbacks
5510 * @chunk: a pointer to an array of chars
5511 * @size: number of chars in the array
5512 * @filename: an optional file name or URI
5513 * @enc: an optional encoding
5514 *
5515 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005516 * The value of @filename is used for fetching external entities
5517 * and error/warning reports.
5518 *
5519 * Returns the new parser context or NULL
5520 */
5521htmlParserCtxtPtr
5522htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5523 const char *chunk, int size, const char *filename,
5524 xmlCharEncoding enc) {
5525 htmlParserCtxtPtr ctxt;
5526 htmlParserInputPtr inputStream;
5527 xmlParserInputBufferPtr buf;
5528
Daniel Veillardd0463562001-10-13 09:15:48 +00005529 xmlInitParser();
5530
Owen Taylor3473f882001-02-23 17:55:21 +00005531 buf = xmlAllocParserInputBuffer(enc);
5532 if (buf == NULL) return(NULL);
5533
Daniel Veillardf403d292003-10-05 13:51:35 +00005534 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005535 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005536 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005537 return(NULL);
5538 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005539 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5540 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005541 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005542 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005543 xmlFree(ctxt->sax);
5544 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5545 if (ctxt->sax == NULL) {
5546 xmlFree(buf);
5547 xmlFree(ctxt);
5548 return(NULL);
5549 }
5550 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5551 if (user_data != NULL)
5552 ctxt->userData = user_data;
5553 }
5554 if (filename == NULL) {
5555 ctxt->directory = NULL;
5556 } else {
5557 ctxt->directory = xmlParserGetDirectory(filename);
5558 }
5559
5560 inputStream = htmlNewInputStream(ctxt);
5561 if (inputStream == NULL) {
5562 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005563 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005564 return(NULL);
5565 }
5566
5567 if (filename == NULL)
5568 inputStream->filename = NULL;
5569 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005570 inputStream->filename = (char *)
5571 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005572 inputStream->buf = buf;
5573 inputStream->base = inputStream->buf->buffer->content;
5574 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005575 inputStream->end =
5576 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005577
5578 inputPush(ctxt, inputStream);
5579
5580 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5581 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005582 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5583 int cur = ctxt->input->cur - ctxt->input->base;
5584
Owen Taylor3473f882001-02-23 17:55:21 +00005585 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005586
5587 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5588 ctxt->input->cur = ctxt->input->base + cur;
5589 ctxt->input->end =
5590 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005591#ifdef DEBUG_PUSH
5592 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5593#endif
5594 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005595 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005596
5597 return(ctxt);
5598}
William M. Brack21e4ef22005-01-02 09:53:13 +00005599#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005600
5601/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005602 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005603 * @cur: a pointer to an array of xmlChar
5604 * @encoding: a free form C string describing the HTML document encoding, or NULL
5605 * @sax: the SAX handler block
5606 * @userData: if using SAX, this pointer will be provided on callbacks.
5607 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005608 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5609 * to handle parse events. If sax is NULL, fallback to the default DOM
5610 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005611 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005612 * Returns the resulting document tree unless SAX is NULL or the document is
5613 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005614 */
5615
5616htmlDocPtr
5617htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5618 htmlDocPtr ret;
5619 htmlParserCtxtPtr ctxt;
5620
Daniel Veillardd0463562001-10-13 09:15:48 +00005621 xmlInitParser();
5622
Owen Taylor3473f882001-02-23 17:55:21 +00005623 if (cur == NULL) return(NULL);
5624
5625
5626 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5627 if (ctxt == NULL) return(NULL);
5628 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005629 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005630 ctxt->sax = sax;
5631 ctxt->userData = userData;
5632 }
5633
5634 htmlParseDocument(ctxt);
5635 ret = ctxt->myDoc;
5636 if (sax != NULL) {
5637 ctxt->sax = NULL;
5638 ctxt->userData = NULL;
5639 }
5640 htmlFreeParserCtxt(ctxt);
5641
5642 return(ret);
5643}
5644
5645/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005646 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005647 * @cur: a pointer to an array of xmlChar
5648 * @encoding: a free form C string describing the HTML document encoding, or NULL
5649 *
5650 * parse an HTML in-memory document and build a tree.
5651 *
5652 * Returns the resulting document tree
5653 */
5654
5655htmlDocPtr
5656htmlParseDoc(xmlChar *cur, const char *encoding) {
5657 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5658}
5659
5660
5661/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005662 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005663 * @filename: the filename
5664 * @encoding: a free form C string describing the HTML document encoding, or NULL
5665 *
5666 * Create a parser context for a file content.
5667 * Automatic support for ZLIB/Compress compressed document is provided
5668 * by default if found at compile-time.
5669 *
5670 * Returns the new parser context or NULL
5671 */
5672htmlParserCtxtPtr
5673htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5674{
5675 htmlParserCtxtPtr ctxt;
5676 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005677 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005678 /* htmlCharEncoding enc; */
5679 xmlChar *content, *content_line = (xmlChar *) "charset=";
5680
Daniel Veillarda03e3652004-11-02 18:45:30 +00005681 if (filename == NULL)
5682 return(NULL);
5683
Daniel Veillardf403d292003-10-05 13:51:35 +00005684 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005685 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005686 return(NULL);
5687 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005688 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5689 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005690#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005691 if (xmlDefaultSAXHandler.error != NULL) {
5692 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5693 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005694#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005695 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005696 return(NULL);
5697 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005698
5699 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5700 xmlFree(canonicFilename);
5701 if (inputStream == NULL) {
5702 xmlFreeParserCtxt(ctxt);
5703 return(NULL);
5704 }
Owen Taylor3473f882001-02-23 17:55:21 +00005705
5706 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005707
Owen Taylor3473f882001-02-23 17:55:21 +00005708 /* set encoding */
5709 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005710 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005711 if (content) {
5712 strcpy ((char *)content, (char *)content_line);
5713 strcat ((char *)content, (char *)encoding);
5714 htmlCheckEncoding (ctxt, content);
5715 xmlFree (content);
5716 }
5717 }
5718
5719 return(ctxt);
5720}
5721
5722/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005723 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005724 * @filename: the filename
5725 * @encoding: a free form C string describing the HTML document encoding, or NULL
5726 * @sax: the SAX handler block
5727 * @userData: if using SAX, this pointer will be provided on callbacks.
5728 *
5729 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5730 * compressed document is provided by default if found at compile-time.
5731 * It use the given SAX function block to handle the parsing callback.
5732 * If sax is NULL, fallback to the default DOM tree building routines.
5733 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005734 * Returns the resulting document tree unless SAX is NULL or the document is
5735 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005736 */
5737
5738htmlDocPtr
5739htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5740 void *userData) {
5741 htmlDocPtr ret;
5742 htmlParserCtxtPtr ctxt;
5743 htmlSAXHandlerPtr oldsax = NULL;
5744
Daniel Veillardd0463562001-10-13 09:15:48 +00005745 xmlInitParser();
5746
Owen Taylor3473f882001-02-23 17:55:21 +00005747 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5748 if (ctxt == NULL) return(NULL);
5749 if (sax != NULL) {
5750 oldsax = ctxt->sax;
5751 ctxt->sax = sax;
5752 ctxt->userData = userData;
5753 }
5754
5755 htmlParseDocument(ctxt);
5756
5757 ret = ctxt->myDoc;
5758 if (sax != NULL) {
5759 ctxt->sax = oldsax;
5760 ctxt->userData = NULL;
5761 }
5762 htmlFreeParserCtxt(ctxt);
5763
5764 return(ret);
5765}
5766
5767/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005768 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005769 * @filename: the filename
5770 * @encoding: a free form C string describing the HTML document encoding, or NULL
5771 *
5772 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5773 * compressed document is provided by default if found at compile-time.
5774 *
5775 * Returns the resulting document tree
5776 */
5777
5778htmlDocPtr
5779htmlParseFile(const char *filename, const char *encoding) {
5780 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5781}
5782
5783/**
5784 * htmlHandleOmittedElem:
5785 * @val: int 0 or 1
5786 *
5787 * Set and return the previous value for handling HTML omitted tags.
5788 *
5789 * Returns the last value for 0 for no handling, 1 for auto insertion.
5790 */
5791
5792int
5793htmlHandleOmittedElem(int val) {
5794 int old = htmlOmittedDefaultValue;
5795
5796 htmlOmittedDefaultValue = val;
5797 return(old);
5798}
5799
Daniel Veillard930dfb62003-02-05 10:17:38 +00005800/**
5801 * htmlElementAllowedHere:
5802 * @parent: HTML parent element
5803 * @elt: HTML element
5804 *
5805 * Checks whether an HTML element may be a direct child of a parent element.
5806 * Note - doesn't check for deprecated elements
5807 *
5808 * Returns 1 if allowed; 0 otherwise.
5809 */
5810int
5811htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5812 const char** p ;
5813
5814 if ( ! elt || ! parent || ! parent->subelts )
5815 return 0 ;
5816
5817 for ( p = parent->subelts; *p; ++p )
5818 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5819 return 1 ;
5820
5821 return 0 ;
5822}
5823/**
5824 * htmlElementStatusHere:
5825 * @parent: HTML parent element
5826 * @elt: HTML element
5827 *
5828 * Checks whether an HTML element may be a direct child of a parent element.
5829 * and if so whether it is valid or deprecated.
5830 *
5831 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5832 */
5833htmlStatus
5834htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5835 if ( ! parent || ! elt )
5836 return HTML_INVALID ;
5837 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5838 return HTML_INVALID ;
5839
5840 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5841}
5842/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005843 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005844 * @elt: HTML element
5845 * @attr: HTML attribute
5846 * @legacy: whether to allow deprecated attributes
5847 *
5848 * Checks whether an attribute is valid for an element
5849 * Has full knowledge of Required and Deprecated attributes
5850 *
5851 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5852 */
5853htmlStatus
5854htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5855 const char** p ;
5856
5857 if ( !elt || ! attr )
5858 return HTML_INVALID ;
5859
5860 if ( elt->attrs_req )
5861 for ( p = elt->attrs_req; *p; ++p)
5862 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5863 return HTML_REQUIRED ;
5864
5865 if ( elt->attrs_opt )
5866 for ( p = elt->attrs_opt; *p; ++p)
5867 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5868 return HTML_VALID ;
5869
5870 if ( legacy && elt->attrs_depr )
5871 for ( p = elt->attrs_depr; *p; ++p)
5872 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5873 return HTML_DEPRECATED ;
5874
5875 return HTML_INVALID ;
5876}
5877/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005878 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005879 * @node: an htmlNodePtr in a tree
5880 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005881 * for Element nodes)
5882 *
5883 * Checks whether the tree node is valid. Experimental (the author
5884 * only uses the HTML enhancements in a SAX parser)
5885 *
5886 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5887 * legacy allowed) or htmlElementStatusHere (otherwise).
5888 * for Attribute nodes, a return from htmlAttrAllowed
5889 * for other nodes, HTML_NA (no checks performed)
5890 */
5891htmlStatus
5892htmlNodeStatus(const htmlNodePtr node, int legacy) {
5893 if ( ! node )
5894 return HTML_INVALID ;
5895
5896 switch ( node->type ) {
5897 case XML_ELEMENT_NODE:
5898 return legacy
5899 ? ( htmlElementAllowedHere (
5900 htmlTagLookup(node->parent->name) , node->name
5901 ) ? HTML_VALID : HTML_INVALID )
5902 : htmlElementStatusHere(
5903 htmlTagLookup(node->parent->name) ,
5904 htmlTagLookup(node->name) )
5905 ;
5906 case XML_ATTRIBUTE_NODE:
5907 return htmlAttrAllowed(
5908 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5909 default: return HTML_NA ;
5910 }
5911}
Daniel Veillard9475a352003-09-26 12:47:50 +00005912/************************************************************************
5913 * *
5914 * New set (2.6.0) of simpler and more flexible APIs *
5915 * *
5916 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A variable named dict must be visible where the macro
 * is used.  The expansion is a single statement, so the macro is safe
 * inside unbraced if/else; terminate each use with a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
5928
5929/**
5930 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005931 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005932 *
5933 * Reset a parser context
5934 */
5935void
5936htmlCtxtReset(htmlParserCtxtPtr ctxt)
5937{
5938 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005939 xmlDictPtr dict;
5940
5941 if (ctxt == NULL)
5942 return;
5943
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005944 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00005945 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005946
5947 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5948 xmlFreeInputStream(input);
5949 }
5950 ctxt->inputNr = 0;
5951 ctxt->input = NULL;
5952
5953 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005954 if (ctxt->spaceTab != NULL) {
5955 ctxt->spaceTab[0] = -1;
5956 ctxt->space = &ctxt->spaceTab[0];
5957 } else {
5958 ctxt->space = NULL;
5959 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005960
5961
5962 ctxt->nodeNr = 0;
5963 ctxt->node = NULL;
5964
5965 ctxt->nameNr = 0;
5966 ctxt->name = NULL;
5967
5968 DICT_FREE(ctxt->version);
5969 ctxt->version = NULL;
5970 DICT_FREE(ctxt->encoding);
5971 ctxt->encoding = NULL;
5972 DICT_FREE(ctxt->directory);
5973 ctxt->directory = NULL;
5974 DICT_FREE(ctxt->extSubURI);
5975 ctxt->extSubURI = NULL;
5976 DICT_FREE(ctxt->extSubSystem);
5977 ctxt->extSubSystem = NULL;
5978 if (ctxt->myDoc != NULL)
5979 xmlFreeDoc(ctxt->myDoc);
5980 ctxt->myDoc = NULL;
5981
5982 ctxt->standalone = -1;
5983 ctxt->hasExternalSubset = 0;
5984 ctxt->hasPErefs = 0;
5985 ctxt->html = 1;
5986 ctxt->external = 0;
5987 ctxt->instate = XML_PARSER_START;
5988 ctxt->token = 0;
5989
5990 ctxt->wellFormed = 1;
5991 ctxt->nsWellFormed = 1;
5992 ctxt->valid = 1;
5993 ctxt->vctxt.userData = ctxt;
5994 ctxt->vctxt.error = xmlParserValidityError;
5995 ctxt->vctxt.warning = xmlParserValidityWarning;
5996 ctxt->record_info = 0;
5997 ctxt->nbChars = 0;
5998 ctxt->checkIndex = 0;
5999 ctxt->inSubset = 0;
6000 ctxt->errNo = XML_ERR_OK;
6001 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006002 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006003 ctxt->catalogs = NULL;
6004 xmlInitNodeInfoSeq(&ctxt->node_seq);
6005
6006 if (ctxt->attsDefault != NULL) {
6007 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6008 ctxt->attsDefault = NULL;
6009 }
6010 if (ctxt->attsSpecial != NULL) {
6011 xmlHashFree(ctxt->attsSpecial, NULL);
6012 ctxt->attsSpecial = NULL;
6013 }
6014}
6015
6016/**
6017 * htmlCtxtUseOptions:
6018 * @ctxt: an HTML parser context
6019 * @options: a combination of htmlParserOption(s)
6020 *
6021 * Applies the options to the parser context
6022 *
6023 * Returns 0 in case of success, the set of unknown or unimplemented options
6024 * in case of error.
6025 */
6026int
6027htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6028{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006029 if (ctxt == NULL)
6030 return(-1);
6031
Daniel Veillard9475a352003-09-26 12:47:50 +00006032 if (options & HTML_PARSE_NOWARNING) {
6033 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006034 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006035 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006036 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006037 }
6038 if (options & HTML_PARSE_NOERROR) {
6039 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006040 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006041 ctxt->sax->fatalError = NULL;
6042 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006043 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006044 }
6045 if (options & HTML_PARSE_PEDANTIC) {
6046 ctxt->pedantic = 1;
6047 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006048 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006049 } else
6050 ctxt->pedantic = 0;
6051 if (options & XML_PARSE_NOBLANKS) {
6052 ctxt->keepBlanks = 0;
6053 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6054 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006055 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006056 } else
6057 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006058 if (options & HTML_PARSE_RECOVER) {
6059 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006060 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006061 } else
6062 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006063 if (options & HTML_PARSE_COMPACT) {
6064 ctxt->options |= HTML_PARSE_COMPACT;
6065 options -= HTML_PARSE_COMPACT;
6066 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006067 ctxt->dictNames = 0;
6068 return (options);
6069}
6070
6071/**
6072 * htmlDoRead:
6073 * @ctxt: an HTML parser context
6074 * @URL: the base URL to use for the document
6075 * @encoding: the document encoding, or NULL
6076 * @options: a combination of htmlParserOption(s)
6077 * @reuse: keep the context for reuse
6078 *
6079 * Common front-end for the htmlRead functions
6080 *
6081 * Returns the resulting document tree or NULL
6082 */
6083static htmlDocPtr
6084htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6085 int options, int reuse)
6086{
6087 htmlDocPtr ret;
6088
6089 htmlCtxtUseOptions(ctxt, options);
6090 ctxt->html = 1;
6091 if (encoding != NULL) {
6092 xmlCharEncodingHandlerPtr hdlr;
6093
6094 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006095 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006096 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006097 if (ctxt->input->encoding != NULL)
6098 xmlFree((xmlChar *) ctxt->input->encoding);
6099 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6100 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006101 }
6102 if ((URL != NULL) && (ctxt->input != NULL) &&
6103 (ctxt->input->filename == NULL))
6104 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6105 htmlParseDocument(ctxt);
6106 ret = ctxt->myDoc;
6107 ctxt->myDoc = NULL;
6108 if (!reuse) {
6109 if ((ctxt->dictNames) &&
6110 (ret != NULL) &&
6111 (ret->dict == ctxt->dict))
6112 ctxt->dict = NULL;
6113 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006114 }
6115 return (ret);
6116}
6117
6118/**
6119 * htmlReadDoc:
6120 * @cur: a pointer to a zero terminated string
6121 * @URL: the base URL to use for the document
6122 * @encoding: the document encoding, or NULL
6123 * @options: a combination of htmlParserOption(s)
6124 *
6125 * parse an XML in-memory document and build a tree.
6126 *
6127 * Returns the resulting document tree
6128 */
6129htmlDocPtr
6130htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6131{
6132 htmlParserCtxtPtr ctxt;
6133
6134 if (cur == NULL)
6135 return (NULL);
6136
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006137 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006138 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006139 if (ctxt == NULL)
6140 return (NULL);
6141 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6142}
6143
6144/**
6145 * htmlReadFile:
6146 * @filename: a file or URL
6147 * @encoding: the document encoding, or NULL
6148 * @options: a combination of htmlParserOption(s)
6149 *
6150 * parse an XML file from the filesystem or the network.
6151 *
6152 * Returns the resulting document tree
6153 */
6154htmlDocPtr
6155htmlReadFile(const char *filename, const char *encoding, int options)
6156{
6157 htmlParserCtxtPtr ctxt;
6158
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006159 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006160 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6161 if (ctxt == NULL)
6162 return (NULL);
6163 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6164}
6165
6166/**
6167 * htmlReadMemory:
6168 * @buffer: a pointer to a char array
6169 * @size: the size of the array
6170 * @URL: the base URL to use for the document
6171 * @encoding: the document encoding, or NULL
6172 * @options: a combination of htmlParserOption(s)
6173 *
6174 * parse an XML in-memory document and build a tree.
6175 *
6176 * Returns the resulting document tree
6177 */
6178htmlDocPtr
6179htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6180{
6181 htmlParserCtxtPtr ctxt;
6182
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006183 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006184 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6185 if (ctxt == NULL)
6186 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006187 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006188 if (ctxt->sax != NULL)
6189 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006190 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6191}
6192
6193/**
6194 * htmlReadFd:
6195 * @fd: an open file descriptor
6196 * @URL: the base URL to use for the document
6197 * @encoding: the document encoding, or NULL
6198 * @options: a combination of htmlParserOption(s)
6199 *
6200 * parse an XML from a file descriptor and build a tree.
6201 *
6202 * Returns the resulting document tree
6203 */
6204htmlDocPtr
6205htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6206{
6207 htmlParserCtxtPtr ctxt;
6208 xmlParserInputBufferPtr input;
6209 xmlParserInputPtr stream;
6210
6211 if (fd < 0)
6212 return (NULL);
6213
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006214 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006215 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6216 if (input == NULL)
6217 return (NULL);
6218 ctxt = xmlNewParserCtxt();
6219 if (ctxt == NULL) {
6220 xmlFreeParserInputBuffer(input);
6221 return (NULL);
6222 }
6223 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6224 if (stream == NULL) {
6225 xmlFreeParserInputBuffer(input);
6226 xmlFreeParserCtxt(ctxt);
6227 return (NULL);
6228 }
6229 inputPush(ctxt, stream);
6230 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6231}
6232
6233/**
6234 * htmlReadIO:
6235 * @ioread: an I/O read function
6236 * @ioclose: an I/O close function
6237 * @ioctx: an I/O handler
6238 * @URL: the base URL to use for the document
6239 * @encoding: the document encoding, or NULL
6240 * @options: a combination of htmlParserOption(s)
6241 *
6242 * parse an HTML document from I/O functions and source and build a tree.
6243 *
6244 * Returns the resulting document tree
6245 */
6246htmlDocPtr
6247htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6248 void *ioctx, const char *URL, const char *encoding, int options)
6249{
6250 htmlParserCtxtPtr ctxt;
6251 xmlParserInputBufferPtr input;
6252 xmlParserInputPtr stream;
6253
6254 if (ioread == NULL)
6255 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006256 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006257
6258 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6259 XML_CHAR_ENCODING_NONE);
6260 if (input == NULL)
6261 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006262 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006263 if (ctxt == NULL) {
6264 xmlFreeParserInputBuffer(input);
6265 return (NULL);
6266 }
6267 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6268 if (stream == NULL) {
6269 xmlFreeParserInputBuffer(input);
6270 xmlFreeParserCtxt(ctxt);
6271 return (NULL);
6272 }
6273 inputPush(ctxt, stream);
6274 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6275}
6276
6277/**
6278 * htmlCtxtReadDoc:
6279 * @ctxt: an HTML parser context
6280 * @cur: a pointer to a zero terminated string
6281 * @URL: the base URL to use for the document
6282 * @encoding: the document encoding, or NULL
6283 * @options: a combination of htmlParserOption(s)
6284 *
6285 * parse an XML in-memory document and build a tree.
6286 * This reuses the existing @ctxt parser context
6287 *
6288 * Returns the resulting document tree
6289 */
6290htmlDocPtr
6291htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6292 const char *URL, const char *encoding, int options)
6293{
6294 xmlParserInputPtr stream;
6295
6296 if (cur == NULL)
6297 return (NULL);
6298 if (ctxt == NULL)
6299 return (NULL);
6300
6301 htmlCtxtReset(ctxt);
6302
6303 stream = xmlNewStringInputStream(ctxt, cur);
6304 if (stream == NULL) {
6305 return (NULL);
6306 }
6307 inputPush(ctxt, stream);
6308 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6309}
6310
6311/**
6312 * htmlCtxtReadFile:
6313 * @ctxt: an HTML parser context
6314 * @filename: a file or URL
6315 * @encoding: the document encoding, or NULL
6316 * @options: a combination of htmlParserOption(s)
6317 *
6318 * parse an XML file from the filesystem or the network.
6319 * This reuses the existing @ctxt parser context
6320 *
6321 * Returns the resulting document tree
6322 */
6323htmlDocPtr
6324htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6325 const char *encoding, int options)
6326{
6327 xmlParserInputPtr stream;
6328
6329 if (filename == NULL)
6330 return (NULL);
6331 if (ctxt == NULL)
6332 return (NULL);
6333
6334 htmlCtxtReset(ctxt);
6335
Daniel Veillard29614c72004-11-26 10:47:26 +00006336 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006337 if (stream == NULL) {
6338 return (NULL);
6339 }
6340 inputPush(ctxt, stream);
6341 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6342}
6343
6344/**
6345 * htmlCtxtReadMemory:
6346 * @ctxt: an HTML parser context
6347 * @buffer: a pointer to a char array
6348 * @size: the size of the array
6349 * @URL: the base URL to use for the document
6350 * @encoding: the document encoding, or NULL
6351 * @options: a combination of htmlParserOption(s)
6352 *
6353 * parse an XML in-memory document and build a tree.
6354 * This reuses the existing @ctxt parser context
6355 *
6356 * Returns the resulting document tree
6357 */
6358htmlDocPtr
6359htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6360 const char *URL, const char *encoding, int options)
6361{
6362 xmlParserInputBufferPtr input;
6363 xmlParserInputPtr stream;
6364
6365 if (ctxt == NULL)
6366 return (NULL);
6367 if (buffer == NULL)
6368 return (NULL);
6369
6370 htmlCtxtReset(ctxt);
6371
6372 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6373 if (input == NULL) {
6374 return(NULL);
6375 }
6376
6377 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6378 if (stream == NULL) {
6379 xmlFreeParserInputBuffer(input);
6380 return(NULL);
6381 }
6382
6383 inputPush(ctxt, stream);
6384 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6385}
6386
6387/**
6388 * htmlCtxtReadFd:
6389 * @ctxt: an HTML parser context
6390 * @fd: an open file descriptor
6391 * @URL: the base URL to use for the document
6392 * @encoding: the document encoding, or NULL
6393 * @options: a combination of htmlParserOption(s)
6394 *
6395 * parse an XML from a file descriptor and build a tree.
6396 * This reuses the existing @ctxt parser context
6397 *
6398 * Returns the resulting document tree
6399 */
6400htmlDocPtr
6401htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6402 const char *URL, const char *encoding, int options)
6403{
6404 xmlParserInputBufferPtr input;
6405 xmlParserInputPtr stream;
6406
6407 if (fd < 0)
6408 return (NULL);
6409 if (ctxt == NULL)
6410 return (NULL);
6411
6412 htmlCtxtReset(ctxt);
6413
6414
6415 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6416 if (input == NULL)
6417 return (NULL);
6418 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6419 if (stream == NULL) {
6420 xmlFreeParserInputBuffer(input);
6421 return (NULL);
6422 }
6423 inputPush(ctxt, stream);
6424 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6425}
6426
6427/**
6428 * htmlCtxtReadIO:
6429 * @ctxt: an HTML parser context
6430 * @ioread: an I/O read function
6431 * @ioclose: an I/O close function
6432 * @ioctx: an I/O handler
6433 * @URL: the base URL to use for the document
6434 * @encoding: the document encoding, or NULL
6435 * @options: a combination of htmlParserOption(s)
6436 *
6437 * parse an HTML document from I/O functions and source and build a tree.
6438 * This reuses the existing @ctxt parser context
6439 *
6440 * Returns the resulting document tree
6441 */
6442htmlDocPtr
6443htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6444 xmlInputCloseCallback ioclose, void *ioctx,
6445 const char *URL,
6446 const char *encoding, int options)
6447{
6448 xmlParserInputBufferPtr input;
6449 xmlParserInputPtr stream;
6450
6451 if (ioread == NULL)
6452 return (NULL);
6453 if (ctxt == NULL)
6454 return (NULL);
6455
6456 htmlCtxtReset(ctxt);
6457
6458 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6459 XML_CHAR_ENCODING_NONE);
6460 if (input == NULL)
6461 return (NULL);
6462 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6463 if (stream == NULL) {
6464 xmlFreeParserInputBuffer(input);
6465 return (NULL);
6466 }
6467 inputPush(ctxt, stream);
6468 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6469}
6470
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006471#define bottom_HTMLparser
6472#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006473#endif /* LIBXML_HTML_ENABLED */