blob: c64590e004e9bf14384fc6c4e80f22b9a670b85d [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020062 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000063 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200150 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167 ctxt->html = 3;
168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 if (ctxt->nameNr >= ctxt->nameMax) {
171 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000172 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000173 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 ctxt->nameMax *
175 sizeof(ctxt->nameTab[0]));
176 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000177 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000178 return (0);
179 }
180 }
181 ctxt->nameTab[ctxt->nameNr] = value;
182 ctxt->name = value;
183 return (ctxt->nameNr++);
184}
185/**
186 * htmlnamePop:
187 * @ctxt: an HTML parser context
188 *
189 * Pops the top element name from the name stack
190 *
191 * Returns the name just removed
192 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000193static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194htmlnamePop(htmlParserCtxtPtr ctxt)
195{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000197
Daniel Veillard1c732d22002-11-30 11:22:59 +0000198 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000199 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000200 ctxt->nameNr--;
201 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 if (ctxt->nameNr > 0)
204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205 else
206 ctxt->name = NULL;
207 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000208 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000209 return (ret);
210}
Owen Taylor3473f882001-02-23 17:55:21 +0000211
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a parser context variable named
 * `ctxt` to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value as an int. This should be
 *           used internally by the parser only to compare to ASCII values
 *           otherwise it would break when running with UTF-8 encoding.
 *   RAW     like CUR, but yields -1 while a token is pending in ctxt->token.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPPER   returns the current xmlChar converted to uppercase.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser: the line counter
 *           is not updated. The argument is evaluated three times.
 *   SHRINK  discard consumed input once enough of it has been parsed.
 *   GROW    read more input when less than INPUT_CHUNK bytes remain; does
 *           nothing when ctxt->progressive is set.
 *
 * Clean macros, not dependent of an ASCII context
 *
 *   CURRENT     Returns the current xmlChar value as an int.
 *   NEXT        Skip to the next character through xmlNextChar().
 *   NEXTL(l)    Skip the current character of l xmlChars long, keeping the
 *               line and column counters up to date.
 *   SKIP_BLANKS Skip blank characters through htmlSkipBlankChars().
 *   CUR_CHAR(l) Current character through htmlCurrentChar(), its length
 *               is stored in l.
 *   COPY_BUF(l,b,i,v) append the character v of length l at b[i] and
 *               advance i accordingly.
 *
 * The statement-like macros (SHRINK, GROW, NEXTL, COPY_BUF) are wrapped in
 * do { } while (0) so that they behave as a single statement, including
 * in front of an `else`.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), \
     ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
        xmlParserInputShrink(ctxt->input);				\
  } while (0)

#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
293
294/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200295 * htmlFindEncoding:
296 * @the HTML parser context
297 *
298 * Ty to find and encoding in the current data available in the input
299 * buffer this is needed to try to switch to the proper encoding when
300 * one face a character error.
301 * That's an heuristic, since it's operating outside of parsing it could
302 * try to use a meta which had been commented out, that's the reason it
303 * should only be used in case of error, not as a default.
304 *
305 * Returns an encoding string or NULL if not found, the string need to
306 * be freed
307 */
308static xmlChar *
309htmlFindEncoding(xmlParserCtxtPtr ctxt) {
310 const xmlChar *start, *cur, *end;
311
312 if ((ctxt == NULL) || (ctxt->input == NULL) ||
313 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
314 (ctxt->input->buf->encoder != NULL))
315 return(NULL);
316 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
317 return(NULL);
318
319 start = ctxt->input->cur;
320 end = ctxt->input->end;
321 /* we also expect the input buffer to be zero terminated */
322 if (*end != 0)
323 return(NULL);
324
325 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
326 if (cur == NULL)
327 return(NULL);
328 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
329 if (cur == NULL)
330 return(NULL);
331 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
332 if (cur == NULL)
333 return(NULL);
334 cur += 8;
335 start = cur;
336 while (((*cur >= 'A') && (*cur <= 'Z')) ||
337 ((*cur >= 'a') && (*cur <= 'z')) ||
338 ((*cur >= '0') && (*cur <= '9')) ||
339 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
340 cur++;
341 if (cur == start)
342 return(NULL);
343 return(xmlStrndup(start, cur - start));
344}
345
346/**
Owen Taylor3473f882001-02-23 17:55:21 +0000347 * htmlCurrentChar:
348 * @ctxt: the HTML parser context
349 * @len: pointer to the length of the char read
350 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000351 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000352 * bytes in the input buffer. Implement the end of line normalization:
353 * 2.11 End-of-Line Handling
354 * If the encoding is unspecified, in the case we find an ISO-Latin-1
355 * char, then the encoding converter is plugged in automatically.
356 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000357 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000358 */
359
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000360static int
Owen Taylor3473f882001-02-23 17:55:21 +0000361htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
362 if (ctxt->instate == XML_PARSER_EOF)
363 return(0);
364
365 if (ctxt->token != 0) {
366 *len = 0;
367 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200368 }
Owen Taylor3473f882001-02-23 17:55:21 +0000369 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
370 /*
371 * We are supposed to handle UTF8, check it's valid
372 * From rfc2044: encoding of the Unicode values on UTF-8:
373 *
374 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
375 * 0000 0000-0000 007F 0xxxxxxx
376 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200377 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000378 *
379 * Check for the 0x110000 limit too
380 */
381 const unsigned char *cur = ctxt->input->cur;
382 unsigned char c;
383 unsigned int val;
384
385 c = *cur;
386 if (c & 0x80) {
387 if (cur[1] == 0)
388 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
389 if ((cur[1] & 0xc0) != 0x80)
390 goto encoding_error;
391 if ((c & 0xe0) == 0xe0) {
392
393 if (cur[2] == 0)
394 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
395 if ((cur[2] & 0xc0) != 0x80)
396 goto encoding_error;
397 if ((c & 0xf0) == 0xf0) {
398 if (cur[3] == 0)
399 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
400 if (((c & 0xf8) != 0xf0) ||
401 ((cur[3] & 0xc0) != 0x80))
402 goto encoding_error;
403 /* 4-byte code */
404 *len = 4;
405 val = (cur[0] & 0x7) << 18;
406 val |= (cur[1] & 0x3f) << 12;
407 val |= (cur[2] & 0x3f) << 6;
408 val |= cur[3] & 0x3f;
409 } else {
410 /* 3-byte code */
411 *len = 3;
412 val = (cur[0] & 0xf) << 12;
413 val |= (cur[1] & 0x3f) << 6;
414 val |= cur[2] & 0x3f;
415 }
416 } else {
417 /* 2-byte code */
418 *len = 2;
419 val = (cur[0] & 0x1f) << 6;
420 val |= cur[1] & 0x3f;
421 }
422 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000423 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
424 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200425 }
Owen Taylor3473f882001-02-23 17:55:21 +0000426 return(val);
427 } else {
Daniel Veillard856c6682009-08-24 18:16:56 +0200428 if ((*ctxt->input->cur == 0) &&
429 (ctxt->input->cur < ctxt->input->end)) {
430 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
431 "Char 0x%X out of allowed range\n", 0);
432 *len = 1;
433 return(' ');
434 }
Owen Taylor3473f882001-02-23 17:55:21 +0000435 /* 1-byte code */
436 *len = 1;
437 return((int) *ctxt->input->cur);
438 }
439 }
440 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000441 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000442 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000443 * XML constructs only use < 128 chars
444 */
445 *len = 1;
446 if ((int) *ctxt->input->cur < 0x80)
447 return((int) *ctxt->input->cur);
448
449 /*
450 * Humm this is bad, do an automatic flow conversion
451 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200452 {
453 xmlChar * guess;
454 xmlCharEncodingHandlerPtr handler;
455
456 guess = htmlFindEncoding(ctxt);
457 if (guess == NULL) {
458 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
459 } else {
460 if (ctxt->input->encoding != NULL)
461 xmlFree((xmlChar *) ctxt->input->encoding);
462 ctxt->input->encoding = guess;
463 handler = xmlFindCharEncodingHandler((const char *) guess);
464 if (handler != NULL) {
465 xmlSwitchToEncoding(ctxt, handler);
466 } else {
467 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
468 "Unsupported encoding %s", guess, NULL);
469 }
470 }
471 ctxt->charset = XML_CHAR_ENCODING_UTF8;
472 }
473
Owen Taylor3473f882001-02-23 17:55:21 +0000474 return(xmlCurrentChar(ctxt, len));
475
476encoding_error:
477 /*
478 * If we detect an UTF8 error that probably mean that the
479 * input encoding didn't get properly advertized in the
480 * declaration header. Report the error and switch the encoding
481 * to ISO-Latin-1 (if you don't like this policy, just declare the
482 * encoding !)
483 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000484 {
485 char buffer[150];
486
Daniel Veillard861101d2007-06-12 08:38:57 +0000487 if (ctxt->input->end - ctxt->input->cur >= 4) {
488 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
489 ctxt->input->cur[0], ctxt->input->cur[1],
490 ctxt->input->cur[2], ctxt->input->cur[3]);
491 } else {
492 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
493 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000494 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
495 "Input is not proper UTF-8, indicate encoding !\n",
496 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000497 }
498
Daniel Veillarde77db162009-08-22 11:32:38 +0200499 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000500 *len = 1;
501 return((int) *ctxt->input->cur);
502}
503
504/**
Owen Taylor3473f882001-02-23 17:55:21 +0000505 * htmlSkipBlankChars:
506 * @ctxt: the HTML parser context
507 *
508 * skip all blanks character found at that point in the input streams.
509 *
510 * Returns the number of space chars skipped
511 */
512
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000513static int
Owen Taylor3473f882001-02-23 17:55:21 +0000514htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
515 int res = 0;
516
William M. Brack76e95df2003-10-18 16:20:14 +0000517 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000518 if ((*ctxt->input->cur == 0) &&
519 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
520 xmlPopInput(ctxt);
521 } else {
522 if (*(ctxt->input->cur) == '\n') {
523 ctxt->input->line++; ctxt->input->col = 1;
524 } else ctxt->input->col++;
525 ctxt->input->cur++;
526 ctxt->nbChars++;
527 if (*ctxt->input->cur == 0)
528 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
529 }
530 res++;
531 }
532 return(res);
533}
534
535
536
537/************************************************************************
538 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200539 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000540 * *
541 ************************************************************************/
542
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000555
/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each list macro expands to a comma separated sequence of element names
 * and comes with an NB_xxx macro giving the number of names it holds.
 * PCDATA and MODIFIER expand to nothing, which is why no comma follows
 * them where they are used.
 * Sub-lists must be separated by commas: adjacent string literals would
 * otherwise be concatenated into a single bogus name.
 * The NB_xxx sums are parenthesized so they can be used safely inside
 * larger expressions.
 */

#define PCDATA
#define NB_PCDATA 0
#define MODIFIER
#define NB_MODIFIER 0
#define EMPTY NULL

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4

#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)


/* NULL terminated lists of the flow and inline elements */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
589
590
/*
 * ... and for HTML Attributes
 *
 * Same conventions as for the element lists: each macro expands to a
 * comma separated sequence of attribute names and NB_xxx is the number
 * of names it holds.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists of the common attribute groups */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000610
611
612/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000613static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000614 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
615 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000616static const char* const target_attr[] = { "target", NULL } ;
617static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
618static const char* const alt_attr[] = { "alt", NULL } ;
619static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
620static const char* const href_attrs[] = { "href", NULL } ;
621static const char* const clear_attrs[] = { "clear", NULL } ;
622static const char* const inline_p[] = { INLINE, "p", NULL } ;
623
624static const char* const flow_param[] = { FLOW, "param", NULL } ;
625static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000626 "archive", "alt", "name", "height", "width", "align",
627 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000628static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000629 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000630static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000631 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000632static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
633static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
634static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
635static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000636 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000637static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000638 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
639
640
Daniel Veillard065abe82006-07-03 08:55:04 +0000641static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
642static const char* const col_elt[] = { "col", NULL } ;
643static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
644static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
645static const char* const dl_contents[] = { "dt", "dd", NULL } ;
646static const char* const compact_attr[] = { "compact", NULL } ;
647static const char* const label_attr[] = { "label", NULL } ;
648static const char* const fieldset_contents[] = { FLOW, "legend" } ;
649static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
650static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
651static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
652static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
653static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
654static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
655static const char* const head_attrs[] = { I18N, "profile", NULL } ;
656static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
657static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
658static const char* const version_attr[] = { "version", NULL } ;
659static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
660static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
661static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000662static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000663static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
664static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
665static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
666static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
667static const char* const align_attr[] = { "align", NULL } ;
668static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
669static const char* const map_contents[] = { BLOCK, "area", NULL } ;
670static const char* const name_attr[] = { "name", NULL } ;
671static const char* const action_attr[] = { "action", NULL } ;
672static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
673static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
674static const char* const content_attr[] = { "content", NULL } ;
675static const char* const type_attr[] = { "type", NULL } ;
676static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
677static const char* const object_contents[] = { FLOW, "param", NULL } ;
678static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
679static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
680static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
681static const char* const option_elt[] = { "option", NULL } ;
682static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
683static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
684static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
685static const char* const width_attr[] = { "width", NULL } ;
686static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
687static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
688static const char* const language_attr[] = { "language", NULL } ;
689static const char* const select_content[] = { "optgroup", "option", NULL } ;
690static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
691static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200692static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000693static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
694static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
695static const char* const tr_elt[] = { "tr", NULL } ;
696static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
697static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
698static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
699static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
700static const char* const tr_contents[] = { "th", "td", NULL } ;
701static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
702static const char* const li_elt[] = { "li", NULL } ;
703static const char* const ul_depr[] = { "type", "compact", NULL} ;
704static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000705
706#define DECL (const char**)
707
Daniel Veillard22090732001-07-16 00:06:07 +0000708static const htmlElemDesc
709html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000710{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
711 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
712},
713{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
714 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
715},
716{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
717 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
718},
719{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
720 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
721},
722{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
723 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
724},
725{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
726 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
727},
728{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
729 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
730},
731{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
732 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
733},
734{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
735 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
736},
737{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
738 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
739},
740{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
741 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
742},
743{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
744 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
745},
746{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
747 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
748},
749{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
750 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
751},
752{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
753 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
754},
755{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
756 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
757},
758{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
759 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
760},
761{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
762 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
763},
764{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
765 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
766},
767{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
768 EMPTY , NULL , DECL col_attrs , NULL, NULL
769},
770{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
771 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
772},
773{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
774 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
775},
776{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
777 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
778},
779{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
780 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
781},
782{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
783 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
784},
785{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
786 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
787},
788{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000789 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000790},
791{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
792 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
793},
794{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
795 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
796},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000797{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000798 EMPTY, NULL, DECL embed_attrs, NULL, NULL
799},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000800{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
801 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
802},
803{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
804 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
805},
806{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
807 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
808},
809{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
810 EMPTY, NULL, NULL, DECL frame_attrs, NULL
811},
812{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
813 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
814},
815{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
816 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
817},
818{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
819 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
820},
821{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
822 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
823},
824{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
825 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
826},
827{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
828 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
829},
830{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
831 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
832},
833{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
834 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
835},
836{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
837 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
838},
839{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
840 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
841},
842{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
843 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
844},
845{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
846 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
847},
848{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000849 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000850},
851{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
852 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
853},
854{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
855 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
856},
857{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
858 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
859},
860{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
861 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
862},
863{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
864 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
865},
866{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
867 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
868},
869{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
870 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
871},
872{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
873 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
874},
875{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000876 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000877},
878{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
879 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
880},
881{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
882 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
883},
884{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
885 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
886},
887{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
888 DECL html_flow, "div", DECL html_attrs, NULL, NULL
889},
890{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
891 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
892},
893{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
894 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
895},
896{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000897 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000898},
899{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
900 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
901},
902{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
903 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
904},
905{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000906 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000907},
908{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
909 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
910},
911{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
912 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
913},
914{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
915 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
916},
917{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
918 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
919},
920{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
921 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
922},
923{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
924 DECL select_content, NULL, DECL select_attrs, NULL, NULL
925},
926{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
927 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
928},
929{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
930 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
931},
932{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
933 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
934},
935{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
936 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
937},
938{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
939 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
940},
941{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
942 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
943},
944{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
945 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
946},
947{ "table", 0, 0, 0, 0, 0, 0, 0, "",
948 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
949},
950{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
951 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
952},
953{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
954 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
955},
956{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
957 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
958},
959{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
960 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
961},
962{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
963 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
964},
965{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
966 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
967},
968{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
969 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
970},
971{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
972 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
973},
974{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
975 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
976},
977{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
978 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
979},
980{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
981 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
982},
983{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
984 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
985}
Owen Taylor3473f882001-02-23 17:55:21 +0000986};
987
988/*
Owen Taylor3473f882001-02-23 17:55:21 +0000989 * start tags that imply the end of current element
990 */
Daniel Veillard065abe82006-07-03 08:55:04 +0000991static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000992"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
993 "dl", "ul", "ol", "menu", "dir", "address", "pre",
994 "listing", "xmp", "head", NULL,
995"head", "p", NULL,
996"title", "p", NULL,
997"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +0000998"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +0000999"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1000 "pre", "listing", "xmp", "head", "li", NULL,
1001"hr", "p", "head", NULL,
1002"h1", "p", "head", NULL,
1003"h2", "p", "head", NULL,
1004"h3", "p", "head", NULL,
1005"h4", "p", "head", NULL,
1006"h5", "p", "head", NULL,
1007"h6", "p", "head", NULL,
1008"dir", "p", "head", NULL,
1009"address", "p", "head", "ul", NULL,
1010"pre", "p", "head", "ul", NULL,
1011"listing", "p", "head", NULL,
1012"xmp", "p", "head", NULL,
1013"blockquote", "p", "head", NULL,
1014"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1015 "xmp", "head", NULL,
1016"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1017 "head", "dd", NULL,
1018"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1019 "head", "dt", NULL,
1020"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1021 "listing", "xmp", NULL,
1022"ol", "p", "head", "ul", NULL,
1023"menu", "p", "head", "ul", NULL,
Daniel Veillard6339c1a2009-08-24 11:59:51 +02001024"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001025"div", "p", "head", NULL,
1026"noscript", "p", "head", NULL,
1027"center", "font", "b", "i", "p", "head", NULL,
1028"a", "a", NULL,
1029"caption", "p", NULL,
1030"colgroup", "caption", "colgroup", "col", "p", NULL,
1031"col", "caption", "col", "p", NULL,
1032"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1033 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001034"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Daniel Veillarde77db162009-08-22 11:32:38 +02001035"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001036"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1037"thead", "caption", "col", "colgroup", NULL,
1038"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1039 "tbody", "p", NULL,
1040"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1041 "tfoot", "tbody", "p", NULL,
1042"optgroup", "option", NULL,
1043"option", "option", NULL,
1044"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1045 "pre", "listing", "xmp", "a", NULL,
1046NULL
1047};
1048
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1061
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic events).  The array has no NULL terminator, so
 * users must bound their scan with sizeof.
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",  /* was misspelt "onrest", so FORM's onreset was never matched */
    "onchange",
    "onselect"
};
1087
/*
 * End-tag priorities, used when the parser meets a stray end tag in a
 * broken page.  An end tag may only close open elements whose priority is
 * lower than or equal to its own (see htmlAutoCloseOnClose()).
 */

/* One (element name, priority) pair. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Priority table, terminated by an entry whose name is NULL.  That last
 * entry also supplies the priority of every element not listed here.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001115
/* Pointers to the first name of each htmlStartClose group; filled by htmlInitAutoClose(). */
static const char **htmlStartCloseIndex[100];
/* Set to 1 once htmlInitAutoClose() has filled the index. */
static int htmlStartCloseIndexinitialized = 0;
1118
1119/************************************************************************
1120 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001121 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001122 * *
1123 ************************************************************************/
1124
1125/**
1126 * htmlInitAutoClose:
1127 *
1128 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1129 * This is not reentrant. Call xmlInitParser() once before processing in
1130 * case of use in multithreaded programs.
1131 */
1132void
1133htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001134 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001135
1136 if (htmlStartCloseIndexinitialized) return;
1137
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001138 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1139 indx = 0;
1140 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001141 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001142 while (htmlStartClose[i] != NULL) i++;
1143 i++;
1144 }
1145 htmlStartCloseIndexinitialized = 1;
1146}
1147
1148/**
1149 * htmlTagLookup:
1150 * @tag: The tag name in lowercase
1151 *
1152 * Lookup the HTML tag in the ElementTable
1153 *
1154 * Returns the related htmlElemDescPtr or NULL if not found.
1155 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001156const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001157htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001158 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001159
1160 for (i = 0; i < (sizeof(html40ElementTable) /
1161 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001162 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001163 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001164 }
1165 return(NULL);
1166}
1167
1168/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001169 * htmlGetEndPriority:
1170 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001171 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001172 * Return value: The "endtag" priority.
1173 **/
1174static int
1175htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001176 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001177
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001178 while ((htmlEndPriority[i].name != NULL) &&
1179 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1180 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001181
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001182 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001183}
1184
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001185
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001186/**
Owen Taylor3473f882001-02-23 17:55:21 +00001187 * htmlCheckAutoClose:
1188 * @newtag: The new tag name
1189 * @oldtag: The old tag name
1190 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001191 * Checks whether the new tag is one of the registered valid tags for
1192 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001193 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1194 *
1195 * Returns 0 if no, 1 if yes.
1196 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001197static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001198htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1199{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001200 int i, indx;
1201 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001202
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001203 if (htmlStartCloseIndexinitialized == 0)
1204 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001205
1206 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001207 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001208 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001209 if (closed == NULL)
1210 return (0);
1211 if (xmlStrEqual(BAD_CAST * closed, newtag))
1212 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001213 }
1214
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001215 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001216 i++;
1217 while (htmlStartClose[i] != NULL) {
1218 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001219 return (1);
1220 }
1221 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001222 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001223 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001224}
1225
1226/**
1227 * htmlAutoCloseOnClose:
1228 * @ctxt: an HTML parser context
1229 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001230 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001231 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001232 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001233 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001234static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001235htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1236{
1237 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001238 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001239
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001240 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001241
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001242 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001243
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001244 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1245 break;
1246 /*
1247 * A missplaced endtag can only close elements with lower
1248 * or equal priority, so if we find an element with higher
1249 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001250 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001251 */
1252 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1253 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001254 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001255 if (i < 0)
1256 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001257
1258 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001259 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001260 if ((info != NULL) && (info->endTag == 3)) {
1261 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1262 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001263 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001264 }
1265 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1266 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001267 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001268 }
1269}
1270
1271/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001272 * htmlAutoCloseOnEnd:
1273 * @ctxt: an HTML parser context
1274 *
1275 * Close all remaining tags at the end of the stream
1276 */
1277static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001278htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1279{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001280 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001281
William M. Brack899e64a2003-09-26 18:03:42 +00001282 if (ctxt->nameNr == 0)
1283 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001284 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001285 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1286 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001287 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001288 }
1289}
1290
1291/**
Owen Taylor3473f882001-02-23 17:55:21 +00001292 * htmlAutoClose:
1293 * @ctxt: an HTML parser context
1294 * @newtag: The new tag name or NULL
1295 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001296 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001297 * The list is kept in htmlStartClose array. This function is
1298 * called when a new tag has been detected and generates the
1299 * appropriates closes if possible/needed.
1300 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001301 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001302 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001303static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001304htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1305{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001306 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001307 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001308 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1309 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001310 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001311 }
1312 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001313 htmlAutoCloseOnEnd(ctxt);
1314 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001315 }
1316 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001317 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1318 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1319 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001320 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1321 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001322 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001323 }
Owen Taylor3473f882001-02-23 17:55:21 +00001324}
1325
1326/**
1327 * htmlAutoCloseTag:
1328 * @doc: the HTML document
1329 * @name: The tag name
1330 * @elem: the HTML element
1331 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001332 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001333 * The list is kept in htmlStartClose array. This function checks
1334 * if the element or one of it's children would autoclose the
1335 * given tag.
1336 *
1337 * Returns 1 if autoclose, 0 otherwise
1338 */
1339int
1340htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1341 htmlNodePtr child;
1342
1343 if (elem == NULL) return(1);
1344 if (xmlStrEqual(name, elem->name)) return(0);
1345 if (htmlCheckAutoClose(elem->name, name)) return(1);
1346 child = elem->children;
1347 while (child != NULL) {
1348 if (htmlAutoCloseTag(doc, name, child)) return(1);
1349 child = child->next;
1350 }
1351 return(0);
1352}
1353
1354/**
1355 * htmlIsAutoClosed:
1356 * @doc: the HTML document
1357 * @elem: the HTML element
1358 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001359 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001360 * The list is kept in htmlStartClose array. This function checks
1361 * if a tag is autoclosed by one of it's child
1362 *
1363 * Returns 1 if autoclosed, 0 otherwise
1364 */
1365int
1366htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1367 htmlNodePtr child;
1368
1369 if (elem == NULL) return(1);
1370 child = elem->children;
1371 while (child != NULL) {
1372 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1373 child = child->next;
1374 }
1375 return(0);
1376}
1377
1378/**
1379 * htmlCheckImplied:
1380 * @ctxt: an HTML parser context
1381 * @newtag: The new tag name
1382 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001383 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001384 * called when a new tag has been detected and generates the
1385 * appropriates implicit tags if missing
1386 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001387static void
Owen Taylor3473f882001-02-23 17:55:21 +00001388htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardb468f742009-08-24 18:45:33 +02001389 int i;
1390
Owen Taylor3473f882001-02-23 17:55:21 +00001391 if (!htmlOmittedDefaultValue)
1392 return;
1393 if (xmlStrEqual(newtag, BAD_CAST"html"))
1394 return;
1395 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001396 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001397 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1398 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1399 }
1400 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1401 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001402 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001403 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1404 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1405 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1406 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1407 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1408 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001409 if (ctxt->html >= 3) {
1410 /* we already saw or generated an <head> before */
1411 return;
1412 }
1413 /*
1414 * dropped OBJECT ... i you put it first BODY will be
1415 * assumed !
1416 */
1417 htmlnamePush(ctxt, BAD_CAST"head");
1418 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1419 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001420 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1421 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1422 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001423 if (ctxt->html >= 10) {
1424 /* we already saw or generated a <body> before */
1425 return;
1426 }
Owen Taylor3473f882001-02-23 17:55:21 +00001427 for (i = 0;i < ctxt->nameNr;i++) {
1428 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1429 return;
1430 }
1431 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1432 return;
1433 }
1434 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001435
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001436 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001437 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1438 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1439 }
1440}
1441
1442/**
1443 * htmlCheckParagraph
1444 * @ctxt: an HTML parser context
1445 *
1446 * Check whether a p element need to be implied before inserting
1447 * characters in the current element.
1448 *
1449 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1450 * in case of error.
1451 */
1452
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001453static int
Owen Taylor3473f882001-02-23 17:55:21 +00001454htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1455 const xmlChar *tag;
1456 int i;
1457
1458 if (ctxt == NULL)
1459 return(-1);
1460 tag = ctxt->name;
1461 if (tag == NULL) {
1462 htmlAutoClose(ctxt, BAD_CAST"p");
1463 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001464 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001465 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1466 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1467 return(1);
1468 }
1469 if (!htmlOmittedDefaultValue)
1470 return(0);
1471 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1472 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001473 htmlAutoClose(ctxt, BAD_CAST"p");
1474 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001475 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001476 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1477 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1478 return(1);
1479 }
1480 }
1481 return(0);
1482}
1483
1484/**
1485 * htmlIsScriptAttribute:
1486 * @name: an attribute name
1487 *
1488 * Check if an attribute is of content type Script
1489 *
1490 * Returns 1 is the attribute is a script 0 otherwise
1491 */
1492int
1493htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001494 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001495
1496 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001497 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001498 /*
1499 * all script attributes start with 'on'
1500 */
1501 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001502 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001503 for (i = 0;
1504 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1505 i++) {
1506 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1507 return(1);
1508 }
1509 return(0);
1510}
1511
1512/************************************************************************
1513 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001514 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001515 * *
1516 ************************************************************************/
1517
1518
Daniel Veillard22090732001-07-16 00:06:07 +00001519static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001520/*
1521 * the 4 absolute ones, plus apostrophe.
1522 */
1523{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1524{ 38, "amp", "ampersand, U+0026 ISOnum" },
1525{ 39, "apos", "single quote" },
1526{ 60, "lt", "less-than sign, U+003C ISOnum" },
1527{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1528
1529/*
1530 * A bunch still in the 128-255 range
1531 * Replacing them depend really on the charset used.
1532 */
1533{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1534{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1535{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1536{ 163, "pound","pound sign, U+00A3 ISOnum" },
1537{ 164, "curren","currency sign, U+00A4 ISOnum" },
1538{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1539{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1540{ 167, "sect", "section sign, U+00A7 ISOnum" },
1541{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1542{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1543{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1544{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1545{ 172, "not", "not sign, U+00AC ISOnum" },
1546{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1547{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1548{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1549{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1550{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1551{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1552{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1553{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1554{ 181, "micro","micro sign, U+00B5 ISOnum" },
1555{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1556{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1557{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1558{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1559{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1560{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1561{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1562{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1563{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1564{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1565{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1566{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1567{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1568{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1569{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1570{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1571{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1572{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1573{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1574{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1575{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1576{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1577{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1578{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1579{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1580{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1581{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1582{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1583{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1584{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1585{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1586{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1587{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1588{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1589{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1590{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1591{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1592{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1593{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1594{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1595{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1596{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1597{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1598{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1599{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1600{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1601{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1602{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1603{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1604{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1605{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1606{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1607{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1608{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1609{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1610{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1611{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1612{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1613{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1614{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1615{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1616{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1617{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1618{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1619{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1620{ 247, "divide","division sign, U+00F7 ISOnum" },
1621{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1622{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1623{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1624{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1625{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1626{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1627{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1628{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1629
1630{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1631{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1632{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1633{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1634{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1635
1636/*
1637 * Anything below should really be kept as entities references
1638 */
1639{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1640
1641{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1642{ 732, "tilde","small tilde, U+02DC ISOdia" },
1643
1644{ 913, "Alpha","greek capital letter alpha, U+0391" },
1645{ 914, "Beta", "greek capital letter beta, U+0392" },
1646{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1647{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1648{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1649{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1650{ 919, "Eta", "greek capital letter eta, U+0397" },
1651{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1652{ 921, "Iota", "greek capital letter iota, U+0399" },
1653{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001654{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001655{ 924, "Mu", "greek capital letter mu, U+039C" },
1656{ 925, "Nu", "greek capital letter nu, U+039D" },
1657{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1658{ 927, "Omicron","greek capital letter omicron, U+039F" },
1659{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1660{ 929, "Rho", "greek capital letter rho, U+03A1" },
1661{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1662{ 932, "Tau", "greek capital letter tau, U+03A4" },
1663{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1664{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1665{ 935, "Chi", "greek capital letter chi, U+03A7" },
1666{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1667{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1668
1669{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1670{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1671{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1672{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1673{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1674{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1675{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1676{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1677{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1678{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1679{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1680{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1681{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1682{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1683{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1684{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1685{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1686{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1687{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1688{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1689{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1690{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1691{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1692{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1693{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1694{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1695{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1696{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1697
1698{ 8194, "ensp", "en space, U+2002 ISOpub" },
1699{ 8195, "emsp", "em space, U+2003 ISOpub" },
1700{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1701{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1702{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1703{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1704{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1705{ 8211, "ndash","en dash, U+2013 ISOpub" },
1706{ 8212, "mdash","em dash, U+2014 ISOpub" },
1707{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1708{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1709{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1710{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1711{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1712{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1713{ 8224, "dagger","dagger, U+2020 ISOpub" },
1714{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1715
1716{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1717{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1718
1719{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1720
1721{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1722{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1723
1724{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1725{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1726
1727{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1728{ 8260, "frasl","fraction slash, U+2044 NEW" },
1729
1730{ 8364, "euro", "euro sign, U+20AC NEW" },
1731
1732{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1733{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1734{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1735{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1736{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1737{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1738{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1739{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1740{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1741{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1742{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1743{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1744{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1745{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1746{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1747{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1748
1749{ 8704, "forall","for all, U+2200 ISOtech" },
1750{ 8706, "part", "partial differential, U+2202 ISOtech" },
1751{ 8707, "exist","there exists, U+2203 ISOtech" },
1752{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1753{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1754{ 8712, "isin", "element of, U+2208 ISOtech" },
1755{ 8713, "notin","not an element of, U+2209 ISOtech" },
1756{ 8715, "ni", "contains as member, U+220B ISOtech" },
1757{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001758{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001759{ 8722, "minus","minus sign, U+2212 ISOtech" },
1760{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1761{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1762{ 8733, "prop", "proportional to, U+221D ISOtech" },
1763{ 8734, "infin","infinity, U+221E ISOtech" },
1764{ 8736, "ang", "angle, U+2220 ISOamso" },
1765{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1766{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1767{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1768{ 8746, "cup", "union = cup, U+222A ISOtech" },
1769{ 8747, "int", "integral, U+222B ISOtech" },
1770{ 8756, "there4","therefore, U+2234 ISOtech" },
1771{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1772{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1773{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1774{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1775{ 8801, "equiv","identical to, U+2261 ISOtech" },
1776{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1777{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1778{ 8834, "sub", "subset of, U+2282 ISOtech" },
1779{ 8835, "sup", "superset of, U+2283 ISOtech" },
1780{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1781{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1782{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1783{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1784{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1785{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1786{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1787{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1788{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1789{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1790{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1791{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1792{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1793{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1794
1795{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1796{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1797{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1798{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1799
1800};
1801
1802/************************************************************************
1803 * *
1804 * Commodity functions to handle entities *
1805 * *
1806 ************************************************************************/
1807
/*
 * growBuffer:
 *
 * Macro used to grow the current buffer: doubles <buffer>_size and
 * reallocates <buffer> to that many xmlChar.
 *
 * The macro expects a variable named ctxt in the scope where it is
 * expanded. When the reallocation fails it reports the error through
 * htmlErrMemory(), frees the old buffer and makes the ENCLOSING
 * function return NULL.
 *
 * The expansion is deliberately kept as a bare { } block so that
 * existing call sites keep compiling unchanged.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(*tmp)); \
    if (tmp != NULL) {                                                  \
        buffer = tmp;                                                   \
    } else {                                                            \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
}
1822
1823/**
1824 * htmlEntityLookup:
1825 * @name: the entity name
1826 *
1827 * Lookup the given entity in EntitiesTable
1828 *
1829 * TODO: the linear scan is really ugly, an hash table is really needed.
1830 *
1831 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1832 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001833const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001834htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001835 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001836
1837 for (i = 0;i < (sizeof(html40EntitiesTable)/
1838 sizeof(html40EntitiesTable[0]));i++) {
1839 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001840 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001841 }
1842 }
1843 return(NULL);
1844}
1845
1846/**
1847 * htmlEntityValueLookup:
1848 * @value: the entity's unicode value
1849 *
1850 * Lookup the given entity in EntitiesTable
1851 *
1852 * TODO: the linear scan is really ugly, an hash table is really needed.
1853 *
1854 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1855 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001856const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001857htmlEntityValueLookup(unsigned int value) {
1858 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001859
1860 for (i = 0;i < (sizeof(html40EntitiesTable)/
1861 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001862 if (html40EntitiesTable[i].value >= value) {
1863 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001864 break;
William M. Brack78637da2003-07-31 14:47:38 +00001865 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001866 }
Owen Taylor3473f882001-02-23 17:55:21 +00001867 }
1868 return(NULL);
1869}
1870
1871/**
1872 * UTF8ToHtml:
1873 * @out: a pointer to an array of bytes to store the result
1874 * @outlen: the length of @out
1875 * @in: a pointer to an array of UTF-8 chars
1876 * @inlen: the length of @in
1877 *
1878 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1879 * plus HTML entities block of chars out.
1880 *
1881 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1882 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001883 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001884 * The value of @outlen after return is the number of octets consumed.
1885 */
1886int
1887UTF8ToHtml(unsigned char* out, int *outlen,
1888 const unsigned char* in, int *inlen) {
1889 const unsigned char* processed = in;
1890 const unsigned char* outend;
1891 const unsigned char* outstart = out;
1892 const unsigned char* instart = in;
1893 const unsigned char* inend;
1894 unsigned int c, d;
1895 int trailing;
1896
Daniel Veillardce682bc2004-11-05 17:22:25 +00001897 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001898 if (in == NULL) {
1899 /*
1900 * initialization nothing to do
1901 */
1902 *outlen = 0;
1903 *inlen = 0;
1904 return(0);
1905 }
1906 inend = in + (*inlen);
1907 outend = out + (*outlen);
1908 while (in < inend) {
1909 d = *in++;
1910 if (d < 0x80) { c= d; trailing= 0; }
1911 else if (d < 0xC0) {
1912 /* trailing byte in leading position */
1913 *outlen = out - outstart;
1914 *inlen = processed - instart;
1915 return(-2);
1916 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1917 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1918 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1919 else {
1920 /* no chance for this in Ascii */
1921 *outlen = out - outstart;
1922 *inlen = processed - instart;
1923 return(-2);
1924 }
1925
1926 if (inend - in < trailing) {
1927 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02001928 }
Owen Taylor3473f882001-02-23 17:55:21 +00001929
1930 for ( ; trailing; trailing--) {
1931 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1932 break;
1933 c <<= 6;
1934 c |= d & 0x3F;
1935 }
1936
1937 /* assertion: c is a single UTF-4 value */
1938 if (c < 0x80) {
1939 if (out + 1 >= outend)
1940 break;
1941 *out++ = c;
1942 } else {
1943 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001944 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001945 const char *cp;
1946 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001947
1948 /*
1949 * Try to lookup a predefined HTML entity for it
1950 */
1951
1952 ent = htmlEntityValueLookup(c);
1953 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001954 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1955 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001956 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001957 else
1958 cp = ent->name;
1959 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001960 if (out + 2 + len >= outend)
1961 break;
1962 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001963 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001964 out += len;
1965 *out++ = ';';
1966 }
1967 processed = in;
1968 }
1969 *outlen = out - outstart;
1970 *inlen = processed - instart;
1971 return(0);
1972}
1973
1974/**
1975 * htmlEncodeEntities:
1976 * @out: a pointer to an array of bytes to store the result
1977 * @outlen: the length of @out
1978 * @in: a pointer to an array of UTF-8 chars
1979 * @inlen: the length of @in
1980 * @quoteChar: the quote character to escape (' or ") or zero.
1981 *
1982 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1983 * plus HTML entities block of chars out.
1984 *
1985 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1986 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001987 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001988 * The value of @outlen after return is the number of octets consumed.
1989 */
1990int
1991htmlEncodeEntities(unsigned char* out, int *outlen,
1992 const unsigned char* in, int *inlen, int quoteChar) {
1993 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001994 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001995 const unsigned char* outstart = out;
1996 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001997 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001998 unsigned int c, d;
1999 int trailing;
2000
Daniel Veillardce682bc2004-11-05 17:22:25 +00002001 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2002 return(-1);
2003 outend = out + (*outlen);
2004 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00002005 while (in < inend) {
2006 d = *in++;
2007 if (d < 0x80) { c= d; trailing= 0; }
2008 else if (d < 0xC0) {
2009 /* trailing byte in leading position */
2010 *outlen = out - outstart;
2011 *inlen = processed - instart;
2012 return(-2);
2013 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2014 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2015 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2016 else {
2017 /* no chance for this in Ascii */
2018 *outlen = out - outstart;
2019 *inlen = processed - instart;
2020 return(-2);
2021 }
2022
2023 if (inend - in < trailing)
2024 break;
2025
2026 while (trailing--) {
2027 if (((d= *in++) & 0xC0) != 0x80) {
2028 *outlen = out - outstart;
2029 *inlen = processed - instart;
2030 return(-2);
2031 }
2032 c <<= 6;
2033 c |= d & 0x3F;
2034 }
2035
2036 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002037 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2038 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002039 if (out >= outend)
2040 break;
2041 *out++ = c;
2042 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002043 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002044 const char *cp;
2045 char nbuf[16];
2046 int len;
2047
2048 /*
2049 * Try to lookup a predefined HTML entity for it
2050 */
2051 ent = htmlEntityValueLookup(c);
2052 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002053 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002054 cp = nbuf;
2055 }
2056 else
2057 cp = ent->name;
2058 len = strlen(cp);
2059 if (out + 2 + len > outend)
2060 break;
2061 *out++ = '&';
2062 memcpy(out, cp, len);
2063 out += len;
2064 *out++ = ';';
2065 }
2066 processed = in;
2067 }
2068 *outlen = out - outstart;
2069 *inlen = processed - instart;
2070 return(0);
2071}
2072
Owen Taylor3473f882001-02-23 17:55:21 +00002073/************************************************************************
2074 * *
2075 * Commodity functions to handle streams *
2076 * *
2077 ************************************************************************/
2078
2079/**
Owen Taylor3473f882001-02-23 17:55:21 +00002080 * htmlNewInputStream:
2081 * @ctxt: an HTML parser context
2082 *
2083 * Create a new input stream structure
2084 * Returns the new input stream or NULL
2085 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002086static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002087htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2088 htmlParserInputPtr input;
2089
2090 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2091 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002092 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002093 return(NULL);
2094 }
2095 memset(input, 0, sizeof(htmlParserInput));
2096 input->filename = NULL;
2097 input->directory = NULL;
2098 input->base = NULL;
2099 input->cur = NULL;
2100 input->buf = NULL;
2101 input->line = 1;
2102 input->col = 1;
2103 input->buf = NULL;
2104 input->free = NULL;
2105 input->version = NULL;
2106 input->consumed = 0;
2107 input->length = 0;
2108 return(input);
2109}
2110
2111
2112/************************************************************************
2113 * *
2114 * Commodity functions, cleanup needed ? *
2115 * *
2116 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * The table is read-only: both the strings and the array of pointers
 * are const.
 */
static const char *const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002131
2132/**
2133 * areBlanks:
2134 * @ctxt: an HTML parser context
2135 * @str: a xmlChar *
2136 * @len: the size of @str
2137 *
2138 * Is this a sequence of blank chars that one can ignore ?
2139 *
2140 * Returns 1 if ignorable 0 otherwise.
2141 */
2142
2143static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002144 unsigned int i;
2145 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002146 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002147 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002148
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002149 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002150 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002151
2152 if (CUR == 0) return(1);
2153 if (CUR != '<') return(0);
2154 if (ctxt->name == NULL)
2155 return(1);
2156 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2157 return(1);
2158 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2159 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002160
2161 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2162 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2163 dtd = xmlGetIntSubset(ctxt->myDoc);
2164 if (dtd != NULL && dtd->ExternalID != NULL) {
2165 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2166 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2167 return(1);
2168 }
2169 }
2170
Owen Taylor3473f882001-02-23 17:55:21 +00002171 if (ctxt->node == NULL) return(0);
2172 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002173 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2174 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002175 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002176 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2177 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002178 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002179 for all tags "b" allowing PCDATA */
2180 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2181 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2182 return(0);
2183 }
2184 }
Owen Taylor3473f882001-02-23 17:55:21 +00002185 } else if (xmlNodeIsText(lastChild)) {
2186 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002187 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002188 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002189 for all tags "p" allowing PCDATA */
2190 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2191 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2192 return(0);
2193 }
2194 }
Owen Taylor3473f882001-02-23 17:55:21 +00002195 }
2196 return(1);
2197}
2198
2199/**
Owen Taylor3473f882001-02-23 17:55:21 +00002200 * htmlNewDocNoDtD:
2201 * @URI: URI for the dtd, or NULL
2202 * @ExternalID: the external ID of the DTD, or NULL
2203 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002204 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2205 * are NULL
2206 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002207 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002208 */
2209htmlDocPtr
2210htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2211 xmlDocPtr cur;
2212
2213 /*
2214 * Allocate a new document and fill the fields.
2215 */
2216 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2217 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002218 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002219 return(NULL);
2220 }
2221 memset(cur, 0, sizeof(xmlDoc));
2222
2223 cur->type = XML_HTML_DOCUMENT_NODE;
2224 cur->version = NULL;
2225 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002226 cur->doc = cur;
2227 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002228 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002229 cur->extSubset = NULL;
2230 cur->oldNs = NULL;
2231 cur->encoding = NULL;
2232 cur->standalone = 1;
2233 cur->compression = 0;
2234 cur->ids = NULL;
2235 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002236 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002237 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002238 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002239 if ((ExternalID != NULL) ||
2240 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002241 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002242 return(cur);
2243}
2244
2245/**
2246 * htmlNewDoc:
2247 * @URI: URI for the dtd, or NULL
2248 * @ExternalID: the external ID of the DTD, or NULL
2249 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002250 * Creates a new HTML document
2251 *
Owen Taylor3473f882001-02-23 17:55:21 +00002252 * Returns a new document
2253 */
2254htmlDocPtr
2255htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2256 if ((URI == NULL) && (ExternalID == NULL))
2257 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002258 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2259 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002260
2261 return(htmlNewDocNoDtD(URI, ExternalID));
2262}
2263
2264
2265/************************************************************************
2266 * *
2267 * The parser itself *
2268 * Relates to http://www.w3.org/TR/html40 *
2269 * *
2270 ************************************************************************/
2271
2272/************************************************************************
2273 * *
2274 * The parser itself *
2275 * *
2276 ************************************************************************/
2277
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002278static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002279
Owen Taylor3473f882001-02-23 17:55:21 +00002280/**
2281 * htmlParseHTMLName:
2282 * @ctxt: an HTML parser context
2283 *
2284 * parse an HTML tag or attribute name, note that we convert it to lowercase
2285 * since HTML names are not case-sensitive.
2286 *
2287 * Returns the Tag Name parsed or NULL
2288 */
2289
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002290static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002291htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002292 int i = 0;
2293 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2294
William M. Brackd1757ab2004-10-02 22:07:48 +00002295 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002296 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002297
2298 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002299 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002300 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2301 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002302 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2303 else loc[i] = CUR;
2304 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002305
Owen Taylor3473f882001-02-23 17:55:21 +00002306 NEXT;
2307 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002308
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002309 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002310}
2311
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002312
2313/**
2314 * htmlParseHTMLName_nonInvasive:
2315 * @ctxt: an HTML parser context
2316 *
2317 * parse an HTML tag or attribute name, note that we convert it to lowercase
2318 * since HTML names are not case-sensitive, this doesn't consume the data
2319 * from the stream, it's a look-ahead
2320 *
2321 * Returns the Tag Name parsed or NULL
2322 */
2323
2324static const xmlChar *
2325htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2326 int i = 0;
2327 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2328
2329 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2330 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002331
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002332 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2333 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2334 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2335 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2336 else loc[i] = NXT(1+i);
2337 i++;
2338 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002339
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002340 return(xmlDictLookup(ctxt->dict, loc, i));
2341}
2342
2343
Owen Taylor3473f882001-02-23 17:55:21 +00002344/**
2345 * htmlParseName:
2346 * @ctxt: an HTML parser context
2347 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002348 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002349 *
2350 * Returns the Name parsed or NULL
2351 */
2352
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002353static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002354htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002355 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002356 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002357 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002358
2359 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002360
2361 /*
2362 * Accelerator for simple ASCII names
2363 */
2364 in = ctxt->input->cur;
2365 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2366 ((*in >= 0x41) && (*in <= 0x5A)) ||
2367 (*in == '_') || (*in == ':')) {
2368 in++;
2369 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2370 ((*in >= 0x41) && (*in <= 0x5A)) ||
2371 ((*in >= 0x30) && (*in <= 0x39)) ||
2372 (*in == '_') || (*in == '-') ||
2373 (*in == ':') || (*in == '.'))
2374 in++;
2375 if ((*in > 0) && (*in < 0x80)) {
2376 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002377 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002378 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002379 ctxt->nbChars += count;
2380 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002381 return(ret);
2382 }
2383 }
2384 return(htmlParseNameComplex(ctxt));
2385}
2386
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002387static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002388htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002389 int len = 0, l;
2390 int c;
2391 int count = 0;
2392
2393 /*
2394 * Handler for more complex cases
2395 */
2396 GROW;
2397 c = CUR_CHAR(l);
2398 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2399 (!IS_LETTER(c) && (c != '_') &&
2400 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002401 return(NULL);
2402 }
2403
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002404 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2405 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2406 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002407 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002408 (IS_COMBINING(c)) ||
2409 (IS_EXTENDER(c)))) {
2410 if (count++ > 100) {
2411 count = 0;
2412 GROW;
2413 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002414 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002415 NEXTL(l);
2416 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002417 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002418 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002419}
2420
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002421
Owen Taylor3473f882001-02-23 17:55:21 +00002422/**
2423 * htmlParseHTMLAttribute:
2424 * @ctxt: an HTML parser context
2425 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002426 *
Owen Taylor3473f882001-02-23 17:55:21 +00002427 * parse an HTML attribute value till the stop (quote), if
2428 * stop is 0 then it stops at the first space
2429 *
2430 * Returns the attribute parsed or NULL
2431 */
2432
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002433static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002434htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2435 xmlChar *buffer = NULL;
2436 int buffer_size = 0;
2437 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002438 const xmlChar *name = NULL;
2439 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002440 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002441
2442 /*
2443 * allocate a translation buffer.
2444 */
2445 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002446 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002447 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002448 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002449 return(NULL);
2450 }
2451 out = buffer;
2452
2453 /*
2454 * Ok loop until we reach one of the ending chars
2455 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002456 while ((CUR != 0) && (CUR != stop)) {
2457 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002458 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002459 if (CUR == '&') {
2460 if (NXT(1) == '#') {
2461 unsigned int c;
2462 int bits;
2463
2464 c = htmlParseCharRef(ctxt);
2465 if (c < 0x80)
2466 { *out++ = c; bits= -6; }
2467 else if (c < 0x800)
2468 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2469 else if (c < 0x10000)
2470 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002471 else
Owen Taylor3473f882001-02-23 17:55:21 +00002472 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002473
Owen Taylor3473f882001-02-23 17:55:21 +00002474 for ( ; bits >= 0; bits-= 6) {
2475 *out++ = ((c >> bits) & 0x3F) | 0x80;
2476 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002477
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002478 if (out - buffer > buffer_size - 100) {
2479 int indx = out - buffer;
2480
2481 growBuffer(buffer);
2482 out = &buffer[indx];
2483 }
Owen Taylor3473f882001-02-23 17:55:21 +00002484 } else {
2485 ent = htmlParseEntityRef(ctxt, &name);
2486 if (name == NULL) {
2487 *out++ = '&';
2488 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002489 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002490
2491 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002492 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002493 }
2494 } else if (ent == NULL) {
2495 *out++ = '&';
2496 cur = name;
2497 while (*cur != 0) {
2498 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002499 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002500
2501 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002502 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002503 }
2504 *out++ = *cur++;
2505 }
Owen Taylor3473f882001-02-23 17:55:21 +00002506 } else {
2507 unsigned int c;
2508 int bits;
2509
2510 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002511 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002512
2513 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002514 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002515 }
Daniel Veillard48519092006-10-17 15:56:35 +00002516 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002517 if (c < 0x80)
2518 { *out++ = c; bits= -6; }
2519 else if (c < 0x800)
2520 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2521 else if (c < 0x10000)
2522 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002523 else
Owen Taylor3473f882001-02-23 17:55:21 +00002524 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002525
Owen Taylor3473f882001-02-23 17:55:21 +00002526 for ( ; bits >= 0; bits-= 6) {
2527 *out++ = ((c >> bits) & 0x3F) | 0x80;
2528 }
Owen Taylor3473f882001-02-23 17:55:21 +00002529 }
2530 }
2531 } else {
2532 unsigned int c;
2533 int bits, l;
2534
2535 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002536 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002537
2538 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002539 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002540 }
2541 c = CUR_CHAR(l);
2542 if (c < 0x80)
2543 { *out++ = c; bits= -6; }
2544 else if (c < 0x800)
2545 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2546 else if (c < 0x10000)
2547 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002548 else
Owen Taylor3473f882001-02-23 17:55:21 +00002549 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002550
Owen Taylor3473f882001-02-23 17:55:21 +00002551 for ( ; bits >= 0; bits-= 6) {
2552 *out++ = ((c >> bits) & 0x3F) | 0x80;
2553 }
2554 NEXT;
2555 }
2556 }
2557 *out++ = 0;
2558 return(buffer);
2559}
2560
2561/**
Owen Taylor3473f882001-02-23 17:55:21 +00002562 * htmlParseEntityRef:
2563 * @ctxt: an HTML parser context
2564 * @str: location to store the entity name
2565 *
2566 * parse an HTML ENTITY references
2567 *
2568 * [68] EntityRef ::= '&' Name ';'
2569 *
2570 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2571 * if non-NULL *str will have to be freed by the caller.
2572 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002573const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002574htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2575 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002576 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002577
2578 if (str != NULL) *str = NULL;
2579 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002580
2581 if (CUR == '&') {
2582 NEXT;
2583 name = htmlParseName(ctxt);
2584 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002585 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2586 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002587 } else {
2588 GROW;
2589 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002590 if (str != NULL)
2591 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002592
2593 /*
2594 * Lookup the entity in the table.
2595 */
2596 ent = htmlEntityLookup(name);
2597 if (ent != NULL) /* OK that's ugly !!! */
2598 NEXT;
2599 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002600 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2601 "htmlParseEntityRef: expecting ';'\n",
2602 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002603 if (str != NULL)
2604 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002605 }
2606 }
2607 }
2608 return(ent);
2609}
2610
2611/**
2612 * htmlParseAttValue:
2613 * @ctxt: an HTML parser context
2614 *
2615 * parse a value for an attribute
2616 * Note: the parser won't do substitution of entities here, this
2617 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002618 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002619 *
2620 * Returns the AttValue parsed or NULL.
2621 */
2622
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002623static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002624htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2625 xmlChar *ret = NULL;
2626
2627 if (CUR == '"') {
2628 NEXT;
2629 ret = htmlParseHTMLAttribute(ctxt, '"');
2630 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002631 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2632 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002633 } else
2634 NEXT;
2635 } else if (CUR == '\'') {
2636 NEXT;
2637 ret = htmlParseHTMLAttribute(ctxt, '\'');
2638 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002639 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2640 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002641 } else
2642 NEXT;
2643 } else {
2644 /*
2645 * That's an HTMLism, the attribute value may not be quoted
2646 */
2647 ret = htmlParseHTMLAttribute(ctxt, 0);
2648 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002649 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2650 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002651 }
2652 }
2653 return(ret);
2654}
2655
2656/**
2657 * htmlParseSystemLiteral:
2658 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002659 *
Owen Taylor3473f882001-02-23 17:55:21 +00002660 * parse an HTML Literal
2661 *
2662 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2663 *
2664 * Returns the SystemLiteral parsed or NULL
2665 */
2666
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002667static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002668htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2669 const xmlChar *q;
2670 xmlChar *ret = NULL;
2671
2672 if (CUR == '"') {
2673 NEXT;
2674 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002675 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002676 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002677 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002678 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2679 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002680 } else {
2681 ret = xmlStrndup(q, CUR_PTR - q);
2682 NEXT;
2683 }
2684 } else if (CUR == '\'') {
2685 NEXT;
2686 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002687 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002688 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002689 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002690 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2691 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002692 } else {
2693 ret = xmlStrndup(q, CUR_PTR - q);
2694 NEXT;
2695 }
2696 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002697 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2698 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002699 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002700
Owen Taylor3473f882001-02-23 17:55:21 +00002701 return(ret);
2702}
2703
2704/**
2705 * htmlParsePubidLiteral:
2706 * @ctxt: an HTML parser context
2707 *
2708 * parse an HTML public literal
2709 *
2710 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2711 *
2712 * Returns the PubidLiteral parsed or NULL.
2713 */
2714
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002715static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002716htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2717 const xmlChar *q;
2718 xmlChar *ret = NULL;
2719 /*
2720 * Name ::= (Letter | '_') (NameChar)*
2721 */
2722 if (CUR == '"') {
2723 NEXT;
2724 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002725 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002726 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002727 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2728 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002729 } else {
2730 ret = xmlStrndup(q, CUR_PTR - q);
2731 NEXT;
2732 }
2733 } else if (CUR == '\'') {
2734 NEXT;
2735 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002736 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002737 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002738 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002739 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2740 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002741 } else {
2742 ret = xmlStrndup(q, CUR_PTR - q);
2743 NEXT;
2744 }
2745 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002746 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2747 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002748 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002749
Owen Taylor3473f882001-02-23 17:55:21 +00002750 return(ret);
2751}
2752
2753/**
2754 * htmlParseScript:
2755 * @ctxt: an HTML parser context
2756 *
2757 * parse the content of an HTML SCRIPT or STYLE element
2758 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2759 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2760 * http://www.w3.org/TR/html4/types.html#type-script
2761 * http://www.w3.org/TR/html4/types.html#h-6.15
2762 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2763 *
2764 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2765 * element and the value of intrinsic event attributes. User agents must
2766 * not evaluate script data as HTML markup but instead must pass it on as
2767 * data to a script engine.
2768 * NOTES:
2769 * - The content is passed like CDATA
2770 * - the attributes for style and scripting "onXXX" are also described
2771 * as CDATA but SGML allows entities references in attributes so their
2772 * processing is identical as other attributes
2773 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002774static void
Owen Taylor3473f882001-02-23 17:55:21 +00002775htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002776 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002777 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002778 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002779
2780 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002781 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002782 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002783 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002784 /*
2785 * One should break here, the specification is clear:
2786 * Authors should therefore escape "</" within the content.
2787 * Escape mechanisms are specific to each scripting or
2788 * style sheet language.
2789 *
2790 * In recovery mode, only break if end tag match the
2791 * current tag, effectively ignoring all tags inside the
2792 * script/style block and treating the entire block as
2793 * CDATA.
2794 */
2795 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002796 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2797 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002798 {
2799 break; /* while */
2800 } else {
2801 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002802 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002803 ctxt->name, NULL);
2804 }
2805 } else {
2806 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002807 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002808 {
2809 break; /* while */
2810 }
2811 }
Owen Taylor3473f882001-02-23 17:55:21 +00002812 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002813 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002814 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2815 if (ctxt->sax->cdataBlock!= NULL) {
2816 /*
2817 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2818 */
2819 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002820 } else if (ctxt->sax->characters != NULL) {
2821 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002822 }
2823 nbchar = 0;
2824 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002825 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002826 NEXTL(l);
2827 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002828 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002829
Daniel Veillard68716a72006-10-16 09:32:17 +00002830 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002831 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2832 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002833 NEXT;
2834 }
2835
2836 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2837 if (ctxt->sax->cdataBlock!= NULL) {
2838 /*
2839 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2840 */
2841 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002842 } else if (ctxt->sax->characters != NULL) {
2843 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002844 }
2845 }
2846}
2847
2848
2849/**
2850 * htmlParseCharData:
2851 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002852 *
2853 * parse a CharData section.
2854 * if we are within a CDATA section ']]>' marks an end of section.
2855 *
2856 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2857 */
2858
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002859static void
2860htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002861 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2862 int nbchar = 0;
2863 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002864 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002865
2866 SHRINK;
2867 cur = CUR_CHAR(l);
2868 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002869 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002870 (cur != 0)) {
2871 if (!(IS_CHAR(cur))) {
2872 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2873 "Invalid char in CDATA 0x%X\n", cur);
2874 } else {
2875 COPY_BUF(l,buf,nbchar,cur);
2876 }
Owen Taylor3473f882001-02-23 17:55:21 +00002877 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2878 /*
2879 * Ok the segment is to be consumed as chars.
2880 */
2881 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2882 if (areBlanks(ctxt, buf, nbchar)) {
2883 if (ctxt->sax->ignorableWhitespace != NULL)
2884 ctxt->sax->ignorableWhitespace(ctxt->userData,
2885 buf, nbchar);
2886 } else {
2887 htmlCheckParagraph(ctxt);
2888 if (ctxt->sax->characters != NULL)
2889 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2890 }
2891 }
2892 nbchar = 0;
2893 }
2894 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002895 chunk++;
2896 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2897 chunk = 0;
2898 SHRINK;
2899 GROW;
2900 }
Owen Taylor3473f882001-02-23 17:55:21 +00002901 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002902 if (cur == 0) {
2903 SHRINK;
2904 GROW;
2905 cur = CUR_CHAR(l);
2906 }
Owen Taylor3473f882001-02-23 17:55:21 +00002907 }
2908 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002909 buf[nbchar] = 0;
2910
Owen Taylor3473f882001-02-23 17:55:21 +00002911 /*
2912 * Ok the segment is to be consumed as chars.
2913 */
2914 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2915 if (areBlanks(ctxt, buf, nbchar)) {
2916 if (ctxt->sax->ignorableWhitespace != NULL)
2917 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2918 } else {
2919 htmlCheckParagraph(ctxt);
2920 if (ctxt->sax->characters != NULL)
2921 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2922 }
2923 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002924 } else {
2925 /*
2926 * Loop detection
2927 */
2928 if (cur == 0)
2929 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002930 }
2931}
2932
2933/**
2934 * htmlParseExternalID:
2935 * @ctxt: an HTML parser context
2936 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002937 *
2938 * Parse an External ID or a Public ID
2939 *
Owen Taylor3473f882001-02-23 17:55:21 +00002940 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2941 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2942 *
2943 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2944 *
2945 * Returns the function returns SystemLiteral and in the second
2946 * case publicID receives PubidLiteral, is strict is off
2947 * it is possible to return NULL and have publicID set.
2948 */
2949
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002950static xmlChar *
2951htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002952 xmlChar *URI = NULL;
2953
2954 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2955 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2956 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2957 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002958 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002959 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2960 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002961 }
2962 SKIP_BLANKS;
2963 URI = htmlParseSystemLiteral(ctxt);
2964 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002965 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2966 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002967 }
2968 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2969 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2970 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2971 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002972 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002973 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2974 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002975 }
2976 SKIP_BLANKS;
2977 *publicID = htmlParsePubidLiteral(ctxt);
2978 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002979 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2980 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2981 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002982 }
2983 SKIP_BLANKS;
2984 if ((CUR == '"') || (CUR == '\'')) {
2985 URI = htmlParseSystemLiteral(ctxt);
2986 }
2987 }
2988 return(URI);
2989}
2990
2991/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002992 * xmlParsePI:
2993 * @ctxt: an XML parser context
2994 *
2995 * parse an XML Processing Instruction.
2996 *
2997 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2998 */
2999static void
3000htmlParsePI(htmlParserCtxtPtr ctxt) {
3001 xmlChar *buf = NULL;
3002 int len = 0;
3003 int size = HTML_PARSER_BUFFER_SIZE;
3004 int cur, l;
3005 const xmlChar *target;
3006 xmlParserInputState state;
3007 int count = 0;
3008
3009 if ((RAW == '<') && (NXT(1) == '?')) {
3010 state = ctxt->instate;
3011 ctxt->instate = XML_PARSER_PI;
3012 /*
3013 * this is a Processing Instruction.
3014 */
3015 SKIP(2);
3016 SHRINK;
3017
3018 /*
3019 * Parse the target name and check for special support like
3020 * namespace.
3021 */
3022 target = htmlParseName(ctxt);
3023 if (target != NULL) {
3024 if (RAW == '>') {
3025 SKIP(1);
3026
3027 /*
3028 * SAX: PI detected.
3029 */
3030 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3031 (ctxt->sax->processingInstruction != NULL))
3032 ctxt->sax->processingInstruction(ctxt->userData,
3033 target, NULL);
3034 ctxt->instate = state;
3035 return;
3036 }
3037 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3038 if (buf == NULL) {
3039 htmlErrMemory(ctxt, NULL);
3040 ctxt->instate = state;
3041 return;
3042 }
3043 cur = CUR;
3044 if (!IS_BLANK(cur)) {
3045 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3046 "ParsePI: PI %s space expected\n", target, NULL);
3047 }
3048 SKIP_BLANKS;
3049 cur = CUR_CHAR(l);
3050 while (IS_CHAR(cur) && (cur != '>')) {
3051 if (len + 5 >= size) {
3052 xmlChar *tmp;
3053
3054 size *= 2;
3055 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3056 if (tmp == NULL) {
3057 htmlErrMemory(ctxt, NULL);
3058 xmlFree(buf);
3059 ctxt->instate = state;
3060 return;
3061 }
3062 buf = tmp;
3063 }
3064 count++;
3065 if (count > 50) {
3066 GROW;
3067 count = 0;
3068 }
3069 COPY_BUF(l,buf,len,cur);
3070 NEXTL(l);
3071 cur = CUR_CHAR(l);
3072 if (cur == 0) {
3073 SHRINK;
3074 GROW;
3075 cur = CUR_CHAR(l);
3076 }
3077 }
3078 buf[len] = 0;
3079 if (cur != '>') {
3080 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3081 "ParsePI: PI %s never end ...\n", target, NULL);
3082 } else {
3083 SKIP(1);
3084
3085 /*
3086 * SAX: PI detected.
3087 */
3088 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3089 (ctxt->sax->processingInstruction != NULL))
3090 ctxt->sax->processingInstruction(ctxt->userData,
3091 target, buf);
3092 }
3093 xmlFree(buf);
3094 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003095 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003096 "PI is not started correctly", NULL, NULL);
3097 }
3098 ctxt->instate = state;
3099 }
3100}
3101
3102/**
Owen Taylor3473f882001-02-23 17:55:21 +00003103 * htmlParseComment:
3104 * @ctxt: an HTML parser context
3105 *
3106 * Parse an XML (SGML) comment <!-- .... -->
3107 *
3108 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3109 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003110static void
Owen Taylor3473f882001-02-23 17:55:21 +00003111htmlParseComment(htmlParserCtxtPtr ctxt) {
3112 xmlChar *buf = NULL;
3113 int len;
3114 int size = HTML_PARSER_BUFFER_SIZE;
3115 int q, ql;
3116 int r, rl;
3117 int cur, l;
3118 xmlParserInputState state;
3119
3120 /*
3121 * Check that there is a comment right here.
3122 */
3123 if ((RAW != '<') || (NXT(1) != '!') ||
3124 (NXT(2) != '-') || (NXT(3) != '-')) return;
3125
3126 state = ctxt->instate;
3127 ctxt->instate = XML_PARSER_COMMENT;
3128 SHRINK;
3129 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003130 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003131 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003132 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003133 ctxt->instate = state;
3134 return;
3135 }
3136 q = CUR_CHAR(ql);
3137 NEXTL(ql);
3138 r = CUR_CHAR(rl);
3139 NEXTL(rl);
3140 cur = CUR_CHAR(l);
3141 len = 0;
3142 while (IS_CHAR(cur) &&
3143 ((cur != '>') ||
3144 (r != '-') || (q != '-'))) {
3145 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003146 xmlChar *tmp;
3147
Owen Taylor3473f882001-02-23 17:55:21 +00003148 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003149 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3150 if (tmp == NULL) {
3151 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003152 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003153 ctxt->instate = state;
3154 return;
3155 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003156 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003157 }
3158 COPY_BUF(ql,buf,len,q);
3159 q = r;
3160 ql = rl;
3161 r = cur;
3162 rl = l;
3163 NEXTL(l);
3164 cur = CUR_CHAR(l);
3165 if (cur == 0) {
3166 SHRINK;
3167 GROW;
3168 cur = CUR_CHAR(l);
3169 }
3170 }
3171 buf[len] = 0;
3172 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003173 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3174 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003175 xmlFree(buf);
3176 } else {
3177 NEXT;
3178 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3179 (!ctxt->disableSAX))
3180 ctxt->sax->comment(ctxt->userData, buf);
3181 xmlFree(buf);
3182 }
3183 ctxt->instate = state;
3184}
3185
3186/**
3187 * htmlParseCharRef:
3188 * @ctxt: an HTML parser context
3189 *
3190 * parse Reference declarations
3191 *
3192 * [66] CharRef ::= '&#' [0-9]+ ';' |
3193 * '&#x' [0-9a-fA-F]+ ';'
3194 *
3195 * Returns the value parsed (as an int)
3196 */
3197int
3198htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3199 int val = 0;
3200
Daniel Veillarda03e3652004-11-02 18:45:30 +00003201 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3202 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3203 "htmlParseCharRef: context error\n",
3204 NULL, NULL);
3205 return(0);
3206 }
Owen Taylor3473f882001-02-23 17:55:21 +00003207 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003208 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003209 SKIP(3);
3210 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003211 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003212 val = val * 16 + (CUR - '0');
3213 else if ((CUR >= 'a') && (CUR <= 'f'))
3214 val = val * 16 + (CUR - 'a') + 10;
3215 else if ((CUR >= 'A') && (CUR <= 'F'))
3216 val = val * 16 + (CUR - 'A') + 10;
3217 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003218 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003219 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003220 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003221 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003222 }
3223 NEXT;
3224 }
3225 if (CUR == ';')
3226 NEXT;
3227 } else if ((CUR == '&') && (NXT(1) == '#')) {
3228 SKIP(2);
3229 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003230 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003231 val = val * 10 + (CUR - '0');
3232 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003233 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003234 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003235 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003236 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003237 }
3238 NEXT;
3239 }
3240 if (CUR == ';')
3241 NEXT;
3242 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003243 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3244 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003245 }
3246 /*
3247 * Check the value IS_CHAR ...
3248 */
3249 if (IS_CHAR(val)) {
3250 return(val);
3251 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003252 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3253 "htmlParseCharRef: invalid xmlChar value %d\n",
3254 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003255 }
3256 return(0);
3257}
3258
3259
3260/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003261 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003262 * @ctxt: an HTML parser context
3263 *
3264 * parse a DOCTYPE declaration
3265 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003266 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003267 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3268 */
3269
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003270static void
Owen Taylor3473f882001-02-23 17:55:21 +00003271htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003272 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003273 xmlChar *ExternalID = NULL;
3274 xmlChar *URI = NULL;
3275
3276 /*
3277 * We know that '<!DOCTYPE' has been detected.
3278 */
3279 SKIP(9);
3280
3281 SKIP_BLANKS;
3282
3283 /*
3284 * Parse the DOCTYPE name.
3285 */
3286 name = htmlParseName(ctxt);
3287 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003288 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3289 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3290 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003291 }
3292 /*
3293 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3294 */
3295
3296 SKIP_BLANKS;
3297
3298 /*
3299 * Check for SystemID and ExternalID
3300 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003301 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003302 SKIP_BLANKS;
3303
3304 /*
3305 * We should be at the end of the DOCTYPE declaration.
3306 */
3307 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003308 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3309 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003310 /* We shouldn't try to resynchronize ... */
3311 }
3312 NEXT;
3313
3314 /*
3315 * Create or update the document accordingly to the DOCTYPE
3316 */
3317 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3318 (!ctxt->disableSAX))
3319 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3320
3321 /*
3322 * Cleanup, since we don't use all those identifiers
3323 */
3324 if (URI != NULL) xmlFree(URI);
3325 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003326}
3327
3328/**
3329 * htmlParseAttribute:
3330 * @ctxt: an HTML parser context
3331 * @value: a xmlChar ** used to store the value of the attribute
3332 *
3333 * parse an attribute
3334 *
3335 * [41] Attribute ::= Name Eq AttValue
3336 *
3337 * [25] Eq ::= S? '=' S?
3338 *
3339 * With namespace:
3340 *
3341 * [NS 11] Attribute ::= QName Eq AttValue
3342 *
3343 * Also the case QName == xmlns:??? is handled independently as a namespace
3344 * definition.
3345 *
3346 * Returns the attribute name, and the value in *value.
3347 */
3348
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003349static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003350htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003351 const xmlChar *name;
3352 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003353
3354 *value = NULL;
3355 name = htmlParseHTMLName(ctxt);
3356 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003357 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3358 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003359 return(NULL);
3360 }
3361
3362 /*
3363 * read the value
3364 */
3365 SKIP_BLANKS;
3366 if (CUR == '=') {
3367 NEXT;
3368 SKIP_BLANKS;
3369 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003370 } else if (htmlIsBooleanAttr(name)) {
3371 /*
3372 * assume a minimized attribute
3373 */
3374 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003375 }
3376
3377 *value = val;
3378 return(name);
3379}
3380
3381/**
3382 * htmlCheckEncoding:
3383 * @ctxt: an HTML parser context
3384 * @attvalue: the attribute value
3385 *
3386 * Checks an http-equiv attribute from a Meta tag to detect
3387 * the encoding
3388 * If a new encoding is detected the parser is switched to decode
3389 * it and pass UTF8
3390 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003391static void
Owen Taylor3473f882001-02-23 17:55:21 +00003392htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3393 const xmlChar *encoding;
3394
3395 if ((ctxt == NULL) || (attvalue == NULL))
3396 return;
3397
Daniel Veillarde77db162009-08-22 11:32:38 +02003398 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003399 if (ctxt->input->encoding != NULL)
3400 return;
3401
3402 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3403 if (encoding != NULL) {
3404 encoding += 8;
3405 } else {
3406 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3407 if (encoding != NULL)
3408 encoding += 9;
3409 }
3410 if (encoding != NULL) {
3411 xmlCharEncoding enc;
3412 xmlCharEncodingHandlerPtr handler;
3413
3414 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3415
3416 if (ctxt->input->encoding != NULL)
3417 xmlFree((xmlChar *) ctxt->input->encoding);
3418 ctxt->input->encoding = xmlStrdup(encoding);
3419
3420 enc = xmlParseCharEncoding((const char *) encoding);
3421 /*
3422 * registered set of known encodings
3423 */
3424 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003425 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003426 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3427 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3428 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3429 (ctxt->input->buf != NULL) &&
3430 (ctxt->input->buf->encoder == NULL)) {
3431 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3432 "htmlCheckEncoding: wrong encoding meta\n",
3433 NULL, NULL);
3434 } else {
3435 xmlSwitchEncoding(ctxt, enc);
3436 }
Owen Taylor3473f882001-02-23 17:55:21 +00003437 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3438 } else {
3439 /*
3440 * fallback for unknown encodings
3441 */
3442 handler = xmlFindCharEncodingHandler((const char *) encoding);
3443 if (handler != NULL) {
3444 xmlSwitchToEncoding(ctxt, handler);
3445 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3446 } else {
3447 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3448 }
3449 }
3450
3451 if ((ctxt->input->buf != NULL) &&
3452 (ctxt->input->buf->encoder != NULL) &&
3453 (ctxt->input->buf->raw != NULL) &&
3454 (ctxt->input->buf->buffer != NULL)) {
3455 int nbchars;
3456 int processed;
3457
3458 /*
3459 * convert as much as possible to the parser reading buffer.
3460 */
3461 processed = ctxt->input->cur - ctxt->input->base;
3462 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3463 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3464 ctxt->input->buf->buffer,
3465 ctxt->input->buf->raw);
3466 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003467 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3468 "htmlCheckEncoding: encoder error\n",
3469 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003470 }
3471 ctxt->input->base =
3472 ctxt->input->cur = ctxt->input->buf->buffer->content;
3473 }
3474 }
3475}
3476
3477/**
3478 * htmlCheckMeta:
3479 * @ctxt: an HTML parser context
3480 * @atts: the attributes values
3481 *
3482 * Checks an attributes from a Meta tag
3483 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003484static void
Owen Taylor3473f882001-02-23 17:55:21 +00003485htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3486 int i;
3487 const xmlChar *att, *value;
3488 int http = 0;
3489 const xmlChar *content = NULL;
3490
3491 if ((ctxt == NULL) || (atts == NULL))
3492 return;
3493
3494 i = 0;
3495 att = atts[i++];
3496 while (att != NULL) {
3497 value = atts[i++];
3498 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3499 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3500 http = 1;
3501 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3502 content = value;
3503 att = atts[i++];
3504 }
3505 if ((http) && (content != NULL))
3506 htmlCheckEncoding(ctxt, content);
3507
3508}
3509
3510/**
3511 * htmlParseStartTag:
3512 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003513 *
Owen Taylor3473f882001-02-23 17:55:21 +00003514 * parse a start of tag either for rule element or
3515 * EmptyElement. In both case we don't parse the tag closing chars.
3516 *
3517 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3518 *
3519 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3520 *
3521 * With namespace:
3522 *
3523 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3524 *
3525 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3526 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003527 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003528 */
3529
Daniel Veillard597f1c12005-07-03 23:00:18 +00003530static int
Owen Taylor3473f882001-02-23 17:55:21 +00003531htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003532 const xmlChar *name;
3533 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003534 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003535 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003536 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003537 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003538 int meta = 0;
3539 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003540 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003541
Daniel Veillarde77db162009-08-22 11:32:38 +02003542 if (ctxt->instate == XML_PARSER_EOF)
3543 return(-1);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003544 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3545 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3546 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003547 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003548 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003549 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003550 NEXT;
3551
Daniel Veillard30e76072006-03-09 14:13:55 +00003552 atts = ctxt->atts;
3553 maxatts = ctxt->maxatts;
3554
Owen Taylor3473f882001-02-23 17:55:21 +00003555 GROW;
3556 name = htmlParseHTMLName(ctxt);
3557 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003558 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3559 "htmlParseStartTag: invalid element name\n",
3560 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003561 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003562 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3563 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003564 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003565 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003566 }
3567 if (xmlStrEqual(name, BAD_CAST"meta"))
3568 meta = 1;
3569
3570 /*
3571 * Check for auto-closure of HTML elements.
3572 */
3573 htmlAutoClose(ctxt, name);
3574
3575 /*
3576 * Check for implied HTML elements.
3577 */
3578 htmlCheckImplied(ctxt, name);
3579
3580 /*
3581 * Avoid html at any level > 0, head at any level != 1
3582 * or any attempt to recurse body
3583 */
3584 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003585 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3586 "htmlParseStartTag: misplaced <html> tag\n",
3587 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003588 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003589 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003590 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003591 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003592 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003593 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3594 "htmlParseStartTag: misplaced <head> tag\n",
3595 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003596 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003597 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003598 }
3599 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003600 int indx;
3601 for (indx = 0;indx < ctxt->nameNr;indx++) {
3602 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003603 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3604 "htmlParseStartTag: misplaced <body> tag\n",
3605 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003606 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003607 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003608 }
3609 }
3610 }
3611
3612 /*
3613 * Now parse the attributes, it ends up with the ending
3614 *
3615 * (S Attribute)* S?
3616 */
3617 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003618 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003619 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003620 ((CUR != '/') || (NXT(1) != '>'))) {
3621 long cons = ctxt->nbChars;
3622
3623 GROW;
3624 attname = htmlParseAttribute(ctxt, &attvalue);
3625 if (attname != NULL) {
3626
3627 /*
3628 * Well formedness requires at most one declaration of an attribute
3629 */
3630 for (i = 0; i < nbatts;i += 2) {
3631 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003632 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3633 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003634 if (attvalue != NULL)
3635 xmlFree(attvalue);
3636 goto failed;
3637 }
3638 }
3639
3640 /*
3641 * Add the pair to atts
3642 */
3643 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003644 maxatts = 22; /* allow for 10 attrs by default */
3645 atts = (const xmlChar **)
3646 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003647 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003648 htmlErrMemory(ctxt, NULL);
3649 if (attvalue != NULL)
3650 xmlFree(attvalue);
3651 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003652 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003653 ctxt->atts = atts;
3654 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003655 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003656 const xmlChar **n;
3657
Owen Taylor3473f882001-02-23 17:55:21 +00003658 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003659 n = (const xmlChar **) xmlRealloc((void *) atts,
3660 maxatts * sizeof(const xmlChar *));
3661 if (n == NULL) {
3662 htmlErrMemory(ctxt, NULL);
3663 if (attvalue != NULL)
3664 xmlFree(attvalue);
3665 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003666 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003667 atts = n;
3668 ctxt->atts = atts;
3669 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003670 }
3671 atts[nbatts++] = attname;
3672 atts[nbatts++] = attvalue;
3673 atts[nbatts] = NULL;
3674 atts[nbatts + 1] = NULL;
3675 }
3676 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003677 if (attvalue != NULL)
3678 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003679 /* Dump the bogus attribute string up to the next blank or
3680 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003681 while ((IS_CHAR_CH(CUR)) &&
3682 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003683 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003684 NEXT;
3685 }
3686
3687failed:
3688 SKIP_BLANKS;
3689 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003690 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3691 "htmlParseStartTag: problem parsing attributes\n",
3692 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003693 break;
3694 }
3695 }
3696
3697 /*
3698 * Handle specific association to the META tag
3699 */
William M. Bracke978ae22007-03-21 06:16:02 +00003700 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003701 htmlCheckMeta(ctxt, atts);
3702
3703 /*
3704 * SAX: Start of Element !
3705 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003706 if (!discardtag) {
3707 htmlnamePush(ctxt, name);
3708 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3709 if (nbatts != 0)
3710 ctxt->sax->startElement(ctxt->userData, name, atts);
3711 else
3712 ctxt->sax->startElement(ctxt->userData, name, NULL);
3713 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003714 }
Owen Taylor3473f882001-02-23 17:55:21 +00003715
3716 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003717 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003718 if (atts[i] != NULL)
3719 xmlFree((xmlChar *) atts[i]);
3720 }
Owen Taylor3473f882001-02-23 17:55:21 +00003721 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003722
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003723 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003724}
3725
3726/**
3727 * htmlParseEndTag:
3728 * @ctxt: an HTML parser context
3729 *
3730 * parse an end of tag
3731 *
3732 * [42] ETag ::= '</' Name S? '>'
3733 *
3734 * With namespace
3735 *
3736 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003737 *
3738 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003739 */
3740
Daniel Veillardf420ac52001-07-04 16:04:09 +00003741static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003742htmlParseEndTag(htmlParserCtxtPtr ctxt)
3743{
3744 const xmlChar *name;
3745 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003746 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003747
3748 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003749 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3750 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003751 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003752 }
3753 SKIP(2);
3754
3755 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003756 if (name == NULL)
3757 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003758 /*
3759 * We should definitely be at the ending "S? '>'" part
3760 */
3761 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003762 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003763 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3764 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003765 if (ctxt->recovery) {
3766 /*
3767 * We're not at the ending > !!
3768 * Error, unless in recover mode where we search forwards
3769 * until we find a >
3770 */
3771 while (CUR != '\0' && CUR != '>') NEXT;
3772 NEXT;
3773 }
Owen Taylor3473f882001-02-23 17:55:21 +00003774 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003775 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003776
3777 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003778 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3779 * out now.
3780 */
3781 if ((ctxt->depth > 0) &&
3782 (xmlStrEqual(name, BAD_CAST "html") ||
3783 xmlStrEqual(name, BAD_CAST "body") ||
3784 xmlStrEqual(name, BAD_CAST "head"))) {
3785 ctxt->depth--;
3786 return (0);
3787 }
3788
3789 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003790 * If the name read is not one of the element in the parsing stack
3791 * then return, it's just an error.
3792 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003793 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3794 if (xmlStrEqual(name, ctxt->nameTab[i]))
3795 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003796 }
3797 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003798 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3799 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003800 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003801 }
3802
3803
3804 /*
3805 * Check for auto-closure of HTML elements.
3806 */
3807
3808 htmlAutoCloseOnClose(ctxt, name);
3809
3810 /*
3811 * Well formedness constraints, opening and closing must match.
3812 * With the exception that the autoclose may have popped stuff out
3813 * of the stack.
3814 */
3815 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003816 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003817 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3818 "Opening and ending tag mismatch: %s and %s\n",
3819 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003820 }
3821 }
3822
3823 /*
3824 * SAX: End of Tag
3825 */
3826 oldname = ctxt->name;
3827 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003828 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3829 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003830 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003831 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003832 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003833 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003834 }
3835
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003836 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003837}
3838
3839
3840/**
3841 * htmlParseReference:
3842 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003843 *
Owen Taylor3473f882001-02-23 17:55:21 +00003844 * parse and handle entity references in content,
3845 * this will end-up in a call to character() since this is either a
3846 * CharRef, or a predefined entity.
3847 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003848static void
Owen Taylor3473f882001-02-23 17:55:21 +00003849htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003850 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003851 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003852 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003853 if (CUR != '&') return;
3854
3855 if (NXT(1) == '#') {
3856 unsigned int c;
3857 int bits, i = 0;
3858
3859 c = htmlParseCharRef(ctxt);
3860 if (c == 0)
3861 return;
3862
3863 if (c < 0x80) { out[i++]= c; bits= -6; }
3864 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3865 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3866 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003867
Owen Taylor3473f882001-02-23 17:55:21 +00003868 for ( ; bits >= 0; bits-= 6) {
3869 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3870 }
3871 out[i] = 0;
3872
3873 htmlCheckParagraph(ctxt);
3874 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3875 ctxt->sax->characters(ctxt->userData, out, i);
3876 } else {
3877 ent = htmlParseEntityRef(ctxt, &name);
3878 if (name == NULL) {
3879 htmlCheckParagraph(ctxt);
3880 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3881 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3882 return;
3883 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003884 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003885 htmlCheckParagraph(ctxt);
3886 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3887 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3888 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3889 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3890 }
3891 } else {
3892 unsigned int c;
3893 int bits, i = 0;
3894
3895 c = ent->value;
3896 if (c < 0x80)
3897 { out[i++]= c; bits= -6; }
3898 else if (c < 0x800)
3899 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3900 else if (c < 0x10000)
3901 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003902 else
Owen Taylor3473f882001-02-23 17:55:21 +00003903 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003904
Owen Taylor3473f882001-02-23 17:55:21 +00003905 for ( ; bits >= 0; bits-= 6) {
3906 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3907 }
3908 out[i] = 0;
3909
3910 htmlCheckParagraph(ctxt);
3911 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3912 ctxt->sax->characters(ctxt->userData, out, i);
3913 }
Owen Taylor3473f882001-02-23 17:55:21 +00003914 }
3915}
3916
3917/**
3918 * htmlParseContent:
3919 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003920 *
3921 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003922 */
3923
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003924static void
Owen Taylor3473f882001-02-23 17:55:21 +00003925htmlParseContent(htmlParserCtxtPtr ctxt) {
3926 xmlChar *currentNode;
3927 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003928 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003929
3930 currentNode = xmlStrdup(ctxt->name);
3931 depth = ctxt->nameNr;
3932 while (1) {
3933 long cons = ctxt->nbChars;
3934
3935 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02003936
3937 if (ctxt->instate == XML_PARSER_EOF)
3938 break;
3939
Owen Taylor3473f882001-02-23 17:55:21 +00003940 /*
3941 * Our tag or one of it's parent or children is ending.
3942 */
3943 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003944 if (htmlParseEndTag(ctxt) &&
3945 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3946 if (currentNode != NULL)
3947 xmlFree(currentNode);
3948 return;
3949 }
3950 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003951 }
3952
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003953 else if ((CUR == '<') &&
3954 ((IS_ASCII_LETTER(NXT(1))) ||
3955 (NXT(1) == '_') || (NXT(1) == ':'))) {
3956 name = htmlParseHTMLName_nonInvasive(ctxt);
3957 if (name == NULL) {
3958 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3959 "htmlParseStartTag: invalid element name\n",
3960 NULL, NULL);
3961 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003962 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003963 NEXT;
3964
3965 if (currentNode != NULL)
3966 xmlFree(currentNode);
3967 return;
3968 }
3969
3970 if (ctxt->name != NULL) {
3971 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3972 htmlAutoClose(ctxt, name);
3973 continue;
3974 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003975 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003976 }
3977
Owen Taylor3473f882001-02-23 17:55:21 +00003978 /*
3979 * Has this node been popped out during parsing of
3980 * the next element
3981 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003982 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3983 (!xmlStrEqual(currentNode, ctxt->name)))
3984 {
Owen Taylor3473f882001-02-23 17:55:21 +00003985 if (currentNode != NULL) xmlFree(currentNode);
3986 return;
3987 }
3988
Daniel Veillardf9533d12001-03-03 10:04:57 +00003989 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3990 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003991 /*
3992 * Handle SCRIPT/STYLE separately
3993 */
3994 htmlParseScript(ctxt);
3995 } else {
3996 /*
3997 * Sometimes DOCTYPE arrives in the middle of the document
3998 */
3999 if ((CUR == '<') && (NXT(1) == '!') &&
4000 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4001 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4002 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4003 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004004 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4005 "Misplaced DOCTYPE declaration\n",
4006 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004007 htmlParseDocTypeDecl(ctxt);
4008 }
4009
4010 /*
4011 * First case : a comment
4012 */
4013 if ((CUR == '<') && (NXT(1) == '!') &&
4014 (NXT(2) == '-') && (NXT(3) == '-')) {
4015 htmlParseComment(ctxt);
4016 }
4017
4018 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004019 * Second case : a Processing Instruction.
4020 */
4021 else if ((CUR == '<') && (NXT(1) == '?')) {
4022 htmlParsePI(ctxt);
4023 }
4024
4025 /*
4026 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004027 */
4028 else if (CUR == '<') {
4029 htmlParseElement(ctxt);
4030 }
4031
4032 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004033 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004034 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004035 */
4036 else if (CUR == '&') {
4037 htmlParseReference(ctxt);
4038 }
4039
4040 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004041 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004042 */
4043 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004044 htmlAutoCloseOnEnd(ctxt);
4045 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004046 }
4047
4048 /*
4049 * Last case, text. Note that References are handled directly.
4050 */
4051 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004052 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004053 }
4054
4055 if (cons == ctxt->nbChars) {
4056 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004057 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4058 "detected an error in element content\n",
4059 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004060 }
4061 break;
4062 }
4063 }
4064 GROW;
4065 }
4066 if (currentNode != NULL) xmlFree(currentNode);
4067}
4068
4069/**
Daniel Veillard499cc922006-01-18 17:22:35 +00004070 * htmlParseContent:
4071 * @ctxt: an HTML parser context
4072 *
4073 * Parse a content: comment, sub-element, reference or text.
4074 */
4075
4076void
4077__htmlParseContent(void *ctxt) {
4078 if (ctxt != NULL)
4079 htmlParseContent((htmlParserCtxtPtr) ctxt);
4080}
4081
4082/**
Owen Taylor3473f882001-02-23 17:55:21 +00004083 * htmlParseElement:
4084 * @ctxt: an HTML parser context
4085 *
4086 * parse an HTML element, this is highly recursive
4087 *
4088 * [39] element ::= EmptyElemTag | STag content ETag
4089 *
4090 * [41] Attribute ::= Name Eq AttValue
4091 */
4092
4093void
4094htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004095 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004096 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004097 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004098 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004099 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004100 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004101 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004102
Daniel Veillarda03e3652004-11-02 18:45:30 +00004103 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4104 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004105 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004106 return;
4107 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004108
4109 if (ctxt->instate == XML_PARSER_EOF)
4110 return;
4111
Owen Taylor3473f882001-02-23 17:55:21 +00004112 /* Capture start position */
4113 if (ctxt->record_info) {
4114 node_info.begin_pos = ctxt->input->consumed +
4115 (CUR_PTR - ctxt->input->base);
4116 node_info.begin_line = ctxt->input->line;
4117 }
4118
Daniel Veillard597f1c12005-07-03 23:00:18 +00004119 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004120 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004121 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004122 if (CUR == '>')
4123 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004124 return;
4125 }
Owen Taylor3473f882001-02-23 17:55:21 +00004126
4127 /*
4128 * Lookup the info for that element.
4129 */
4130 info = htmlTagLookup(name);
4131 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004132 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4133 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004134 }
4135
4136 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004137 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004138 */
4139 if ((CUR == '/') && (NXT(1) == '>')) {
4140 SKIP(2);
4141 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4142 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004143 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004144 return;
4145 }
4146
4147 if (CUR == '>') {
4148 NEXT;
4149 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004150 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4151 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004152
4153 /*
4154 * end of parsing of this node.
4155 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004156 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004157 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004158 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004159 }
Owen Taylor3473f882001-02-23 17:55:21 +00004160
4161 /*
4162 * Capture end position and add node
4163 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004164 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004165 node_info.end_pos = ctxt->input->consumed +
4166 (CUR_PTR - ctxt->input->base);
4167 node_info.end_line = ctxt->input->line;
4168 node_info.node = ctxt->node;
4169 xmlParserAddNodeInfo(ctxt, &node_info);
4170 }
4171 return;
4172 }
4173
4174 /*
4175 * Check for an Empty Element from DTD definition
4176 */
4177 if ((info != NULL) && (info->empty)) {
4178 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4179 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004180 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004181 return;
4182 }
4183
4184 /*
4185 * Parse the content of the element:
4186 */
4187 currentNode = xmlStrdup(ctxt->name);
4188 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004189 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004190 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004191 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004192 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004193 if (ctxt->nameNr < depth) break;
4194 }
Owen Taylor3473f882001-02-23 17:55:21 +00004195
Owen Taylor3473f882001-02-23 17:55:21 +00004196 /*
4197 * Capture end position and add node
4198 */
4199 if ( currentNode != NULL && ctxt->record_info ) {
4200 node_info.end_pos = ctxt->input->consumed +
4201 (CUR_PTR - ctxt->input->base);
4202 node_info.end_line = ctxt->input->line;
4203 node_info.node = ctxt->node;
4204 xmlParserAddNodeInfo(ctxt, &node_info);
4205 }
William M. Brack76e95df2003-10-18 16:20:14 +00004206 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004207 htmlAutoCloseOnEnd(ctxt);
4208 }
4209
Owen Taylor3473f882001-02-23 17:55:21 +00004210 if (currentNode != NULL)
4211 xmlFree(currentNode);
4212}
4213
4214/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004215 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004216 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004217 *
Owen Taylor3473f882001-02-23 17:55:21 +00004218 * parse an HTML document (and build a tree if using the standard SAX
4219 * interface).
4220 *
4221 * Returns 0, -1 in case of error. the parser context is augmented
4222 * as a result of the parsing.
4223 */
4224
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004225int
Owen Taylor3473f882001-02-23 17:55:21 +00004226htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004227 xmlChar start[4];
4228 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004229 xmlDtdPtr dtd;
4230
Daniel Veillardd0463562001-10-13 09:15:48 +00004231 xmlInitParser();
4232
Owen Taylor3473f882001-02-23 17:55:21 +00004233 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004234
Daniel Veillarda03e3652004-11-02 18:45:30 +00004235 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4236 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4237 "htmlParseDocument: context error\n", NULL, NULL);
4238 return(XML_ERR_INTERNAL_ERROR);
4239 }
4240 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004241 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004242 GROW;
4243 /*
4244 * SAX: beginning of the document processing.
4245 */
4246 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4247 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4248
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004249 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4250 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4251 /*
4252 * Get the 4 first bytes and decode the charset
4253 * if enc != XML_CHAR_ENCODING_NONE
4254 * plug some encoding conversion routines.
4255 */
4256 start[0] = RAW;
4257 start[1] = NXT(1);
4258 start[2] = NXT(2);
4259 start[3] = NXT(3);
4260 enc = xmlDetectCharEncoding(&start[0], 4);
4261 if (enc != XML_CHAR_ENCODING_NONE) {
4262 xmlSwitchEncoding(ctxt, enc);
4263 }
4264 }
4265
Owen Taylor3473f882001-02-23 17:55:21 +00004266 /*
4267 * Wipe out everything which is before the first '<'
4268 */
4269 SKIP_BLANKS;
4270 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004271 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004272 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004273 }
4274
4275 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4276 ctxt->sax->startDocument(ctxt->userData);
4277
4278
4279 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004280 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004281 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004282 while (((CUR == '<') && (NXT(1) == '!') &&
4283 (NXT(2) == '-') && (NXT(3) == '-')) ||
4284 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004285 htmlParseComment(ctxt);
4286 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004287 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004288 }
Owen Taylor3473f882001-02-23 17:55:21 +00004289
4290
4291 /*
4292 * Then possibly doc type declaration(s) and more Misc
4293 * (doctypedecl Misc*)?
4294 */
4295 if ((CUR == '<') && (NXT(1) == '!') &&
4296 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4297 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4298 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4299 (UPP(8) == 'E')) {
4300 htmlParseDocTypeDecl(ctxt);
4301 }
4302 SKIP_BLANKS;
4303
4304 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004305 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004306 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004307 while (((CUR == '<') && (NXT(1) == '!') &&
4308 (NXT(2) == '-') && (NXT(3) == '-')) ||
4309 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004310 htmlParseComment(ctxt);
4311 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004312 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004313 }
Owen Taylor3473f882001-02-23 17:55:21 +00004314
4315 /*
4316 * Time to start parsing the tree itself
4317 */
4318 htmlParseContent(ctxt);
4319
4320 /*
4321 * autoclose
4322 */
4323 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004324 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004325
4326
4327 /*
4328 * SAX: end of the document processing.
4329 */
4330 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4331 ctxt->sax->endDocument(ctxt->userData);
4332
4333 if (ctxt->myDoc != NULL) {
4334 dtd = xmlGetIntSubset(ctxt->myDoc);
4335 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004336 ctxt->myDoc->intSubset =
4337 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004338 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4339 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4340 }
4341 if (! ctxt->wellFormed) return(-1);
4342 return(0);
4343}
4344
4345
4346/************************************************************************
4347 * *
4348 * Parser contexts handling *
4349 * *
4350 ************************************************************************/
4351
4352/**
William M. Brackedb65a72004-02-06 07:36:04 +00004353 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004354 * @ctxt: an HTML parser context
4355 *
4356 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004357 *
4358 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004359 */
4360
Daniel Veillardf403d292003-10-05 13:51:35 +00004361static int
Owen Taylor3473f882001-02-23 17:55:21 +00004362htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4363{
4364 htmlSAXHandler *sax;
4365
Daniel Veillardf403d292003-10-05 13:51:35 +00004366 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004367 memset(ctxt, 0, sizeof(htmlParserCtxt));
4368
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004369 ctxt->dict = xmlDictCreate();
4370 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004371 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4372 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004373 }
Owen Taylor3473f882001-02-23 17:55:21 +00004374 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4375 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004376 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4377 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004378 }
4379 else
4380 memset(sax, 0, sizeof(htmlSAXHandler));
4381
4382 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004383 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004384 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4385 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004386 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004387 ctxt->inputNr = 0;
4388 ctxt->inputMax = 0;
4389 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004390 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004391 }
4392 ctxt->inputNr = 0;
4393 ctxt->inputMax = 5;
4394 ctxt->input = NULL;
4395 ctxt->version = NULL;
4396 ctxt->encoding = NULL;
4397 ctxt->standalone = -1;
4398 ctxt->instate = XML_PARSER_START;
4399
4400 /* Allocate the Node stack */
4401 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4402 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004403 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004404 ctxt->nodeNr = 0;
4405 ctxt->nodeMax = 0;
4406 ctxt->node = NULL;
4407 ctxt->inputNr = 0;
4408 ctxt->inputMax = 0;
4409 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004410 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004411 }
4412 ctxt->nodeNr = 0;
4413 ctxt->nodeMax = 10;
4414 ctxt->node = NULL;
4415
4416 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004417 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004418 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004419 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004420 ctxt->nameNr = 0;
4421 ctxt->nameMax = 10;
4422 ctxt->name = NULL;
4423 ctxt->nodeNr = 0;
4424 ctxt->nodeMax = 0;
4425 ctxt->node = NULL;
4426 ctxt->inputNr = 0;
4427 ctxt->inputMax = 0;
4428 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004429 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004430 }
4431 ctxt->nameNr = 0;
4432 ctxt->nameMax = 10;
4433 ctxt->name = NULL;
4434
Daniel Veillard092643b2003-09-25 14:29:29 +00004435 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004436 else {
4437 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004438 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004439 }
4440 ctxt->userData = ctxt;
4441 ctxt->myDoc = NULL;
4442 ctxt->wellFormed = 1;
4443 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004444 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004445 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004446 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004447 ctxt->vctxt.userData = ctxt;
4448 ctxt->vctxt.error = xmlParserValidityError;
4449 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004450 ctxt->record_info = 0;
4451 ctxt->validate = 0;
4452 ctxt->nbChars = 0;
4453 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004454 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004455 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004456 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004457}
4458
4459/**
4460 * htmlFreeParserCtxt:
4461 * @ctxt: an HTML parser context
4462 *
4463 * Free all the memory used by a parser context. However the parsed
4464 * document in ctxt->myDoc is not freed.
4465 */
4466
4467void
4468htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4469{
4470 xmlFreeParserCtxt(ctxt);
4471}
4472
4473/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004474 * htmlNewParserCtxt:
4475 *
4476 * Allocate and initialize a new parser context.
4477 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004478 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004479 */
4480
Daniel Veillard34c647c2006-09-21 06:53:59 +00004481htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004482htmlNewParserCtxt(void)
4483{
4484 xmlParserCtxtPtr ctxt;
4485
4486 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4487 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004488 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004489 return(NULL);
4490 }
4491 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004492 if (htmlInitParserCtxt(ctxt) < 0) {
4493 htmlFreeParserCtxt(ctxt);
4494 return(NULL);
4495 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004496 return(ctxt);
4497}
4498
4499/**
4500 * htmlCreateMemoryParserCtxt:
4501 * @buffer: a pointer to a char array
4502 * @size: the size of the array
4503 *
4504 * Create a parser context for an HTML in-memory document.
4505 *
4506 * Returns the new parser context or NULL
4507 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004508htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004509htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4510 xmlParserCtxtPtr ctxt;
4511 xmlParserInputPtr input;
4512 xmlParserInputBufferPtr buf;
4513
4514 if (buffer == NULL)
4515 return(NULL);
4516 if (size <= 0)
4517 return(NULL);
4518
4519 ctxt = htmlNewParserCtxt();
4520 if (ctxt == NULL)
4521 return(NULL);
4522
4523 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4524 if (buf == NULL) return(NULL);
4525
4526 input = xmlNewInputStream(ctxt);
4527 if (input == NULL) {
4528 xmlFreeParserCtxt(ctxt);
4529 return(NULL);
4530 }
4531
4532 input->filename = NULL;
4533 input->buf = buf;
4534 input->base = input->buf->buffer->content;
4535 input->cur = input->buf->buffer->content;
4536 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4537
4538 inputPush(ctxt, input);
4539 return(ctxt);
4540}
4541
4542/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004543 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004544 * @cur: a pointer to an array of xmlChar
4545 * @encoding: a free form C string describing the HTML document encoding, or NULL
4546 *
4547 * Create a parser context for an HTML document.
4548 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004549 * TODO: check the need to add encoding handling there
4550 *
Owen Taylor3473f882001-02-23 17:55:21 +00004551 * Returns the new parser context or NULL
4552 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004553static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004554htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004555 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004556 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004557
Daniel Veillard1d995272002-07-22 16:43:32 +00004558 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004559 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004560 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004561 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004562 if (ctxt == NULL)
4563 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004564
4565 if (encoding != NULL) {
4566 xmlCharEncoding enc;
4567 xmlCharEncodingHandlerPtr handler;
4568
4569 if (ctxt->input->encoding != NULL)
4570 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004571 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004572
4573 enc = xmlParseCharEncoding(encoding);
4574 /*
4575 * registered set of known encodings
4576 */
4577 if (enc != XML_CHAR_ENCODING_ERROR) {
4578 xmlSwitchEncoding(ctxt, enc);
4579 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004580 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02004581 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00004582 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004583 }
4584 } else {
4585 /*
4586 * fallback for unknown encodings
4587 */
4588 handler = xmlFindCharEncodingHandler((const char *) encoding);
4589 if (handler != NULL) {
4590 xmlSwitchToEncoding(ctxt, handler);
4591 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004592 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4593 "Unsupported encoding %s\n",
4594 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004595 }
4596 }
4597 }
4598 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004599}
4600
Daniel Veillard73b013f2003-09-30 12:36:01 +00004601#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004602/************************************************************************
4603 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02004604 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00004605 * *
4606 ************************************************************************/
4607
4608/**
4609 * htmlParseLookupSequence:
4610 * @ctxt: an HTML parser context
4611 * @first: the first char to lookup
4612 * @next: the next char to lookup or zero
4613 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004614 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004615 *
4616 * Try to find if a sequence (first, next, third) or just (first next) or
4617 * (first) is available in the input stream.
4618 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4619 * to avoid rescanning sequences of bytes, it DOES change the state of the
4620 * parser, do not use liberally.
4621 * This is basically similar to xmlParseLookupSequence()
4622 *
4623 * Returns the index to the current parsing point if the full sequence
4624 * is available, -1 otherwise.
4625 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004626static int
Owen Taylor3473f882001-02-23 17:55:21 +00004627htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02004628 xmlChar next, xmlChar third, int iscomment,
4629 int ignoreattrval) {
Owen Taylor3473f882001-02-23 17:55:21 +00004630 int base, len;
4631 htmlParserInputPtr in;
4632 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004633 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02004634 int invalue = 0;
4635 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00004636
4637 in = ctxt->input;
4638 if (in == NULL) return(-1);
4639 base = in->cur - in->base;
4640 if (base < 0) return(-1);
4641 if (ctxt->checkIndex > base)
4642 base = ctxt->checkIndex;
4643 if (in->buf == NULL) {
4644 buf = in->base;
4645 len = in->length;
4646 } else {
4647 buf = in->buf->buffer->content;
4648 len = in->buf->buffer->use;
4649 }
4650 /* take into account the sequence length */
4651 if (third) len -= 2;
4652 else if (next) len --;
4653 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004654 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004655 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4656 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4657 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004658 /* do not increment past <! - some people use <!--> */
4659 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004660 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004661 }
Jiri Netolicky446e1262009-08-07 17:05:36 +02004662 if (ignoreattrval) {
4663 if (buf[base] == '"' || buf[base] == '\'') {
4664 if (invalue) {
4665 if (buf[base] == valdellim) {
4666 invalue = 0;
4667 continue;
4668 }
4669 } else {
4670 valdellim = buf[base];
4671 invalue = 1;
4672 continue;
4673 }
4674 } else if (invalue) {
4675 continue;
4676 }
4677 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004678 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004679 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004680 return(-1);
4681 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4682 (buf[base + 2] == '>')) {
4683 incomment = 0;
4684 base += 2;
4685 }
4686 continue;
4687 }
Owen Taylor3473f882001-02-23 17:55:21 +00004688 if (buf[base] == first) {
4689 if (third != 0) {
4690 if ((buf[base + 1] != next) ||
4691 (buf[base + 2] != third)) continue;
4692 } else if (next != 0) {
4693 if (buf[base + 1] != next) continue;
4694 }
4695 ctxt->checkIndex = 0;
4696#ifdef DEBUG_PUSH
4697 if (next == 0)
4698 xmlGenericError(xmlGenericErrorContext,
4699 "HPP: lookup '%c' found at %d\n",
4700 first, base);
4701 else if (third == 0)
4702 xmlGenericError(xmlGenericErrorContext,
4703 "HPP: lookup '%c%c' found at %d\n",
4704 first, next, base);
Daniel Veillarde77db162009-08-22 11:32:38 +02004705 else
Owen Taylor3473f882001-02-23 17:55:21 +00004706 xmlGenericError(xmlGenericErrorContext,
4707 "HPP: lookup '%c%c%c' found at %d\n",
4708 first, next, third, base);
4709#endif
4710 return(base - (in->cur - in->base));
4711 }
4712 }
4713 ctxt->checkIndex = base;
4714#ifdef DEBUG_PUSH
4715 if (next == 0)
4716 xmlGenericError(xmlGenericErrorContext,
4717 "HPP: lookup '%c' failed\n", first);
4718 else if (third == 0)
4719 xmlGenericError(xmlGenericErrorContext,
4720 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02004721 else
Owen Taylor3473f882001-02-23 17:55:21 +00004722 xmlGenericError(xmlGenericErrorContext,
4723 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4724#endif
4725 return(-1);
4726}
4727
4728/**
Markus Kull56a03032009-08-24 19:00:23 +02004729 * htmlParseLookupChars:
4730 * @ctxt: an HTML parser context
4731 * @stop: Array of chars, which stop the lookup.
4732 * @stopLen: Length of stop-Array
4733 *
4734 * Try to find if any char of the stop-Array is available in the input
4735 * stream.
4736 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4737 * to avoid rescanning sequences of bytes, it DOES change the state of the
4738 * parser, do not use liberally.
4739 *
4740 * Returns the index to the current parsing point if a stopChar
4741 * is available, -1 otherwise.
4742 */
4743static int
4744htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
4745 int stopLen)
4746{
4747 int base, len;
4748 htmlParserInputPtr in;
4749 const xmlChar *buf;
4750 int incomment = 0;
4751 int i;
4752
4753 in = ctxt->input;
4754 if (in == NULL)
4755 return (-1);
4756
4757 base = in->cur - in->base;
4758 if (base < 0)
4759 return (-1);
4760
4761 if (ctxt->checkIndex > base)
4762 base = ctxt->checkIndex;
4763
4764 if (in->buf == NULL) {
4765 buf = in->base;
4766 len = in->length;
4767 } else {
4768 buf = in->buf->buffer->content;
4769 len = in->buf->buffer->use;
4770 }
4771
4772 for (; base < len; base++) {
4773 if (!incomment && (base + 4 < len)) {
4774 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4775 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4776 incomment = 1;
4777 /* do not increment past <! - some people use <!--> */
4778 base += 2;
4779 }
4780 }
4781 if (incomment) {
4782 if (base + 3 > len)
4783 return (-1);
4784 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4785 (buf[base + 2] == '>')) {
4786 incomment = 0;
4787 base += 2;
4788 }
4789 continue;
4790 }
4791 for (i = 0; i < stopLen; ++i) {
4792 if (buf[base] == stop[i]) {
4793 ctxt->checkIndex = 0;
4794 return (base - (in->cur - in->base));
4795 }
4796 }
4797 }
4798 ctxt->checkIndex = base;
4799 return (-1);
4800}
4801
4802/**
Owen Taylor3473f882001-02-23 17:55:21 +00004803 * htmlParseTryOrFinish:
4804 * @ctxt: an HTML parser context
4805 * @terminate: last chunk indicator
4806 *
4807 * Try to progress on parsing
4808 *
4809 * Returns zero if no parsing was possible
4810 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004811static int
Owen Taylor3473f882001-02-23 17:55:21 +00004812htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4813 int ret = 0;
4814 htmlParserInputPtr in;
4815 int avail = 0;
4816 xmlChar cur, next;
4817
4818#ifdef DEBUG_PUSH
4819 switch (ctxt->instate) {
4820 case XML_PARSER_EOF:
4821 xmlGenericError(xmlGenericErrorContext,
4822 "HPP: try EOF\n"); break;
4823 case XML_PARSER_START:
4824 xmlGenericError(xmlGenericErrorContext,
4825 "HPP: try START\n"); break;
4826 case XML_PARSER_MISC:
4827 xmlGenericError(xmlGenericErrorContext,
4828 "HPP: try MISC\n");break;
4829 case XML_PARSER_COMMENT:
4830 xmlGenericError(xmlGenericErrorContext,
4831 "HPP: try COMMENT\n");break;
4832 case XML_PARSER_PROLOG:
4833 xmlGenericError(xmlGenericErrorContext,
4834 "HPP: try PROLOG\n");break;
4835 case XML_PARSER_START_TAG:
4836 xmlGenericError(xmlGenericErrorContext,
4837 "HPP: try START_TAG\n");break;
4838 case XML_PARSER_CONTENT:
4839 xmlGenericError(xmlGenericErrorContext,
4840 "HPP: try CONTENT\n");break;
4841 case XML_PARSER_CDATA_SECTION:
4842 xmlGenericError(xmlGenericErrorContext,
4843 "HPP: try CDATA_SECTION\n");break;
4844 case XML_PARSER_END_TAG:
4845 xmlGenericError(xmlGenericErrorContext,
4846 "HPP: try END_TAG\n");break;
4847 case XML_PARSER_ENTITY_DECL:
4848 xmlGenericError(xmlGenericErrorContext,
4849 "HPP: try ENTITY_DECL\n");break;
4850 case XML_PARSER_ENTITY_VALUE:
4851 xmlGenericError(xmlGenericErrorContext,
4852 "HPP: try ENTITY_VALUE\n");break;
4853 case XML_PARSER_ATTRIBUTE_VALUE:
4854 xmlGenericError(xmlGenericErrorContext,
4855 "HPP: try ATTRIBUTE_VALUE\n");break;
4856 case XML_PARSER_DTD:
4857 xmlGenericError(xmlGenericErrorContext,
4858 "HPP: try DTD\n");break;
4859 case XML_PARSER_EPILOG:
4860 xmlGenericError(xmlGenericErrorContext,
4861 "HPP: try EPILOG\n");break;
4862 case XML_PARSER_PI:
4863 xmlGenericError(xmlGenericErrorContext,
4864 "HPP: try PI\n");break;
4865 case XML_PARSER_SYSTEM_LITERAL:
4866 xmlGenericError(xmlGenericErrorContext,
4867 "HPP: try SYSTEM_LITERAL\n");break;
4868 }
4869#endif
4870
4871 while (1) {
4872
4873 in = ctxt->input;
4874 if (in == NULL) break;
4875 if (in->buf == NULL)
4876 avail = in->length - (in->cur - in->base);
4877 else
4878 avail = in->buf->buffer->use - (in->cur - in->base);
4879 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004880 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004881 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004882 /*
4883 * SAX: end of the document processing.
4884 */
4885 ctxt->instate = XML_PARSER_EOF;
4886 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4887 ctxt->sax->endDocument(ctxt->userData);
4888 }
4889 }
4890 if (avail < 1)
4891 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004892 cur = in->cur[0];
4893 if (cur == 0) {
4894 SKIP(1);
4895 continue;
4896 }
4897
Owen Taylor3473f882001-02-23 17:55:21 +00004898 switch (ctxt->instate) {
4899 case XML_PARSER_EOF:
4900 /*
4901 * Document parsing is done !
4902 */
4903 goto done;
4904 case XML_PARSER_START:
4905 /*
4906 * Very first chars read from the document flow.
4907 */
4908 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004909 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004910 SKIP_BLANKS;
4911 if (in->buf == NULL)
4912 avail = in->length - (in->cur - in->base);
4913 else
4914 avail = in->buf->buffer->use - (in->cur - in->base);
4915 }
4916 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4917 ctxt->sax->setDocumentLocator(ctxt->userData,
4918 &xmlDefaultSAXLocator);
4919 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4920 (!ctxt->disableSAX))
4921 ctxt->sax->startDocument(ctxt->userData);
4922
4923 cur = in->cur[0];
4924 next = in->cur[1];
4925 if ((cur == '<') && (next == '!') &&
4926 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4927 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4928 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4929 (UPP(8) == 'E')) {
4930 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004931 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004932 goto done;
4933#ifdef DEBUG_PUSH
4934 xmlGenericError(xmlGenericErrorContext,
4935 "HPP: Parsing internal subset\n");
4936#endif
4937 htmlParseDocTypeDecl(ctxt);
4938 ctxt->instate = XML_PARSER_PROLOG;
4939#ifdef DEBUG_PUSH
4940 xmlGenericError(xmlGenericErrorContext,
4941 "HPP: entering PROLOG\n");
4942#endif
4943 } else {
4944 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004945#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004946 xmlGenericError(xmlGenericErrorContext,
4947 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004948#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004949 }
Owen Taylor3473f882001-02-23 17:55:21 +00004950 break;
4951 case XML_PARSER_MISC:
4952 SKIP_BLANKS;
4953 if (in->buf == NULL)
4954 avail = in->length - (in->cur - in->base);
4955 else
4956 avail = in->buf->buffer->use - (in->cur - in->base);
4957 if (avail < 2)
4958 goto done;
4959 cur = in->cur[0];
4960 next = in->cur[1];
4961 if ((cur == '<') && (next == '!') &&
4962 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4963 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004964 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004965 goto done;
4966#ifdef DEBUG_PUSH
4967 xmlGenericError(xmlGenericErrorContext,
4968 "HPP: Parsing Comment\n");
4969#endif
4970 htmlParseComment(ctxt);
4971 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004972 } else if ((cur == '<') && (next == '?')) {
4973 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004974 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004975 goto done;
4976#ifdef DEBUG_PUSH
4977 xmlGenericError(xmlGenericErrorContext,
4978 "HPP: Parsing PI\n");
4979#endif
4980 htmlParsePI(ctxt);
4981 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004982 } else if ((cur == '<') && (next == '!') &&
4983 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4984 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4985 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4986 (UPP(8) == 'E')) {
4987 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004988 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004989 goto done;
4990#ifdef DEBUG_PUSH
4991 xmlGenericError(xmlGenericErrorContext,
4992 "HPP: Parsing internal subset\n");
4993#endif
4994 htmlParseDocTypeDecl(ctxt);
4995 ctxt->instate = XML_PARSER_PROLOG;
4996#ifdef DEBUG_PUSH
4997 xmlGenericError(xmlGenericErrorContext,
4998 "HPP: entering PROLOG\n");
4999#endif
5000 } else if ((cur == '<') && (next == '!') &&
5001 (avail < 9)) {
5002 goto done;
5003 } else {
5004 ctxt->instate = XML_PARSER_START_TAG;
5005#ifdef DEBUG_PUSH
5006 xmlGenericError(xmlGenericErrorContext,
5007 "HPP: entering START_TAG\n");
5008#endif
5009 }
5010 break;
5011 case XML_PARSER_PROLOG:
5012 SKIP_BLANKS;
5013 if (in->buf == NULL)
5014 avail = in->length - (in->cur - in->base);
5015 else
5016 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02005017 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00005018 goto done;
5019 cur = in->cur[0];
5020 next = in->cur[1];
5021 if ((cur == '<') && (next == '!') &&
5022 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5023 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005024 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005025 goto done;
5026#ifdef DEBUG_PUSH
5027 xmlGenericError(xmlGenericErrorContext,
5028 "HPP: Parsing Comment\n");
5029#endif
5030 htmlParseComment(ctxt);
5031 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005032 } else if ((cur == '<') && (next == '?')) {
5033 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005034 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005035 goto done;
5036#ifdef DEBUG_PUSH
5037 xmlGenericError(xmlGenericErrorContext,
5038 "HPP: Parsing PI\n");
5039#endif
5040 htmlParsePI(ctxt);
5041 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005042 } else if ((cur == '<') && (next == '!') &&
5043 (avail < 4)) {
5044 goto done;
5045 } else {
5046 ctxt->instate = XML_PARSER_START_TAG;
5047#ifdef DEBUG_PUSH
5048 xmlGenericError(xmlGenericErrorContext,
5049 "HPP: entering START_TAG\n");
5050#endif
5051 }
5052 break;
5053 case XML_PARSER_EPILOG:
5054 if (in->buf == NULL)
5055 avail = in->length - (in->cur - in->base);
5056 else
5057 avail = in->buf->buffer->use - (in->cur - in->base);
5058 if (avail < 1)
5059 goto done;
5060 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005061 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005062 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005063 goto done;
5064 }
5065 if (avail < 2)
5066 goto done;
5067 next = in->cur[1];
5068 if ((cur == '<') && (next == '!') &&
5069 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5070 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005071 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005072 goto done;
5073#ifdef DEBUG_PUSH
5074 xmlGenericError(xmlGenericErrorContext,
5075 "HPP: Parsing Comment\n");
5076#endif
5077 htmlParseComment(ctxt);
5078 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005079 } else if ((cur == '<') && (next == '?')) {
5080 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005081 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005082 goto done;
5083#ifdef DEBUG_PUSH
5084 xmlGenericError(xmlGenericErrorContext,
5085 "HPP: Parsing PI\n");
5086#endif
5087 htmlParsePI(ctxt);
5088 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005089 } else if ((cur == '<') && (next == '!') &&
5090 (avail < 4)) {
5091 goto done;
5092 } else {
5093 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005094 ctxt->wellFormed = 0;
5095 ctxt->instate = XML_PARSER_EOF;
5096#ifdef DEBUG_PUSH
5097 xmlGenericError(xmlGenericErrorContext,
5098 "HPP: entering EOF\n");
5099#endif
5100 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5101 ctxt->sax->endDocument(ctxt->userData);
5102 goto done;
5103 }
5104 break;
5105 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005106 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005107 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005108 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005109
5110 if (avail < 2)
5111 goto done;
5112 cur = in->cur[0];
5113 if (cur != '<') {
5114 ctxt->instate = XML_PARSER_CONTENT;
5115#ifdef DEBUG_PUSH
5116 xmlGenericError(xmlGenericErrorContext,
5117 "HPP: entering CONTENT\n");
5118#endif
5119 break;
5120 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005121 if (in->cur[1] == '/') {
5122 ctxt->instate = XML_PARSER_END_TAG;
5123 ctxt->checkIndex = 0;
5124#ifdef DEBUG_PUSH
5125 xmlGenericError(xmlGenericErrorContext,
5126 "HPP: entering END_TAG\n");
5127#endif
5128 break;
5129 }
Owen Taylor3473f882001-02-23 17:55:21 +00005130 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005131 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005132 goto done;
5133
Daniel Veillard597f1c12005-07-03 23:00:18 +00005134 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005135 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005136 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005137 (name == NULL)) {
5138 if (CUR == '>')
5139 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005140 break;
5141 }
Owen Taylor3473f882001-02-23 17:55:21 +00005142
5143 /*
5144 * Lookup the info for that element.
5145 */
5146 info = htmlTagLookup(name);
5147 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005148 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5149 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005150 }
5151
5152 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005153 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005154 */
5155 if ((CUR == '/') && (NXT(1) == '>')) {
5156 SKIP(2);
5157 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5158 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005159 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005160 ctxt->instate = XML_PARSER_CONTENT;
5161#ifdef DEBUG_PUSH
5162 xmlGenericError(xmlGenericErrorContext,
5163 "HPP: entering CONTENT\n");
5164#endif
5165 break;
5166 }
5167
5168 if (CUR == '>') {
5169 NEXT;
5170 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005171 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5172 "Couldn't find end of Start Tag %s\n",
5173 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005174
5175 /*
5176 * end of parsing of this node.
5177 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005178 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005179 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005180 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005181 }
Owen Taylor3473f882001-02-23 17:55:21 +00005182
5183 ctxt->instate = XML_PARSER_CONTENT;
5184#ifdef DEBUG_PUSH
5185 xmlGenericError(xmlGenericErrorContext,
5186 "HPP: entering CONTENT\n");
5187#endif
5188 break;
5189 }
5190
5191 /*
5192 * Check for an Empty Element from DTD definition
5193 */
5194 if ((info != NULL) && (info->empty)) {
5195 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5196 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005197 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005198 }
5199 ctxt->instate = XML_PARSER_CONTENT;
5200#ifdef DEBUG_PUSH
5201 xmlGenericError(xmlGenericErrorContext,
5202 "HPP: entering CONTENT\n");
5203#endif
5204 break;
5205 }
5206 case XML_PARSER_CONTENT: {
5207 long cons;
5208 /*
5209 * Handle preparsed entities and charRef
5210 */
5211 if (ctxt->token != 0) {
5212 xmlChar chr[2] = { 0 , 0 } ;
5213
5214 chr[0] = (xmlChar) ctxt->token;
5215 htmlCheckParagraph(ctxt);
5216 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5217 ctxt->sax->characters(ctxt->userData, chr, 1);
5218 ctxt->token = 0;
5219 ctxt->checkIndex = 0;
5220 }
5221 if ((avail == 1) && (terminate)) {
5222 cur = in->cur[0];
5223 if ((cur != '<') && (cur != '&')) {
5224 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005225 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005226 if (ctxt->sax->ignorableWhitespace != NULL)
5227 ctxt->sax->ignorableWhitespace(
5228 ctxt->userData, &cur, 1);
5229 } else {
5230 htmlCheckParagraph(ctxt);
5231 if (ctxt->sax->characters != NULL)
5232 ctxt->sax->characters(
5233 ctxt->userData, &cur, 1);
5234 }
5235 }
5236 ctxt->token = 0;
5237 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005238 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005239 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005240 }
Owen Taylor3473f882001-02-23 17:55:21 +00005241 }
5242 if (avail < 2)
5243 goto done;
5244 cur = in->cur[0];
5245 next = in->cur[1];
5246 cons = ctxt->nbChars;
5247 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5248 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5249 /*
5250 * Handle SCRIPT/STYLE separately
5251 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005252 if (!terminate) {
5253 int idx;
5254 xmlChar val;
5255
Jiri Netolicky446e1262009-08-07 17:05:36 +02005256 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
Daniel Veillard68716a72006-10-16 09:32:17 +00005257 if (idx < 0)
5258 goto done;
5259 val = in->cur[idx + 2];
5260 if (val == 0) /* bad cut of input */
5261 goto done;
5262 }
Owen Taylor3473f882001-02-23 17:55:21 +00005263 htmlParseScript(ctxt);
5264 if ((cur == '<') && (next == '/')) {
5265 ctxt->instate = XML_PARSER_END_TAG;
5266 ctxt->checkIndex = 0;
5267#ifdef DEBUG_PUSH
5268 xmlGenericError(xmlGenericErrorContext,
5269 "HPP: entering END_TAG\n");
5270#endif
5271 break;
5272 }
5273 } else {
5274 /*
5275 * Sometimes DOCTYPE arrives in the middle of the document
5276 */
5277 if ((cur == '<') && (next == '!') &&
5278 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5279 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5280 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5281 (UPP(8) == 'E')) {
5282 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005283 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005284 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005285 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5286 "Misplaced DOCTYPE declaration\n",
5287 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005288 htmlParseDocTypeDecl(ctxt);
5289 } else if ((cur == '<') && (next == '!') &&
5290 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5291 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005292 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005293 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005294 goto done;
5295#ifdef DEBUG_PUSH
5296 xmlGenericError(xmlGenericErrorContext,
5297 "HPP: Parsing Comment\n");
5298#endif
5299 htmlParseComment(ctxt);
5300 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005301 } else if ((cur == '<') && (next == '?')) {
5302 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005303 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005304 goto done;
5305#ifdef DEBUG_PUSH
5306 xmlGenericError(xmlGenericErrorContext,
5307 "HPP: Parsing PI\n");
5308#endif
5309 htmlParsePI(ctxt);
5310 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005311 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5312 goto done;
5313 } else if ((cur == '<') && (next == '/')) {
5314 ctxt->instate = XML_PARSER_END_TAG;
5315 ctxt->checkIndex = 0;
5316#ifdef DEBUG_PUSH
5317 xmlGenericError(xmlGenericErrorContext,
5318 "HPP: entering END_TAG\n");
5319#endif
5320 break;
5321 } else if (cur == '<') {
5322 ctxt->instate = XML_PARSER_START_TAG;
5323 ctxt->checkIndex = 0;
5324#ifdef DEBUG_PUSH
5325 xmlGenericError(xmlGenericErrorContext,
5326 "HPP: entering START_TAG\n");
5327#endif
5328 break;
5329 } else if (cur == '&') {
5330 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005331 (htmlParseLookupChars(ctxt,
5332 BAD_CAST "; >/", 4) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005333 goto done;
5334#ifdef DEBUG_PUSH
5335 xmlGenericError(xmlGenericErrorContext,
5336 "HPP: Parsing Reference\n");
5337#endif
5338 /* TODO: check generation of subtrees if noent !!! */
5339 htmlParseReference(ctxt);
5340 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005341 /*
5342 * check that the text sequence is complete
5343 * before handing out the data to the parser
5344 * to avoid problems with erroneous end of
5345 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005346 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005347 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005348 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005349 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005350 ctxt->checkIndex = 0;
5351#ifdef DEBUG_PUSH
5352 xmlGenericError(xmlGenericErrorContext,
5353 "HPP: Parsing char data\n");
5354#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005355 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005356 }
5357 }
5358 if (cons == ctxt->nbChars) {
5359 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005360 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5361 "detected an error in element content\n",
5362 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005363 }
5364 NEXT;
5365 break;
5366 }
5367
5368 break;
5369 }
5370 case XML_PARSER_END_TAG:
5371 if (avail < 2)
5372 goto done;
5373 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005374 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005375 goto done;
5376 htmlParseEndTag(ctxt);
5377 if (ctxt->nameNr == 0) {
5378 ctxt->instate = XML_PARSER_EPILOG;
5379 } else {
5380 ctxt->instate = XML_PARSER_CONTENT;
5381 }
5382 ctxt->checkIndex = 0;
5383#ifdef DEBUG_PUSH
5384 xmlGenericError(xmlGenericErrorContext,
5385 "HPP: entering CONTENT\n");
5386#endif
5387 break;
5388 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005389 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5390 "HPP: internal error, state == CDATA\n",
5391 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005392 ctxt->instate = XML_PARSER_CONTENT;
5393 ctxt->checkIndex = 0;
5394#ifdef DEBUG_PUSH
5395 xmlGenericError(xmlGenericErrorContext,
5396 "HPP: entering CONTENT\n");
5397#endif
5398 break;
5399 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005400 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5401 "HPP: internal error, state == DTD\n",
5402 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005403 ctxt->instate = XML_PARSER_CONTENT;
5404 ctxt->checkIndex = 0;
5405#ifdef DEBUG_PUSH
5406 xmlGenericError(xmlGenericErrorContext,
5407 "HPP: entering CONTENT\n");
5408#endif
5409 break;
5410 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005411 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5412 "HPP: internal error, state == COMMENT\n",
5413 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005414 ctxt->instate = XML_PARSER_CONTENT;
5415 ctxt->checkIndex = 0;
5416#ifdef DEBUG_PUSH
5417 xmlGenericError(xmlGenericErrorContext,
5418 "HPP: entering CONTENT\n");
5419#endif
5420 break;
5421 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005422 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5423 "HPP: internal error, state == PI\n",
5424 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005425 ctxt->instate = XML_PARSER_CONTENT;
5426 ctxt->checkIndex = 0;
5427#ifdef DEBUG_PUSH
5428 xmlGenericError(xmlGenericErrorContext,
5429 "HPP: entering CONTENT\n");
5430#endif
5431 break;
5432 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005433 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5434 "HPP: internal error, state == ENTITY_DECL\n",
5435 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005436 ctxt->instate = XML_PARSER_CONTENT;
5437 ctxt->checkIndex = 0;
5438#ifdef DEBUG_PUSH
5439 xmlGenericError(xmlGenericErrorContext,
5440 "HPP: entering CONTENT\n");
5441#endif
5442 break;
5443 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005444 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5445 "HPP: internal error, state == ENTITY_VALUE\n",
5446 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005447 ctxt->instate = XML_PARSER_CONTENT;
5448 ctxt->checkIndex = 0;
5449#ifdef DEBUG_PUSH
5450 xmlGenericError(xmlGenericErrorContext,
5451 "HPP: entering DTD\n");
5452#endif
5453 break;
5454 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005455 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5456 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5457 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005458 ctxt->instate = XML_PARSER_START_TAG;
5459 ctxt->checkIndex = 0;
5460#ifdef DEBUG_PUSH
5461 xmlGenericError(xmlGenericErrorContext,
5462 "HPP: entering START_TAG\n");
5463#endif
5464 break;
5465 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005466 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5467 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5468 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005469 ctxt->instate = XML_PARSER_CONTENT;
5470 ctxt->checkIndex = 0;
5471#ifdef DEBUG_PUSH
5472 xmlGenericError(xmlGenericErrorContext,
5473 "HPP: entering CONTENT\n");
5474#endif
5475 break;
5476 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005477 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5478 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5479 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005480 ctxt->instate = XML_PARSER_CONTENT;
5481 ctxt->checkIndex = 0;
5482#ifdef DEBUG_PUSH
5483 xmlGenericError(xmlGenericErrorContext,
5484 "HPP: entering CONTENT\n");
5485#endif
5486 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005487 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005488 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5489 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5490 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005491 ctxt->instate = XML_PARSER_CONTENT;
5492 ctxt->checkIndex = 0;
5493#ifdef DEBUG_PUSH
5494 xmlGenericError(xmlGenericErrorContext,
5495 "HPP: entering CONTENT\n");
5496#endif
5497 break;
5498
Owen Taylor3473f882001-02-23 17:55:21 +00005499 }
5500 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005501done:
Owen Taylor3473f882001-02-23 17:55:21 +00005502 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005503 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005504 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005505 /*
5506 * SAX: end of the document processing.
5507 */
5508 ctxt->instate = XML_PARSER_EOF;
5509 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5510 ctxt->sax->endDocument(ctxt->userData);
5511 }
5512 }
5513 if ((ctxt->myDoc != NULL) &&
5514 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5515 (ctxt->instate == XML_PARSER_EPILOG))) {
5516 xmlDtdPtr dtd;
5517 dtd = xmlGetIntSubset(ctxt->myDoc);
5518 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02005519 ctxt->myDoc->intSubset =
5520 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005521 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5522 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5523 }
5524#ifdef DEBUG_PUSH
5525 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5526#endif
5527 return(ret);
5528}
5529
5530/**
Owen Taylor3473f882001-02-23 17:55:21 +00005531 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005532 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005533 * @chunk: an char array
5534 * @size: the size in byte of the chunk
5535 * @terminate: last chunk indicator
5536 *
5537 * Parse a Chunk of memory
5538 *
5539 * Returns zero if no error, the xmlParserErrors otherwise.
5540 */
5541int
5542htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5543 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005544 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5545 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5546 "htmlParseChunk: context error\n", NULL, NULL);
5547 return(XML_ERR_INTERNAL_ERROR);
5548 }
Owen Taylor3473f882001-02-23 17:55:21 +00005549 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5550 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5551 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5552 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005553 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02005554
5555 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00005556 if (res < 0) {
5557 ctxt->errNo = XML_PARSER_EOF;
5558 ctxt->disableSAX = 1;
5559 return (XML_PARSER_EOF);
5560 }
Owen Taylor3473f882001-02-23 17:55:21 +00005561 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5562 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005563 ctxt->input->end =
5564 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005565#ifdef DEBUG_PUSH
5566 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5567#endif
5568
Daniel Veillard14f752c2003-08-09 11:44:50 +00005569#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005570 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5571 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005572#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005573 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005574 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5575 xmlParserInputBufferPtr in = ctxt->input->buf;
5576 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5577 (in->raw != NULL)) {
5578 int nbchars;
Daniel Veillarde77db162009-08-22 11:32:38 +02005579
Daniel Veillard14f752c2003-08-09 11:44:50 +00005580 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5581 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005582 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5583 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005584 return(XML_ERR_INVALID_ENCODING);
5585 }
5586 }
5587 }
Owen Taylor3473f882001-02-23 17:55:21 +00005588 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005589 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005590 if (terminate) {
5591 if ((ctxt->instate != XML_PARSER_EOF) &&
5592 (ctxt->instate != XML_PARSER_EPILOG) &&
5593 (ctxt->instate != XML_PARSER_MISC)) {
5594 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005595 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02005596 }
Owen Taylor3473f882001-02-23 17:55:21 +00005597 if (ctxt->instate != XML_PARSER_EOF) {
5598 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5599 ctxt->sax->endDocument(ctxt->userData);
5600 }
5601 ctxt->instate = XML_PARSER_EOF;
5602 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005603 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00005604}
5605
5606/************************************************************************
5607 * *
5608 * User entry points *
5609 * *
5610 ************************************************************************/
5611
5612/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005613 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005614 * @sax: a SAX handler
5615 * @user_data: The user data returned on SAX callbacks
5616 * @chunk: a pointer to an array of chars
5617 * @size: number of chars in the array
5618 * @filename: an optional file name or URI
5619 * @enc: an optional encoding
5620 *
5621 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005622 * The value of @filename is used for fetching external entities
5623 * and error/warning reports.
5624 *
5625 * Returns the new parser context or NULL
5626 */
5627htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005628htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00005629 const char *chunk, int size, const char *filename,
5630 xmlCharEncoding enc) {
5631 htmlParserCtxtPtr ctxt;
5632 htmlParserInputPtr inputStream;
5633 xmlParserInputBufferPtr buf;
5634
Daniel Veillardd0463562001-10-13 09:15:48 +00005635 xmlInitParser();
5636
Owen Taylor3473f882001-02-23 17:55:21 +00005637 buf = xmlAllocParserInputBuffer(enc);
5638 if (buf == NULL) return(NULL);
5639
Daniel Veillardf403d292003-10-05 13:51:35 +00005640 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005641 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005642 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005643 return(NULL);
5644 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005645 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5646 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005647 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005648 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005649 xmlFree(ctxt->sax);
5650 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5651 if (ctxt->sax == NULL) {
5652 xmlFree(buf);
5653 xmlFree(ctxt);
5654 return(NULL);
5655 }
5656 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5657 if (user_data != NULL)
5658 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02005659 }
Owen Taylor3473f882001-02-23 17:55:21 +00005660 if (filename == NULL) {
5661 ctxt->directory = NULL;
5662 } else {
5663 ctxt->directory = xmlParserGetDirectory(filename);
5664 }
5665
5666 inputStream = htmlNewInputStream(ctxt);
5667 if (inputStream == NULL) {
5668 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005669 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005670 return(NULL);
5671 }
5672
5673 if (filename == NULL)
5674 inputStream->filename = NULL;
5675 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005676 inputStream->filename = (char *)
5677 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005678 inputStream->buf = buf;
5679 inputStream->base = inputStream->buf->buffer->content;
5680 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillarde77db162009-08-22 11:32:38 +02005681 inputStream->end =
Daniel Veillard5f704af2003-03-05 10:01:43 +00005682 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005683
5684 inputPush(ctxt, inputStream);
5685
5686 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02005687 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005688 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5689 int cur = ctxt->input->cur - ctxt->input->base;
5690
Daniel Veillarde77db162009-08-22 11:32:38 +02005691 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005692
5693 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5694 ctxt->input->cur = ctxt->input->base + cur;
5695 ctxt->input->end =
5696 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005697#ifdef DEBUG_PUSH
5698 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5699#endif
5700 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005701 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005702
5703 return(ctxt);
5704}
William M. Brack21e4ef22005-01-02 09:53:13 +00005705#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005706
5707/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005708 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005709 * @cur: a pointer to an array of xmlChar
5710 * @encoding: a free form C string describing the HTML document encoding, or NULL
5711 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005712 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005713 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005714 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5715 * to handle parse events. If sax is NULL, fallback to the default DOM
5716 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005717 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005718 * Returns the resulting document tree unless SAX is NULL or the document is
5719 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005720 */
5721
5722htmlDocPtr
5723htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5724 htmlDocPtr ret;
5725 htmlParserCtxtPtr ctxt;
5726
Daniel Veillardd0463562001-10-13 09:15:48 +00005727 xmlInitParser();
5728
Owen Taylor3473f882001-02-23 17:55:21 +00005729 if (cur == NULL) return(NULL);
5730
5731
5732 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5733 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02005734 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005735 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005736 ctxt->sax = sax;
5737 ctxt->userData = userData;
5738 }
5739
5740 htmlParseDocument(ctxt);
5741 ret = ctxt->myDoc;
5742 if (sax != NULL) {
5743 ctxt->sax = NULL;
5744 ctxt->userData = NULL;
5745 }
5746 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005747
Owen Taylor3473f882001-02-23 17:55:21 +00005748 return(ret);
5749}
5750
5751/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005752 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005753 * @cur: a pointer to an array of xmlChar
5754 * @encoding: a free form C string describing the HTML document encoding, or NULL
5755 *
5756 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005757 *
Owen Taylor3473f882001-02-23 17:55:21 +00005758 * Returns the resulting document tree
5759 */
5760
5761htmlDocPtr
5762htmlParseDoc(xmlChar *cur, const char *encoding) {
5763 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5764}
5765
5766
5767/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005768 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005769 * @filename: the filename
5770 * @encoding: a free form C string describing the HTML document encoding, or NULL
5771 *
Daniel Veillarde77db162009-08-22 11:32:38 +02005772 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00005773 * Automatic support for ZLIB/Compress compressed document is provided
5774 * by default if found at compile-time.
5775 *
5776 * Returns the new parser context or NULL
5777 */
5778htmlParserCtxtPtr
5779htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5780{
5781 htmlParserCtxtPtr ctxt;
5782 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005783 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005784 /* htmlCharEncoding enc; */
5785 xmlChar *content, *content_line = (xmlChar *) "charset=";
5786
Daniel Veillarda03e3652004-11-02 18:45:30 +00005787 if (filename == NULL)
5788 return(NULL);
5789
Daniel Veillardf403d292003-10-05 13:51:35 +00005790 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005791 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005792 return(NULL);
5793 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005794 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5795 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005796#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005797 if (xmlDefaultSAXHandler.error != NULL) {
5798 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5799 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005800#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005801 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005802 return(NULL);
5803 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005804
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005805 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5806 xmlFree(canonicFilename);
5807 if (inputStream == NULL) {
5808 xmlFreeParserCtxt(ctxt);
5809 return(NULL);
5810 }
Owen Taylor3473f882001-02-23 17:55:21 +00005811
5812 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005813
Owen Taylor3473f882001-02-23 17:55:21 +00005814 /* set encoding */
5815 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005816 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Daniel Veillarde77db162009-08-22 11:32:38 +02005817 if (content) {
Owen Taylor3473f882001-02-23 17:55:21 +00005818 strcpy ((char *)content, (char *)content_line);
5819 strcat ((char *)content, (char *)encoding);
5820 htmlCheckEncoding (ctxt, content);
5821 xmlFree (content);
5822 }
5823 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005824
Owen Taylor3473f882001-02-23 17:55:21 +00005825 return(ctxt);
5826}
5827
5828/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005829 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005830 * @filename: the filename
5831 * @encoding: a free form C string describing the HTML document encoding, or NULL
5832 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005833 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005834 *
5835 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5836 * compressed document is provided by default if found at compile-time.
5837 * It use the given SAX function block to handle the parsing callback.
5838 * If sax is NULL, fallback to the default DOM tree building routines.
5839 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005840 * Returns the resulting document tree unless SAX is NULL or the document is
5841 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005842 */
5843
5844htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005845htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00005846 void *userData) {
5847 htmlDocPtr ret;
5848 htmlParserCtxtPtr ctxt;
5849 htmlSAXHandlerPtr oldsax = NULL;
5850
Daniel Veillardd0463562001-10-13 09:15:48 +00005851 xmlInitParser();
5852
Owen Taylor3473f882001-02-23 17:55:21 +00005853 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5854 if (ctxt == NULL) return(NULL);
5855 if (sax != NULL) {
5856 oldsax = ctxt->sax;
5857 ctxt->sax = sax;
5858 ctxt->userData = userData;
5859 }
5860
5861 htmlParseDocument(ctxt);
5862
5863 ret = ctxt->myDoc;
5864 if (sax != NULL) {
5865 ctxt->sax = oldsax;
5866 ctxt->userData = NULL;
5867 }
5868 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005869
Owen Taylor3473f882001-02-23 17:55:21 +00005870 return(ret);
5871}
5872
5873/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005874 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005875 * @filename: the filename
5876 * @encoding: a free form C string describing the HTML document encoding, or NULL
5877 *
5878 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5879 * compressed document is provided by default if found at compile-time.
5880 *
5881 * Returns the resulting document tree
5882 */
5883
5884htmlDocPtr
5885htmlParseFile(const char *filename, const char *encoding) {
5886 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5887}
5888
5889/**
5890 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02005891 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00005892 *
5893 * Set and return the previous value for handling HTML omitted tags.
5894 *
5895 * Returns the last value for 0 for no handling, 1 for auto insertion.
5896 */
5897
5898int
5899htmlHandleOmittedElem(int val) {
5900 int old = htmlOmittedDefaultValue;
5901
5902 htmlOmittedDefaultValue = val;
5903 return(old);
5904}
5905
Daniel Veillard930dfb62003-02-05 10:17:38 +00005906/**
5907 * htmlElementAllowedHere:
5908 * @parent: HTML parent element
5909 * @elt: HTML element
5910 *
5911 * Checks whether an HTML element may be a direct child of a parent element.
5912 * Note - doesn't check for deprecated elements
5913 *
5914 * Returns 1 if allowed; 0 otherwise.
5915 */
5916int
5917htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5918 const char** p ;
5919
5920 if ( ! elt || ! parent || ! parent->subelts )
5921 return 0 ;
5922
5923 for ( p = parent->subelts; *p; ++p )
5924 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5925 return 1 ;
5926
5927 return 0 ;
5928}
5929/**
5930 * htmlElementStatusHere:
5931 * @parent: HTML parent element
5932 * @elt: HTML element
5933 *
5934 * Checks whether an HTML element may be a direct child of a parent element.
5935 * and if so whether it is valid or deprecated.
5936 *
5937 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5938 */
5939htmlStatus
5940htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5941 if ( ! parent || ! elt )
5942 return HTML_INVALID ;
5943 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5944 return HTML_INVALID ;
5945
5946 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5947}
5948/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005949 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005950 * @elt: HTML element
5951 * @attr: HTML attribute
5952 * @legacy: whether to allow deprecated attributes
5953 *
5954 * Checks whether an attribute is valid for an element
5955 * Has full knowledge of Required and Deprecated attributes
5956 *
5957 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5958 */
5959htmlStatus
5960htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5961 const char** p ;
5962
5963 if ( !elt || ! attr )
5964 return HTML_INVALID ;
5965
5966 if ( elt->attrs_req )
5967 for ( p = elt->attrs_req; *p; ++p)
5968 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5969 return HTML_REQUIRED ;
5970
5971 if ( elt->attrs_opt )
5972 for ( p = elt->attrs_opt; *p; ++p)
5973 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5974 return HTML_VALID ;
5975
5976 if ( legacy && elt->attrs_depr )
5977 for ( p = elt->attrs_depr; *p; ++p)
5978 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5979 return HTML_DEPRECATED ;
5980
5981 return HTML_INVALID ;
5982}
5983/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005984 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005985 * @node: an htmlNodePtr in a tree
5986 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005987 * for Element nodes)
5988 *
5989 * Checks whether the tree node is valid. Experimental (the author
5990 * only uses the HTML enhancements in a SAX parser)
5991 *
5992 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5993 * legacy allowed) or htmlElementStatusHere (otherwise).
5994 * for Attribute nodes, a return from htmlAttrAllowed
5995 * for other nodes, HTML_NA (no checks performed)
5996 */
5997htmlStatus
5998htmlNodeStatus(const htmlNodePtr node, int legacy) {
5999 if ( ! node )
6000 return HTML_INVALID ;
6001
6002 switch ( node->type ) {
6003 case XML_ELEMENT_NODE:
6004 return legacy
6005 ? ( htmlElementAllowedHere (
6006 htmlTagLookup(node->parent->name) , node->name
6007 ) ? HTML_VALID : HTML_INVALID )
6008 : htmlElementStatusHere(
6009 htmlTagLookup(node->parent->name) ,
6010 htmlTagLookup(node->name) )
6011 ;
6012 case XML_ATTRIBUTE_NODE:
6013 return htmlAttrAllowed(
6014 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6015 default: return HTML_NA ;
6016 }
6017}
Daniel Veillard9475a352003-09-26 12:47:50 +00006018/************************************************************************
6019 * *
6020 * New set (2.6.0) of simpler and more flexible APIs *
6021 * *
6022 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is expanded; a NULL @str is left alone. The expansion is an if
 * statement that already ends with a semicolon.
 */
#define DICT_FREE(str)						\
	if (((str) != NULL) && ((dict == NULL) ||		\
	    (!xmlDictOwns(dict, (const xmlChar *)(str)))))	\
	    xmlFree((char *)(str));
6034
6035/**
6036 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00006037 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00006038 *
6039 * Reset a parser context
6040 */
6041void
6042htmlCtxtReset(htmlParserCtxtPtr ctxt)
6043{
6044 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00006045 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02006046
Daniel Veillarda03e3652004-11-02 18:45:30 +00006047 if (ctxt == NULL)
6048 return;
6049
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006050 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00006051 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00006052
6053 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6054 xmlFreeInputStream(input);
6055 }
6056 ctxt->inputNr = 0;
6057 ctxt->input = NULL;
6058
6059 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00006060 if (ctxt->spaceTab != NULL) {
6061 ctxt->spaceTab[0] = -1;
6062 ctxt->space = &ctxt->spaceTab[0];
6063 } else {
6064 ctxt->space = NULL;
6065 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006066
6067
6068 ctxt->nodeNr = 0;
6069 ctxt->node = NULL;
6070
6071 ctxt->nameNr = 0;
6072 ctxt->name = NULL;
6073
6074 DICT_FREE(ctxt->version);
6075 ctxt->version = NULL;
6076 DICT_FREE(ctxt->encoding);
6077 ctxt->encoding = NULL;
6078 DICT_FREE(ctxt->directory);
6079 ctxt->directory = NULL;
6080 DICT_FREE(ctxt->extSubURI);
6081 ctxt->extSubURI = NULL;
6082 DICT_FREE(ctxt->extSubSystem);
6083 ctxt->extSubSystem = NULL;
6084 if (ctxt->myDoc != NULL)
6085 xmlFreeDoc(ctxt->myDoc);
6086 ctxt->myDoc = NULL;
6087
6088 ctxt->standalone = -1;
6089 ctxt->hasExternalSubset = 0;
6090 ctxt->hasPErefs = 0;
6091 ctxt->html = 1;
6092 ctxt->external = 0;
6093 ctxt->instate = XML_PARSER_START;
6094 ctxt->token = 0;
6095
6096 ctxt->wellFormed = 1;
6097 ctxt->nsWellFormed = 1;
6098 ctxt->valid = 1;
6099 ctxt->vctxt.userData = ctxt;
6100 ctxt->vctxt.error = xmlParserValidityError;
6101 ctxt->vctxt.warning = xmlParserValidityWarning;
6102 ctxt->record_info = 0;
6103 ctxt->nbChars = 0;
6104 ctxt->checkIndex = 0;
6105 ctxt->inSubset = 0;
6106 ctxt->errNo = XML_ERR_OK;
6107 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006108 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006109 ctxt->catalogs = NULL;
6110 xmlInitNodeInfoSeq(&ctxt->node_seq);
6111
6112 if (ctxt->attsDefault != NULL) {
6113 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6114 ctxt->attsDefault = NULL;
6115 }
6116 if (ctxt->attsSpecial != NULL) {
6117 xmlHashFree(ctxt->attsSpecial, NULL);
6118 ctxt->attsSpecial = NULL;
6119 }
6120}
6121
6122/**
6123 * htmlCtxtUseOptions:
6124 * @ctxt: an HTML parser context
6125 * @options: a combination of htmlParserOption(s)
6126 *
6127 * Applies the options to the parser context
6128 *
6129 * Returns 0 in case of success, the set of unknown or unimplemented options
6130 * in case of error.
6131 */
6132int
6133htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6134{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006135 if (ctxt == NULL)
6136 return(-1);
6137
Daniel Veillard9475a352003-09-26 12:47:50 +00006138 if (options & HTML_PARSE_NOWARNING) {
6139 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006140 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006141 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006142 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006143 }
6144 if (options & HTML_PARSE_NOERROR) {
6145 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006146 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006147 ctxt->sax->fatalError = NULL;
6148 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006149 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006150 }
6151 if (options & HTML_PARSE_PEDANTIC) {
6152 ctxt->pedantic = 1;
6153 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006154 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006155 } else
6156 ctxt->pedantic = 0;
6157 if (options & XML_PARSE_NOBLANKS) {
6158 ctxt->keepBlanks = 0;
6159 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6160 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006161 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006162 } else
6163 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006164 if (options & HTML_PARSE_RECOVER) {
6165 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006166 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006167 } else
6168 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006169 if (options & HTML_PARSE_COMPACT) {
6170 ctxt->options |= HTML_PARSE_COMPACT;
6171 options -= HTML_PARSE_COMPACT;
6172 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006173 if (options & XML_PARSE_HUGE) {
6174 ctxt->options |= XML_PARSE_HUGE;
6175 options -= XML_PARSE_HUGE;
6176 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006177 ctxt->dictNames = 0;
6178 return (options);
6179}
6180
6181/**
6182 * htmlDoRead:
6183 * @ctxt: an HTML parser context
6184 * @URL: the base URL to use for the document
6185 * @encoding: the document encoding, or NULL
6186 * @options: a combination of htmlParserOption(s)
6187 * @reuse: keep the context for reuse
6188 *
6189 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006190 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006191 * Returns the resulting document tree or NULL
6192 */
6193static htmlDocPtr
6194htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6195 int options, int reuse)
6196{
6197 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006198
Daniel Veillard9475a352003-09-26 12:47:50 +00006199 htmlCtxtUseOptions(ctxt, options);
6200 ctxt->html = 1;
6201 if (encoding != NULL) {
6202 xmlCharEncodingHandlerPtr hdlr;
6203
6204 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006205 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006206 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006207 if (ctxt->input->encoding != NULL)
6208 xmlFree((xmlChar *) ctxt->input->encoding);
6209 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6210 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006211 }
6212 if ((URL != NULL) && (ctxt->input != NULL) &&
6213 (ctxt->input->filename == NULL))
6214 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6215 htmlParseDocument(ctxt);
6216 ret = ctxt->myDoc;
6217 ctxt->myDoc = NULL;
6218 if (!reuse) {
6219 if ((ctxt->dictNames) &&
6220 (ret != NULL) &&
6221 (ret->dict == ctxt->dict))
6222 ctxt->dict = NULL;
6223 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006224 }
6225 return (ret);
6226}
6227
6228/**
6229 * htmlReadDoc:
6230 * @cur: a pointer to a zero terminated string
6231 * @URL: the base URL to use for the document
6232 * @encoding: the document encoding, or NULL
6233 * @options: a combination of htmlParserOption(s)
6234 *
6235 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006236 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006237 * Returns the resulting document tree
6238 */
6239htmlDocPtr
6240htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6241{
6242 htmlParserCtxtPtr ctxt;
6243
6244 if (cur == NULL)
6245 return (NULL);
6246
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006247 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006248 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006249 if (ctxt == NULL)
6250 return (NULL);
6251 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6252}
6253
6254/**
6255 * htmlReadFile:
6256 * @filename: a file or URL
6257 * @encoding: the document encoding, or NULL
6258 * @options: a combination of htmlParserOption(s)
6259 *
6260 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006261 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006262 * Returns the resulting document tree
6263 */
6264htmlDocPtr
6265htmlReadFile(const char *filename, const char *encoding, int options)
6266{
6267 htmlParserCtxtPtr ctxt;
6268
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006269 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006270 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6271 if (ctxt == NULL)
6272 return (NULL);
6273 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6274}
6275
6276/**
6277 * htmlReadMemory:
6278 * @buffer: a pointer to a char array
6279 * @size: the size of the array
6280 * @URL: the base URL to use for the document
6281 * @encoding: the document encoding, or NULL
6282 * @options: a combination of htmlParserOption(s)
6283 *
6284 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006285 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006286 * Returns the resulting document tree
6287 */
6288htmlDocPtr
6289htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6290{
6291 htmlParserCtxtPtr ctxt;
6292
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006293 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006294 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6295 if (ctxt == NULL)
6296 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006297 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006298 if (ctxt->sax != NULL)
6299 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006300 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6301}
6302
6303/**
6304 * htmlReadFd:
6305 * @fd: an open file descriptor
6306 * @URL: the base URL to use for the document
6307 * @encoding: the document encoding, or NULL
6308 * @options: a combination of htmlParserOption(s)
6309 *
6310 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006311 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006312 * Returns the resulting document tree
6313 */
6314htmlDocPtr
6315htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6316{
6317 htmlParserCtxtPtr ctxt;
6318 xmlParserInputBufferPtr input;
6319 xmlParserInputPtr stream;
6320
6321 if (fd < 0)
6322 return (NULL);
6323
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006324 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006325 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6326 if (input == NULL)
6327 return (NULL);
6328 ctxt = xmlNewParserCtxt();
6329 if (ctxt == NULL) {
6330 xmlFreeParserInputBuffer(input);
6331 return (NULL);
6332 }
6333 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6334 if (stream == NULL) {
6335 xmlFreeParserInputBuffer(input);
6336 xmlFreeParserCtxt(ctxt);
6337 return (NULL);
6338 }
6339 inputPush(ctxt, stream);
6340 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6341}
6342
6343/**
6344 * htmlReadIO:
6345 * @ioread: an I/O read function
6346 * @ioclose: an I/O close function
6347 * @ioctx: an I/O handler
6348 * @URL: the base URL to use for the document
6349 * @encoding: the document encoding, or NULL
6350 * @options: a combination of htmlParserOption(s)
6351 *
6352 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006353 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006354 * Returns the resulting document tree
6355 */
6356htmlDocPtr
6357htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6358 void *ioctx, const char *URL, const char *encoding, int options)
6359{
6360 htmlParserCtxtPtr ctxt;
6361 xmlParserInputBufferPtr input;
6362 xmlParserInputPtr stream;
6363
6364 if (ioread == NULL)
6365 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006366 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006367
6368 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6369 XML_CHAR_ENCODING_NONE);
6370 if (input == NULL)
6371 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006372 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006373 if (ctxt == NULL) {
6374 xmlFreeParserInputBuffer(input);
6375 return (NULL);
6376 }
6377 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6378 if (stream == NULL) {
6379 xmlFreeParserInputBuffer(input);
6380 xmlFreeParserCtxt(ctxt);
6381 return (NULL);
6382 }
6383 inputPush(ctxt, stream);
6384 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6385}
6386
6387/**
6388 * htmlCtxtReadDoc:
6389 * @ctxt: an HTML parser context
6390 * @cur: a pointer to a zero terminated string
6391 * @URL: the base URL to use for the document
6392 * @encoding: the document encoding, or NULL
6393 * @options: a combination of htmlParserOption(s)
6394 *
6395 * parse an XML in-memory document and build a tree.
6396 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006397 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006398 * Returns the resulting document tree
6399 */
6400htmlDocPtr
6401htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6402 const char *URL, const char *encoding, int options)
6403{
6404 xmlParserInputPtr stream;
6405
6406 if (cur == NULL)
6407 return (NULL);
6408 if (ctxt == NULL)
6409 return (NULL);
6410
6411 htmlCtxtReset(ctxt);
6412
6413 stream = xmlNewStringInputStream(ctxt, cur);
6414 if (stream == NULL) {
6415 return (NULL);
6416 }
6417 inputPush(ctxt, stream);
6418 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6419}
6420
6421/**
6422 * htmlCtxtReadFile:
6423 * @ctxt: an HTML parser context
6424 * @filename: a file or URL
6425 * @encoding: the document encoding, or NULL
6426 * @options: a combination of htmlParserOption(s)
6427 *
6428 * parse an XML file from the filesystem or the network.
6429 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006430 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006431 * Returns the resulting document tree
6432 */
6433htmlDocPtr
6434htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6435 const char *encoding, int options)
6436{
6437 xmlParserInputPtr stream;
6438
6439 if (filename == NULL)
6440 return (NULL);
6441 if (ctxt == NULL)
6442 return (NULL);
6443
6444 htmlCtxtReset(ctxt);
6445
Daniel Veillard29614c72004-11-26 10:47:26 +00006446 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006447 if (stream == NULL) {
6448 return (NULL);
6449 }
6450 inputPush(ctxt, stream);
6451 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6452}
6453
6454/**
6455 * htmlCtxtReadMemory:
6456 * @ctxt: an HTML parser context
6457 * @buffer: a pointer to a char array
6458 * @size: the size of the array
6459 * @URL: the base URL to use for the document
6460 * @encoding: the document encoding, or NULL
6461 * @options: a combination of htmlParserOption(s)
6462 *
6463 * parse an XML in-memory document and build a tree.
6464 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006465 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006466 * Returns the resulting document tree
6467 */
6468htmlDocPtr
6469htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6470 const char *URL, const char *encoding, int options)
6471{
6472 xmlParserInputBufferPtr input;
6473 xmlParserInputPtr stream;
6474
6475 if (ctxt == NULL)
6476 return (NULL);
6477 if (buffer == NULL)
6478 return (NULL);
6479
6480 htmlCtxtReset(ctxt);
6481
6482 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6483 if (input == NULL) {
6484 return(NULL);
6485 }
6486
6487 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6488 if (stream == NULL) {
6489 xmlFreeParserInputBuffer(input);
6490 return(NULL);
6491 }
6492
6493 inputPush(ctxt, stream);
6494 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6495}
6496
6497/**
6498 * htmlCtxtReadFd:
6499 * @ctxt: an HTML parser context
6500 * @fd: an open file descriptor
6501 * @URL: the base URL to use for the document
6502 * @encoding: the document encoding, or NULL
6503 * @options: a combination of htmlParserOption(s)
6504 *
6505 * parse an XML from a file descriptor and build a tree.
6506 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006507 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006508 * Returns the resulting document tree
6509 */
6510htmlDocPtr
6511htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6512 const char *URL, const char *encoding, int options)
6513{
6514 xmlParserInputBufferPtr input;
6515 xmlParserInputPtr stream;
6516
6517 if (fd < 0)
6518 return (NULL);
6519 if (ctxt == NULL)
6520 return (NULL);
6521
6522 htmlCtxtReset(ctxt);
6523
6524
6525 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6526 if (input == NULL)
6527 return (NULL);
6528 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6529 if (stream == NULL) {
6530 xmlFreeParserInputBuffer(input);
6531 return (NULL);
6532 }
6533 inputPush(ctxt, stream);
6534 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6535}
6536
6537/**
6538 * htmlCtxtReadIO:
6539 * @ctxt: an HTML parser context
6540 * @ioread: an I/O read function
6541 * @ioclose: an I/O close function
6542 * @ioctx: an I/O handler
6543 * @URL: the base URL to use for the document
6544 * @encoding: the document encoding, or NULL
6545 * @options: a combination of htmlParserOption(s)
6546 *
6547 * parse an HTML document from I/O functions and source and build a tree.
6548 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006549 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006550 * Returns the resulting document tree
6551 */
6552htmlDocPtr
6553htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6554 xmlInputCloseCallback ioclose, void *ioctx,
6555 const char *URL,
6556 const char *encoding, int options)
6557{
6558 xmlParserInputBufferPtr input;
6559 xmlParserInputPtr stream;
6560
6561 if (ioread == NULL)
6562 return (NULL);
6563 if (ctxt == NULL)
6564 return (NULL);
6565
6566 htmlCtxtReset(ctxt);
6567
6568 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6569 XML_CHAR_ENCODING_NONE);
6570 if (input == NULL)
6571 return (NULL);
6572 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6573 if (stream == NULL) {
6574 xmlFreeParserInputBuffer(input);
6575 return (NULL);
6576 }
6577 inputPush(ctxt, stream);
6578 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6579}
6580
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006581#define bottom_HTMLparser
6582#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006583#endif /* LIBXML_HTML_ENABLED */