blob: f5957c587f6353ae369fd2fcb591fc4873cba58f [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020062 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000063 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200150 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167 ctxt->html = 3;
168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 if (ctxt->nameNr >= ctxt->nameMax) {
171 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000172 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000173 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 ctxt->nameMax *
175 sizeof(ctxt->nameTab[0]));
176 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000177 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000178 return (0);
179 }
180 }
181 ctxt->nameTab[ctxt->nameNr] = value;
182 ctxt->name = value;
183 return (ctxt->nameNr++);
184}
185/**
186 * htmlnamePop:
187 * @ctxt: an HTML parser context
188 *
189 * Pops the top element name from the name stack
190 *
191 * Returns the name just removed
192 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000193static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194htmlnamePop(htmlParserCtxtPtr ctxt)
195{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000197
Daniel Veillard1c732d22002-11-30 11:22:59 +0000198 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000199 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000200 ctxt->nameNr--;
201 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 if (ctxt->nameNr > 0)
204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205 else
206 ctxt->name = NULL;
207 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000208 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000209 return (ret);
210}
Owen Taylor3473f882001-02-23 17:55:21 +0000211
212/*
213 * Macros for accessing the content. Those should be used only by the parser,
214 * and not exported.
215 *
216 * Dirty macros, i.e. one need to make assumption on the context to use them
217 *
218 * CUR_PTR return the current pointer to the xmlChar to be parsed.
219 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
220 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
221 * in UNICODE mode. This should be used internally by the parser
222 * only to compare to ASCII values otherwise it would break when
223 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substring.
226 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
227 * it should be used only to compare on ASCII based substring.
228 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
Daniel Veillard77a90a72003-03-22 00:04:05 +0000229 * strings without newlines within the parser.
Owen Taylor3473f882001-02-23 17:55:21 +0000230 *
231 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
232 *
233 * CURRENT Returns the current char value, with the full decoding of
234 * UTF-8 if we are using this mode. It returns an int.
235 * NEXT Skip to the next character, this does the proper decoding
236 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard77a90a72003-03-22 00:04:05 +0000237 * NEXTL(l) Skip the current unicode character of l xmlChars long.
Owen Taylor3473f882001-02-23 17:55:21 +0000238 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
239 */
240
/*
 * All the macros below expect a variable named ctxt, pointing to the
 * parser context, to be in scope at the point of use.
 *
 * Statement-like macros are wrapped in do { } while (0) so they behave
 * as a single statement (e.g. before an else), and macro arguments are
 * parenthesized.
 */

/* current byte converted to upper case */
#define UPPER (toupper(*ctxt->input->cur))

/* advance by val bytes, updating the char and column counters */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), \
                   ctxt->input->col += (val))

/* the byte found val positions after the current one */
#define NXT(val) (ctxt->input->cur[(val)])

/* same as NXT(val), converted to upper case */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* the current input pointer */
#define CUR_PTR (ctxt->input->cur)

#define SHRINK do { \
        if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
            (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
            xmlParserInputShrink(ctxt->input); \
    } while (0)

/* only acts when ctxt->progressive is 0 */
#define GROW do { \
        if ((ctxt->progressive == 0) && \
            (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \
    } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* -1 while ctxt->token is set, the current byte otherwise */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/* skip the current character, l bytes long, keeping line/col up to date */
#define NEXTL(l) do { \
        if (*(ctxt->input->cur) == '\n') { \
            ctxt->input->line++; ctxt->input->col = 1; \
        } else ctxt->input->col++; \
        ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++; \
    } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/* append char v, l bytes long once encoded, to buffer b at index i */
#define COPY_BUF(l,b,i,v) do { \
        if ((l) == 1) (b)[(i)++] = (xmlChar) (v); \
        else (i) += xmlCopyChar((l), &(b)[(i)], (v)); \
    } while (0)
293
294/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200295 * htmlFindEncoding:
296 * @the HTML parser context
297 *
298 * Ty to find and encoding in the current data available in the input
299 * buffer this is needed to try to switch to the proper encoding when
300 * one face a character error.
301 * That's an heuristic, since it's operating outside of parsing it could
302 * try to use a meta which had been commented out, that's the reason it
303 * should only be used in case of error, not as a default.
304 *
305 * Returns an encoding string or NULL if not found, the string need to
306 * be freed
307 */
308static xmlChar *
309htmlFindEncoding(xmlParserCtxtPtr ctxt) {
310 const xmlChar *start, *cur, *end;
311
312 if ((ctxt == NULL) || (ctxt->input == NULL) ||
313 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
314 (ctxt->input->buf->encoder != NULL))
315 return(NULL);
316 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
317 return(NULL);
318
319 start = ctxt->input->cur;
320 end = ctxt->input->end;
321 /* we also expect the input buffer to be zero terminated */
322 if (*end != 0)
323 return(NULL);
324
325 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
326 if (cur == NULL)
327 return(NULL);
328 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
329 if (cur == NULL)
330 return(NULL);
331 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
332 if (cur == NULL)
333 return(NULL);
334 cur += 8;
335 start = cur;
336 while (((*cur >= 'A') && (*cur <= 'Z')) ||
337 ((*cur >= 'a') && (*cur <= 'z')) ||
338 ((*cur >= '0') && (*cur <= '9')) ||
339 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
340 cur++;
341 if (cur == start)
342 return(NULL);
343 return(xmlStrndup(start, cur - start));
344}
345
346/**
Owen Taylor3473f882001-02-23 17:55:21 +0000347 * htmlCurrentChar:
348 * @ctxt: the HTML parser context
349 * @len: pointer to the length of the char read
350 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000351 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000352 * bytes in the input buffer. Implement the end of line normalization:
353 * 2.11 End-of-Line Handling
354 * If the encoding is unspecified, in the case we find an ISO-Latin-1
355 * char, then the encoding converter is plugged in automatically.
356 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000357 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000358 */
359
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000360static int
Owen Taylor3473f882001-02-23 17:55:21 +0000361htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
362 if (ctxt->instate == XML_PARSER_EOF)
363 return(0);
364
365 if (ctxt->token != 0) {
366 *len = 0;
367 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200368 }
Owen Taylor3473f882001-02-23 17:55:21 +0000369 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
370 /*
371 * We are supposed to handle UTF8, check it's valid
372 * From rfc2044: encoding of the Unicode values on UTF-8:
373 *
374 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
375 * 0000 0000-0000 007F 0xxxxxxx
376 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200377 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000378 *
379 * Check for the 0x110000 limit too
380 */
381 const unsigned char *cur = ctxt->input->cur;
382 unsigned char c;
383 unsigned int val;
384
385 c = *cur;
386 if (c & 0x80) {
387 if (cur[1] == 0)
388 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
389 if ((cur[1] & 0xc0) != 0x80)
390 goto encoding_error;
391 if ((c & 0xe0) == 0xe0) {
392
393 if (cur[2] == 0)
394 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
395 if ((cur[2] & 0xc0) != 0x80)
396 goto encoding_error;
397 if ((c & 0xf0) == 0xf0) {
398 if (cur[3] == 0)
399 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
400 if (((c & 0xf8) != 0xf0) ||
401 ((cur[3] & 0xc0) != 0x80))
402 goto encoding_error;
403 /* 4-byte code */
404 *len = 4;
405 val = (cur[0] & 0x7) << 18;
406 val |= (cur[1] & 0x3f) << 12;
407 val |= (cur[2] & 0x3f) << 6;
408 val |= cur[3] & 0x3f;
409 } else {
410 /* 3-byte code */
411 *len = 3;
412 val = (cur[0] & 0xf) << 12;
413 val |= (cur[1] & 0x3f) << 6;
414 val |= cur[2] & 0x3f;
415 }
416 } else {
417 /* 2-byte code */
418 *len = 2;
419 val = (cur[0] & 0x1f) << 6;
420 val |= cur[1] & 0x3f;
421 }
422 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000423 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
424 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200425 }
Owen Taylor3473f882001-02-23 17:55:21 +0000426 return(val);
427 } else {
428 /* 1-byte code */
429 *len = 1;
430 return((int) *ctxt->input->cur);
431 }
432 }
433 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000434 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000435 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000436 * XML constructs only use < 128 chars
437 */
438 *len = 1;
439 if ((int) *ctxt->input->cur < 0x80)
440 return((int) *ctxt->input->cur);
441
442 /*
443 * Humm this is bad, do an automatic flow conversion
444 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200445 {
446 xmlChar * guess;
447 xmlCharEncodingHandlerPtr handler;
448
449 guess = htmlFindEncoding(ctxt);
450 if (guess == NULL) {
451 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
452 } else {
453 if (ctxt->input->encoding != NULL)
454 xmlFree((xmlChar *) ctxt->input->encoding);
455 ctxt->input->encoding = guess;
456 handler = xmlFindCharEncodingHandler((const char *) guess);
457 if (handler != NULL) {
458 xmlSwitchToEncoding(ctxt, handler);
459 } else {
460 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
461 "Unsupported encoding %s", guess, NULL);
462 }
463 }
464 ctxt->charset = XML_CHAR_ENCODING_UTF8;
465 }
466
Owen Taylor3473f882001-02-23 17:55:21 +0000467 return(xmlCurrentChar(ctxt, len));
468
469encoding_error:
470 /*
471 * If we detect an UTF8 error that probably mean that the
472 * input encoding didn't get properly advertized in the
473 * declaration header. Report the error and switch the encoding
474 * to ISO-Latin-1 (if you don't like this policy, just declare the
475 * encoding !)
476 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000477 {
478 char buffer[150];
479
Daniel Veillard861101d2007-06-12 08:38:57 +0000480 if (ctxt->input->end - ctxt->input->cur >= 4) {
481 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
482 ctxt->input->cur[0], ctxt->input->cur[1],
483 ctxt->input->cur[2], ctxt->input->cur[3]);
484 } else {
485 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
486 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000487 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
488 "Input is not proper UTF-8, indicate encoding !\n",
489 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000490 }
491
Daniel Veillarde77db162009-08-22 11:32:38 +0200492 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000493 *len = 1;
494 return((int) *ctxt->input->cur);
495}
496
497/**
Owen Taylor3473f882001-02-23 17:55:21 +0000498 * htmlSkipBlankChars:
499 * @ctxt: the HTML parser context
500 *
501 * skip all blanks character found at that point in the input streams.
502 *
503 * Returns the number of space chars skipped
504 */
505
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000506static int
Owen Taylor3473f882001-02-23 17:55:21 +0000507htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
508 int res = 0;
509
William M. Brack76e95df2003-10-18 16:20:14 +0000510 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000511 if ((*ctxt->input->cur == 0) &&
512 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
513 xmlPopInput(ctxt);
514 } else {
515 if (*(ctxt->input->cur) == '\n') {
516 ctxt->input->line++; ctxt->input->col = 1;
517 } else ctxt->input->col++;
518 ctxt->input->cur++;
519 ctxt->nbChars++;
520 if (*ctxt->input->cur == 0)
521 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
522 }
523 res++;
524 }
525 return(res);
526}
527
528
529
530/************************************************************************
531 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200532 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000533 * *
534 ************************************************************************/
535
536/*
 * Start Tag: 1 means the start tag can be omitted
 * End Tag:   1 means the end tag can be omitted
539 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000540 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000541 * Depr: this element is deprecated
542 * DTD: 1 means that this element is valid only in the Loose DTD
543 * 2 means that this element is valid only in the Frameset DTD
544 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000545 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000546 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000547 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000548
549/* Definitions and a couple of vars for HTML Elements */
550
/*
 * Lists of element names used to build the content tables. Each NB_xxx
 * macro is the number of names in the matching list. Composite lists
 * must separate their parts with commas, otherwise the adjacent string
 * literals get concatenated into a single bogus name.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
/* PCDATA expands to nothing, so it contributes no name to INLINE */
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL
574
575
Daniel Veillard065abe82006-07-03 08:55:04 +0000576static const char* const html_flow[] = { FLOW, NULL } ;
577static const char* const html_inline[] = { INLINE, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000578
579/* placeholders: elts with content but no subelements */
Daniel Veillard065abe82006-07-03 08:55:04 +0000580static const char* const html_pcdata[] = { NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000581#define html_cdata html_pcdata
582
583
584/* ... and for HTML Attributes */
585
/*
 * Lists of attribute names shared by many elements. Each NB_xxx macro is
 * the number of names in the matching list.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
Daniel Veillard930dfb62003-02-05 10:17:38 +0000598
Daniel Veillard065abe82006-07-03 08:55:04 +0000599static const char* const html_attrs[] = { ATTRS, NULL } ;
600static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
601static const char* const core_attrs[] = { COREATTRS, NULL } ;
602static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000603
604
605/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000606static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000607 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
608 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000609static const char* const target_attr[] = { "target", NULL } ;
610static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
611static const char* const alt_attr[] = { "alt", NULL } ;
612static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
613static const char* const href_attrs[] = { "href", NULL } ;
614static const char* const clear_attrs[] = { "clear", NULL } ;
615static const char* const inline_p[] = { INLINE, "p", NULL } ;
616
617static const char* const flow_param[] = { FLOW, "param", NULL } ;
618static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000619 "archive", "alt", "name", "height", "width", "align",
620 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000621static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000622 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000623static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000624 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000625static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
626static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
627static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
628static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000629 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000630static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000631 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
632
633
Daniel Veillard065abe82006-07-03 08:55:04 +0000634static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
635static const char* const col_elt[] = { "col", NULL } ;
636static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
637static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
638static const char* const dl_contents[] = { "dt", "dd", NULL } ;
639static const char* const compact_attr[] = { "compact", NULL } ;
640static const char* const label_attr[] = { "label", NULL } ;
641static const char* const fieldset_contents[] = { FLOW, "legend" } ;
642static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
643static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
644static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
645static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
646static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
647static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
648static const char* const head_attrs[] = { I18N, "profile", NULL } ;
649static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
650static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
651static const char* const version_attr[] = { "version", NULL } ;
652static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
653static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
654static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000655static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000656static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
657static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
658static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
659static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
660static const char* const align_attr[] = { "align", NULL } ;
661static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
662static const char* const map_contents[] = { BLOCK, "area", NULL } ;
663static const char* const name_attr[] = { "name", NULL } ;
664static const char* const action_attr[] = { "action", NULL } ;
665static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
666static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
667static const char* const content_attr[] = { "content", NULL } ;
668static const char* const type_attr[] = { "type", NULL } ;
669static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
670static const char* const object_contents[] = { FLOW, "param", NULL } ;
671static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
672static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
673static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
674static const char* const option_elt[] = { "option", NULL } ;
675static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
676static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
677static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
678static const char* const width_attr[] = { "width", NULL } ;
679static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
680static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
681static const char* const language_attr[] = { "language", NULL } ;
682static const char* const select_content[] = { "optgroup", "option", NULL } ;
683static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
684static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200685static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000686static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
687static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
688static const char* const tr_elt[] = { "tr", NULL } ;
689static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
690static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
691static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
692static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
693static const char* const tr_contents[] = { "th", "td", NULL } ;
694static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
695static const char* const li_elt[] = { "li", NULL } ;
696static const char* const ul_depr[] = { "type", "compact", NULL} ;
697static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000698
699#define DECL (const char**)
700
Daniel Veillard22090732001-07-16 00:06:07 +0000701static const htmlElemDesc
702html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000703{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
704 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
705},
706{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
707 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
708},
709{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
710 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
711},
712{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
713 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
714},
715{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
716 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
717},
718{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
719 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
720},
721{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
722 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
723},
724{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
725 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
726},
727{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
728 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
729},
730{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
731 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
732},
733{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
734 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
735},
736{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
737 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
738},
739{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
740 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
741},
742{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
743 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
744},
745{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
746 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
747},
748{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
749 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
750},
751{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
752 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
753},
754{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
755 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
756},
757{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
758 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
759},
760{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
761 EMPTY , NULL , DECL col_attrs , NULL, NULL
762},
763{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
764 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
765},
766{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
767 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
768},
769{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
770 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
771},
772{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
773 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
774},
775{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
776 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
777},
778{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
779 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
780},
781{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000782 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000783},
784{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
785 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
786},
787{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
788 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
789},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000790{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000791 EMPTY, NULL, DECL embed_attrs, NULL, NULL
792},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000793{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
794 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
795},
796{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
797 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
798},
799{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
800 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
801},
802{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
803 EMPTY, NULL, NULL, DECL frame_attrs, NULL
804},
805{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
806 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
807},
808{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
809 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
810},
811{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
812 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
813},
814{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
815 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
816},
817{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
818 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
819},
820{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
821 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
822},
823{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
824 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
825},
826{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
827 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
828},
829{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
830 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
831},
832{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
833 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
834},
835{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
836 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
837},
838{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
839 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
840},
841{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000842 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000843},
844{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
845 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
846},
847{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
848 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
849},
850{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
851 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
852},
853{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
854 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
855},
856{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
857 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
858},
859{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
860 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
861},
862{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
863 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
864},
865{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
866 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
867},
868{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000869 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000870},
871{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
872 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
873},
874{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
875 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
876},
877{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
878 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
879},
880{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
881 DECL html_flow, "div", DECL html_attrs, NULL, NULL
882},
883{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
884 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
885},
886{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
887 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
888},
889{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000890 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000891},
892{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
893 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
894},
895{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
896 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
897},
898{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000899 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000900},
901{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
902 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
903},
904{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
905 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
906},
907{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
908 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
909},
910{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
911 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
912},
913{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
914 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
915},
916{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
917 DECL select_content, NULL, DECL select_attrs, NULL, NULL
918},
919{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
920 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
921},
922{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
923 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
924},
925{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
926 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
927},
928{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
929 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
930},
931{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
932 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
933},
934{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
935 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
936},
937{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
938 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
939},
940{ "table", 0, 0, 0, 0, 0, 0, 0, "",
941 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
942},
943{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
944 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
945},
946{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
947 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
948},
949{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
950 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
951},
952{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
953 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
954},
955{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
956 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
957},
958{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
959 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
960},
961{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
962 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
963},
964{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
965 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
966},
967{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
968 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
969},
970{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
971 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
972},
973{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
974 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
975},
976{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
977 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
978}
Owen Taylor3473f882001-02-23 17:55:21 +0000979};
980
981/*
Owen Taylor3473f882001-02-23 17:55:21 +0000982 * start tags that imply the end of current element
983 */
Daniel Veillard065abe82006-07-03 08:55:04 +0000984static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000985"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
986 "dl", "ul", "ol", "menu", "dir", "address", "pre",
987 "listing", "xmp", "head", NULL,
988"head", "p", NULL,
989"title", "p", NULL,
990"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +0000991"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +0000992"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
993 "pre", "listing", "xmp", "head", "li", NULL,
994"hr", "p", "head", NULL,
995"h1", "p", "head", NULL,
996"h2", "p", "head", NULL,
997"h3", "p", "head", NULL,
998"h4", "p", "head", NULL,
999"h5", "p", "head", NULL,
1000"h6", "p", "head", NULL,
1001"dir", "p", "head", NULL,
1002"address", "p", "head", "ul", NULL,
1003"pre", "p", "head", "ul", NULL,
1004"listing", "p", "head", NULL,
1005"xmp", "p", "head", NULL,
1006"blockquote", "p", "head", NULL,
1007"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1008 "xmp", "head", NULL,
1009"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1010 "head", "dd", NULL,
1011"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1012 "head", "dt", NULL,
1013"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1014 "listing", "xmp", NULL,
1015"ol", "p", "head", "ul", NULL,
1016"menu", "p", "head", "ul", NULL,
Daniel Veillard6339c1a2009-08-24 11:59:51 +02001017"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001018"div", "p", "head", NULL,
1019"noscript", "p", "head", NULL,
1020"center", "font", "b", "i", "p", "head", NULL,
1021"a", "a", NULL,
1022"caption", "p", NULL,
1023"colgroup", "caption", "colgroup", "col", "p", NULL,
1024"col", "caption", "col", "p", NULL,
1025"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1026 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001027"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Daniel Veillarde77db162009-08-22 11:32:38 +02001028"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001029"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1030"thead", "caption", "col", "colgroup", NULL,
1031"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1032 "tbody", "p", NULL,
1033"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1034 "tfoot", "tbody", "p", NULL,
1035"optgroup", "option", NULL,
1036"option", "option", NULL,
1037"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1038 "pre", "listing", "xmp", "a", NULL,
1039NULL
1040};
1041
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
1054
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 * The array has no NULL terminator, so its length must be derived
 * from sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest" */
    "onchange",
    "onselect"
};
1080
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags.
 * End tags are only allowed to close elements with lower or equal
 * priority.
 */

/* One row of the priority table: an element name and its priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Rows are searched in order by htmlGetEndPriority(); the final row has
 * a NULL name and carries the priority used for every element that is
 * not listed.
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001108
/*
 * Lookup index over htmlStartClose, filled by htmlInitAutoClose():
 * each used slot points at the first name of one group and the first
 * unused slot is NULL. The flag records that the index has been built.
 */
static const char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1111
/************************************************************************
 *									*
 *		functions to handle HTML specific data			*
 *									*
 ************************************************************************/
1117
1118/**
1119 * htmlInitAutoClose:
1120 *
1121 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1122 * This is not reentrant. Call xmlInitParser() once before processing in
1123 * case of use in multithreaded programs.
1124 */
1125void
1126htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001127 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001128
1129 if (htmlStartCloseIndexinitialized) return;
1130
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001131 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1132 indx = 0;
1133 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001134 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001135 while (htmlStartClose[i] != NULL) i++;
1136 i++;
1137 }
1138 htmlStartCloseIndexinitialized = 1;
1139}
1140
1141/**
1142 * htmlTagLookup:
1143 * @tag: The tag name in lowercase
1144 *
1145 * Lookup the HTML tag in the ElementTable
1146 *
1147 * Returns the related htmlElemDescPtr or NULL if not found.
1148 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001149const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001150htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001151 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001152
1153 for (i = 0; i < (sizeof(html40ElementTable) /
1154 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001155 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001156 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001157 }
1158 return(NULL);
1159}
1160
1161/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001162 * htmlGetEndPriority:
1163 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001164 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001165 * Return value: The "endtag" priority.
1166 **/
1167static int
1168htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001169 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001170
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001171 while ((htmlEndPriority[i].name != NULL) &&
1172 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1173 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001174
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001175 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001176}
1177
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001178
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001179/**
Owen Taylor3473f882001-02-23 17:55:21 +00001180 * htmlCheckAutoClose:
1181 * @newtag: The new tag name
1182 * @oldtag: The old tag name
1183 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001184 * Checks whether the new tag is one of the registered valid tags for
1185 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001186 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1187 *
1188 * Returns 0 if no, 1 if yes.
1189 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001190static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001191htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1192{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001193 int i, indx;
1194 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001195
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001196 if (htmlStartCloseIndexinitialized == 0)
1197 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001198
1199 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001200 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001201 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001202 if (closed == NULL)
1203 return (0);
1204 if (xmlStrEqual(BAD_CAST * closed, newtag))
1205 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001206 }
1207
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001208 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001209 i++;
1210 while (htmlStartClose[i] != NULL) {
1211 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001212 return (1);
1213 }
1214 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001215 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001216 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001217}
1218
1219/**
1220 * htmlAutoCloseOnClose:
1221 * @ctxt: an HTML parser context
1222 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001223 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001224 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001225 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001226 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001227static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001228htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1229{
1230 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001231 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001232
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001233 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001234
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001235 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001236
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001237 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1238 break;
1239 /*
1240 * A missplaced endtag can only close elements with lower
1241 * or equal priority, so if we find an element with higher
1242 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001243 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001244 */
1245 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1246 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001247 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001248 if (i < 0)
1249 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001250
1251 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001252 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001253 if ((info != NULL) && (info->endTag == 3)) {
1254 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1255 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001256 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001257 }
1258 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1259 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001260 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001261 }
1262}
1263
1264/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001265 * htmlAutoCloseOnEnd:
1266 * @ctxt: an HTML parser context
1267 *
1268 * Close all remaining tags at the end of the stream
1269 */
1270static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001271htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1272{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001273 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001274
William M. Brack899e64a2003-09-26 18:03:42 +00001275 if (ctxt->nameNr == 0)
1276 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001277 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001278 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1279 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001280 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001281 }
1282}
1283
1284/**
Owen Taylor3473f882001-02-23 17:55:21 +00001285 * htmlAutoClose:
1286 * @ctxt: an HTML parser context
1287 * @newtag: The new tag name or NULL
1288 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001289 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001290 * The list is kept in htmlStartClose array. This function is
1291 * called when a new tag has been detected and generates the
1292 * appropriates closes if possible/needed.
1293 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001294 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001295 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001296static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001297htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1298{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001299 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001300 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001301 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1302 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001303 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001304 }
1305 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001306 htmlAutoCloseOnEnd(ctxt);
1307 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001308 }
1309 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001310 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1311 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1312 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001313 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1314 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001315 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001316 }
Owen Taylor3473f882001-02-23 17:55:21 +00001317}
1318
1319/**
1320 * htmlAutoCloseTag:
1321 * @doc: the HTML document
1322 * @name: The tag name
1323 * @elem: the HTML element
1324 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001325 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001326 * The list is kept in htmlStartClose array. This function checks
1327 * if the element or one of it's children would autoclose the
1328 * given tag.
1329 *
1330 * Returns 1 if autoclose, 0 otherwise
1331 */
1332int
1333htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1334 htmlNodePtr child;
1335
1336 if (elem == NULL) return(1);
1337 if (xmlStrEqual(name, elem->name)) return(0);
1338 if (htmlCheckAutoClose(elem->name, name)) return(1);
1339 child = elem->children;
1340 while (child != NULL) {
1341 if (htmlAutoCloseTag(doc, name, child)) return(1);
1342 child = child->next;
1343 }
1344 return(0);
1345}
1346
1347/**
1348 * htmlIsAutoClosed:
1349 * @doc: the HTML document
1350 * @elem: the HTML element
1351 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001352 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001353 * The list is kept in htmlStartClose array. This function checks
1354 * if a tag is autoclosed by one of it's child
1355 *
1356 * Returns 1 if autoclosed, 0 otherwise
1357 */
1358int
1359htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1360 htmlNodePtr child;
1361
1362 if (elem == NULL) return(1);
1363 child = elem->children;
1364 while (child != NULL) {
1365 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1366 child = child->next;
1367 }
1368 return(0);
1369}
1370
1371/**
1372 * htmlCheckImplied:
1373 * @ctxt: an HTML parser context
1374 * @newtag: The new tag name
1375 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001376 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001377 * called when a new tag has been detected and generates the
1378 * appropriates implicit tags if missing
1379 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001380static void
Owen Taylor3473f882001-02-23 17:55:21 +00001381htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1382 if (!htmlOmittedDefaultValue)
1383 return;
1384 if (xmlStrEqual(newtag, BAD_CAST"html"))
1385 return;
1386 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001387 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001388 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1389 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1390 }
1391 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1392 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001393 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001394 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1395 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1396 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1397 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1398 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1399 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001400 if (ctxt->html >= 3) {
1401 /* we already saw or generated an <head> before */
1402 return;
1403 }
1404 /*
1405 * dropped OBJECT ... i you put it first BODY will be
1406 * assumed !
1407 */
1408 htmlnamePush(ctxt, BAD_CAST"head");
1409 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1410 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001411 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1412 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1413 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001414 if (ctxt->html >= 10) {
1415 /* we already saw or generated a <body> before */
1416 return;
1417 }
Owen Taylor3473f882001-02-23 17:55:21 +00001418 int i;
1419 for (i = 0;i < ctxt->nameNr;i++) {
1420 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1421 return;
1422 }
1423 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1424 return;
1425 }
1426 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001427
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001428 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001429 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1430 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1431 }
1432}
1433
1434/**
1435 * htmlCheckParagraph
1436 * @ctxt: an HTML parser context
1437 *
1438 * Check whether a p element need to be implied before inserting
1439 * characters in the current element.
1440 *
1441 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1442 * in case of error.
1443 */
1444
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001445static int
Owen Taylor3473f882001-02-23 17:55:21 +00001446htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1447 const xmlChar *tag;
1448 int i;
1449
1450 if (ctxt == NULL)
1451 return(-1);
1452 tag = ctxt->name;
1453 if (tag == NULL) {
1454 htmlAutoClose(ctxt, BAD_CAST"p");
1455 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001456 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001457 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1458 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1459 return(1);
1460 }
1461 if (!htmlOmittedDefaultValue)
1462 return(0);
1463 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1464 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001465 htmlAutoClose(ctxt, BAD_CAST"p");
1466 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001467 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001468 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1469 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1470 return(1);
1471 }
1472 }
1473 return(0);
1474}
1475
1476/**
1477 * htmlIsScriptAttribute:
1478 * @name: an attribute name
1479 *
1480 * Check if an attribute is of content type Script
1481 *
1482 * Returns 1 is the attribute is a script 0 otherwise
1483 */
1484int
1485htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001486 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001487
1488 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001489 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001490 /*
1491 * all script attributes start with 'on'
1492 */
1493 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001494 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001495 for (i = 0;
1496 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1497 i++) {
1498 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1499 return(1);
1500 }
1501 return(0);
1502}
1503
1504/************************************************************************
1505 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001506 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001507 * *
1508 ************************************************************************/
1509
1510
/*
 * Table of the predefined HTML entities. Each initializer gives, in
 * order: the character value, the entity name as written between '&'
 * and ';', and a description string.
 *
 * The entries are listed by strictly ascending value and must stay that
 * way: htmlEntityValueLookup() relies on this ordering.
 */
Daniel Veillard22090732001-07-16 00:06:07 +00001511static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001512/*
1513 * the 4 absolute ones, plus apostrophe.
1514 */
1515{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1516{ 38, "amp", "ampersand, U+0026 ISOnum" },
1517{ 39, "apos", "single quote" },
1518{ 60, "lt", "less-than sign, U+003C ISOnum" },
1519{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1520
1521/*
1522 * A bunch still in the 128-255 range
1523 * Replacing them depend really on the charset used.
1524 */
1525{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1526{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1527{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1528{ 163, "pound","pound sign, U+00A3 ISOnum" },
1529{ 164, "curren","currency sign, U+00A4 ISOnum" },
1530{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1531{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1532{ 167, "sect", "section sign, U+00A7 ISOnum" },
1533{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1534{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1535{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1536{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1537{ 172, "not", "not sign, U+00AC ISOnum" },
1538{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1539{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1540{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1541{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1542{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1543{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1544{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1545{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1546{ 181, "micro","micro sign, U+00B5 ISOnum" },
1547{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1548{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1549{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1550{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1551{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1552{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1553{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1554{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1555{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1556{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1557{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1558{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1559{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1560{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1561{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1562{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1563{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1564{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1565{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1566{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1567{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1568{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1569{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1570{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1571{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1572{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1573{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1574{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1575{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1576{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1577{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1578{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1579{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1580{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1581{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1582{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1583{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1584{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1585{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1586{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1587{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1588{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1589{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1590{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1591{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1592{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1593{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1594{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1595{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1596{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1597{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1598{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1599{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1600{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1601{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1602{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1603{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1604{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1605{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1606{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1607{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1608{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1609{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1610{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1611{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1612{ 247, "divide","division sign, U+00F7 ISOnum" },
1613{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1614{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1615{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1616{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1617{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1618{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1619{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1620{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1621
1622{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1623{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1624{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1625{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1626{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1627
1628/*
1629 * Anything below should really be kept as entities references
1630 */
1631{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1632
1633{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1634{ 732, "tilde","small tilde, U+02DC ISOdia" },
1635
1636{ 913, "Alpha","greek capital letter alpha, U+0391" },
1637{ 914, "Beta", "greek capital letter beta, U+0392" },
1638{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1639{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1640{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1641{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1642{ 919, "Eta", "greek capital letter eta, U+0397" },
1643{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1644{ 921, "Iota", "greek capital letter iota, U+0399" },
1645{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001646{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001647{ 924, "Mu", "greek capital letter mu, U+039C" },
1648{ 925, "Nu", "greek capital letter nu, U+039D" },
1649{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1650{ 927, "Omicron","greek capital letter omicron, U+039F" },
1651{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1652{ 929, "Rho", "greek capital letter rho, U+03A1" },
1653{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1654{ 932, "Tau", "greek capital letter tau, U+03A4" },
1655{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1656{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1657{ 935, "Chi", "greek capital letter chi, U+03A7" },
1658{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1659{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1660
1661{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1662{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1663{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1664{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1665{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1666{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1667{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1668{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1669{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1670{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1671{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1672{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1673{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1674{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1675{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1676{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1677{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1678{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1679{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1680{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1681{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1682{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1683{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1684{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1685{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1686{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1687{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1688{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1689
1690{ 8194, "ensp", "en space, U+2002 ISOpub" },
1691{ 8195, "emsp", "em space, U+2003 ISOpub" },
1692{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1693{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1694{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1695{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1696{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1697{ 8211, "ndash","en dash, U+2013 ISOpub" },
1698{ 8212, "mdash","em dash, U+2014 ISOpub" },
1699{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1700{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1701{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1702{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1703{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1704{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1705{ 8224, "dagger","dagger, U+2020 ISOpub" },
1706{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1707
1708{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1709{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1710
1711{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1712
1713{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1714{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1715
1716{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1717{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1718
1719{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1720{ 8260, "frasl","fraction slash, U+2044 NEW" },
1721
1722{ 8364, "euro", "euro sign, U+20AC NEW" },
1723
1724{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1725{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1726{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1727{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1728{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1729{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1730{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1731{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1732{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1733{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1734{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1735{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1736{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1737{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1738{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1739{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1740
1741{ 8704, "forall","for all, U+2200 ISOtech" },
1742{ 8706, "part", "partial differential, U+2202 ISOtech" },
1743{ 8707, "exist","there exists, U+2203 ISOtech" },
1744{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1745{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1746{ 8712, "isin", "element of, U+2208 ISOtech" },
1747{ 8713, "notin","not an element of, U+2209 ISOtech" },
1748{ 8715, "ni", "contains as member, U+220B ISOtech" },
1749{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001750{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001751{ 8722, "minus","minus sign, U+2212 ISOtech" },
1752{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1753{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1754{ 8733, "prop", "proportional to, U+221D ISOtech" },
1755{ 8734, "infin","infinity, U+221E ISOtech" },
1756{ 8736, "ang", "angle, U+2220 ISOamso" },
1757{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1758{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1759{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1760{ 8746, "cup", "union = cup, U+222A ISOtech" },
1761{ 8747, "int", "integral, U+222B ISOtech" },
1762{ 8756, "there4","therefore, U+2234 ISOtech" },
1763{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1764{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1765{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1766{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1767{ 8801, "equiv","identical to, U+2261 ISOtech" },
1768{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1769{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1770{ 8834, "sub", "subset of, U+2282 ISOtech" },
1771{ 8835, "sup", "superset of, U+2283 ISOtech" },
1772{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1773{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1774{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1775{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1776{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1777{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1778{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1779{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1780{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1781{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1782{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1783{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1784{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1785{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1786
1787{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1788{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1789{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1790{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1791
1792};
1793
1794/************************************************************************
1795 * *
1796 * Commodity functions to handle entities *
1797 * *
1798 ************************************************************************/
1799
1800/*
 * growBuffer(buffer): macro used to grow the current buffer.
 *
 * Doubles the companion variable <buffer>_size and reallocates @buffer
 * to that many xmlChar with xmlRealloc(). If the reallocation fails the
 * error is reported through htmlErrMemory(), the old buffer is freed and
 * the macro executes return(NULL) in the function that expanded it.
 *
 * The expansion site must therefore have a variable named `ctxt` and a
 * variable named <buffer>_size in scope, and must be inside a function
 * returning a pointer. The macro expands to a plain brace block rather
 * than a do { } while (0) statement.
 */
1803#define growBuffer(buffer) { \
Daniel Veillard079f6a72004-09-23 13:15:03 +00001804 xmlChar *tmp; \
Owen Taylor3473f882001-02-23 17:55:21 +00001805 buffer##_size *= 2; \
Daniel Veillard079f6a72004-09-23 13:15:03 +00001806 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
1807 if (tmp == NULL) { \
Daniel Veillardf403d292003-10-05 13:51:35 +00001808 htmlErrMemory(ctxt, "growing buffer\n"); \
Daniel Veillard079f6a72004-09-23 13:15:03 +00001809 xmlFree(buffer); \
Owen Taylor3473f882001-02-23 17:55:21 +00001810 return(NULL); \
1811 } \
Daniel Veillard079f6a72004-09-23 13:15:03 +00001812 buffer = tmp; \
Owen Taylor3473f882001-02-23 17:55:21 +00001813}
1814
1815/**
1816 * htmlEntityLookup:
1817 * @name: the entity name
1818 *
1819 * Lookup the given entity in EntitiesTable
1820 *
1821 * TODO: the linear scan is really ugly, an hash table is really needed.
1822 *
1823 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1824 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001825const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001826htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001827 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001828
1829 for (i = 0;i < (sizeof(html40EntitiesTable)/
1830 sizeof(html40EntitiesTable[0]));i++) {
1831 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001832 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001833 }
1834 }
1835 return(NULL);
1836}
1837
1838/**
1839 * htmlEntityValueLookup:
1840 * @value: the entity's unicode value
1841 *
1842 * Lookup the given entity in EntitiesTable
1843 *
1844 * TODO: the linear scan is really ugly, an hash table is really needed.
1845 *
1846 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1847 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001848const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001849htmlEntityValueLookup(unsigned int value) {
1850 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001851
1852 for (i = 0;i < (sizeof(html40EntitiesTable)/
1853 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001854 if (html40EntitiesTable[i].value >= value) {
1855 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001856 break;
William M. Brack78637da2003-07-31 14:47:38 +00001857 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001858 }
Owen Taylor3473f882001-02-23 17:55:21 +00001859 }
1860 return(NULL);
1861}
1862
1863/**
1864 * UTF8ToHtml:
1865 * @out: a pointer to an array of bytes to store the result
1866 * @outlen: the length of @out
1867 * @in: a pointer to an array of UTF-8 chars
1868 * @inlen: the length of @in
1869 *
1870 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1871 * plus HTML entities block of chars out.
1872 *
1873 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1874 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001875 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001876 * The value of @outlen after return is the number of octets consumed.
1877 */
1878int
1879UTF8ToHtml(unsigned char* out, int *outlen,
1880 const unsigned char* in, int *inlen) {
1881 const unsigned char* processed = in;
1882 const unsigned char* outend;
1883 const unsigned char* outstart = out;
1884 const unsigned char* instart = in;
1885 const unsigned char* inend;
1886 unsigned int c, d;
1887 int trailing;
1888
Daniel Veillardce682bc2004-11-05 17:22:25 +00001889 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001890 if (in == NULL) {
1891 /*
1892 * initialization nothing to do
1893 */
1894 *outlen = 0;
1895 *inlen = 0;
1896 return(0);
1897 }
1898 inend = in + (*inlen);
1899 outend = out + (*outlen);
1900 while (in < inend) {
1901 d = *in++;
1902 if (d < 0x80) { c= d; trailing= 0; }
1903 else if (d < 0xC0) {
1904 /* trailing byte in leading position */
1905 *outlen = out - outstart;
1906 *inlen = processed - instart;
1907 return(-2);
1908 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1909 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1910 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1911 else {
1912 /* no chance for this in Ascii */
1913 *outlen = out - outstart;
1914 *inlen = processed - instart;
1915 return(-2);
1916 }
1917
1918 if (inend - in < trailing) {
1919 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02001920 }
Owen Taylor3473f882001-02-23 17:55:21 +00001921
1922 for ( ; trailing; trailing--) {
1923 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1924 break;
1925 c <<= 6;
1926 c |= d & 0x3F;
1927 }
1928
1929 /* assertion: c is a single UTF-4 value */
1930 if (c < 0x80) {
1931 if (out + 1 >= outend)
1932 break;
1933 *out++ = c;
1934 } else {
1935 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001936 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001937 const char *cp;
1938 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001939
1940 /*
1941 * Try to lookup a predefined HTML entity for it
1942 */
1943
1944 ent = htmlEntityValueLookup(c);
1945 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001946 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1947 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001948 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001949 else
1950 cp = ent->name;
1951 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001952 if (out + 2 + len >= outend)
1953 break;
1954 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001955 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001956 out += len;
1957 *out++ = ';';
1958 }
1959 processed = in;
1960 }
1961 *outlen = out - outstart;
1962 *inlen = processed - instart;
1963 return(0);
1964}
1965
1966/**
1967 * htmlEncodeEntities:
1968 * @out: a pointer to an array of bytes to store the result
1969 * @outlen: the length of @out
1970 * @in: a pointer to an array of UTF-8 chars
1971 * @inlen: the length of @in
1972 * @quoteChar: the quote character to escape (' or ") or zero.
1973 *
1974 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1975 * plus HTML entities block of chars out.
1976 *
1977 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1978 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001979 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001980 * The value of @outlen after return is the number of octets consumed.
1981 */
1982int
1983htmlEncodeEntities(unsigned char* out, int *outlen,
1984 const unsigned char* in, int *inlen, int quoteChar) {
1985 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001986 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001987 const unsigned char* outstart = out;
1988 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001989 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001990 unsigned int c, d;
1991 int trailing;
1992
Daniel Veillardce682bc2004-11-05 17:22:25 +00001993 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1994 return(-1);
1995 outend = out + (*outlen);
1996 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001997 while (in < inend) {
1998 d = *in++;
1999 if (d < 0x80) { c= d; trailing= 0; }
2000 else if (d < 0xC0) {
2001 /* trailing byte in leading position */
2002 *outlen = out - outstart;
2003 *inlen = processed - instart;
2004 return(-2);
2005 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2006 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2007 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2008 else {
2009 /* no chance for this in Ascii */
2010 *outlen = out - outstart;
2011 *inlen = processed - instart;
2012 return(-2);
2013 }
2014
2015 if (inend - in < trailing)
2016 break;
2017
2018 while (trailing--) {
2019 if (((d= *in++) & 0xC0) != 0x80) {
2020 *outlen = out - outstart;
2021 *inlen = processed - instart;
2022 return(-2);
2023 }
2024 c <<= 6;
2025 c |= d & 0x3F;
2026 }
2027
2028 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002029 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2030 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002031 if (out >= outend)
2032 break;
2033 *out++ = c;
2034 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002035 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002036 const char *cp;
2037 char nbuf[16];
2038 int len;
2039
2040 /*
2041 * Try to lookup a predefined HTML entity for it
2042 */
2043 ent = htmlEntityValueLookup(c);
2044 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002045 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002046 cp = nbuf;
2047 }
2048 else
2049 cp = ent->name;
2050 len = strlen(cp);
2051 if (out + 2 + len > outend)
2052 break;
2053 *out++ = '&';
2054 memcpy(out, cp, len);
2055 out += len;
2056 *out++ = ';';
2057 }
2058 processed = in;
2059 }
2060 *outlen = out - outstart;
2061 *inlen = processed - instart;
2062 return(0);
2063}
2064
Owen Taylor3473f882001-02-23 17:55:21 +00002065/************************************************************************
2066 * *
2067 * Commodity functions to handle streams *
2068 * *
2069 ************************************************************************/
2070
2071/**
Owen Taylor3473f882001-02-23 17:55:21 +00002072 * htmlNewInputStream:
2073 * @ctxt: an HTML parser context
2074 *
2075 * Create a new input stream structure
2076 * Returns the new input stream or NULL
2077 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002078static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002079htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2080 htmlParserInputPtr input;
2081
2082 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2083 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002084 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002085 return(NULL);
2086 }
2087 memset(input, 0, sizeof(htmlParserInput));
2088 input->filename = NULL;
2089 input->directory = NULL;
2090 input->base = NULL;
2091 input->cur = NULL;
2092 input->buf = NULL;
2093 input->line = 1;
2094 input->col = 1;
2095 input->buf = NULL;
2096 input->free = NULL;
2097 input->version = NULL;
2098 input->consumed = 0;
2099 input->length = 0;
2100 return(input);
2101}
2102
2103
2104/************************************************************************
2105 * *
2106 * Commodity functions, cleanup needed ? *
2107 * *
2108 ************************************************************************/
/*
 * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * The table is only ever read, so both the strings and the array of
 * pointers are declared const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002123
2124/**
2125 * areBlanks:
2126 * @ctxt: an HTML parser context
2127 * @str: a xmlChar *
2128 * @len: the size of @str
2129 *
2130 * Is this a sequence of blank chars that one can ignore ?
2131 *
2132 * Returns 1 if ignorable 0 otherwise.
2133 */
2134
2135static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002136 unsigned int i;
2137 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002138 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002139 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002140
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002141 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002142 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002143
2144 if (CUR == 0) return(1);
2145 if (CUR != '<') return(0);
2146 if (ctxt->name == NULL)
2147 return(1);
2148 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2149 return(1);
2150 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2151 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002152
2153 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2154 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2155 dtd = xmlGetIntSubset(ctxt->myDoc);
2156 if (dtd != NULL && dtd->ExternalID != NULL) {
2157 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2158 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2159 return(1);
2160 }
2161 }
2162
Owen Taylor3473f882001-02-23 17:55:21 +00002163 if (ctxt->node == NULL) return(0);
2164 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002165 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2166 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002167 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002168 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2169 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002170 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002171 for all tags "b" allowing PCDATA */
2172 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2173 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2174 return(0);
2175 }
2176 }
Owen Taylor3473f882001-02-23 17:55:21 +00002177 } else if (xmlNodeIsText(lastChild)) {
2178 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002179 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002180 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002181 for all tags "p" allowing PCDATA */
2182 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2183 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2184 return(0);
2185 }
2186 }
Owen Taylor3473f882001-02-23 17:55:21 +00002187 }
2188 return(1);
2189}
2190
2191/**
Owen Taylor3473f882001-02-23 17:55:21 +00002192 * htmlNewDocNoDtD:
2193 * @URI: URI for the dtd, or NULL
2194 * @ExternalID: the external ID of the DTD, or NULL
2195 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002196 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2197 * are NULL
2198 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002199 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002200 */
2201htmlDocPtr
2202htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2203 xmlDocPtr cur;
2204
2205 /*
2206 * Allocate a new document and fill the fields.
2207 */
2208 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2209 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002210 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002211 return(NULL);
2212 }
2213 memset(cur, 0, sizeof(xmlDoc));
2214
2215 cur->type = XML_HTML_DOCUMENT_NODE;
2216 cur->version = NULL;
2217 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002218 cur->doc = cur;
2219 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002220 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002221 cur->extSubset = NULL;
2222 cur->oldNs = NULL;
2223 cur->encoding = NULL;
2224 cur->standalone = 1;
2225 cur->compression = 0;
2226 cur->ids = NULL;
2227 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002228 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002229 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002230 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002231 if ((ExternalID != NULL) ||
2232 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002233 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002234 return(cur);
2235}
2236
2237/**
2238 * htmlNewDoc:
2239 * @URI: URI for the dtd, or NULL
2240 * @ExternalID: the external ID of the DTD, or NULL
2241 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002242 * Creates a new HTML document
2243 *
Owen Taylor3473f882001-02-23 17:55:21 +00002244 * Returns a new document
2245 */
2246htmlDocPtr
2247htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2248 if ((URI == NULL) && (ExternalID == NULL))
2249 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002250 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2251 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002252
2253 return(htmlNewDocNoDtD(URI, ExternalID));
2254}
2255
2256
2257/************************************************************************
2258 * *
2259 * The parser itself *
2260 * Relates to http://www.w3.org/TR/html40 *
2261 * *
2262 ************************************************************************/
2263
2264/************************************************************************
2265 * *
2266 * The parser itself *
2267 * *
2268 ************************************************************************/
2269
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002270static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002271
Owen Taylor3473f882001-02-23 17:55:21 +00002272/**
2273 * htmlParseHTMLName:
2274 * @ctxt: an HTML parser context
2275 *
2276 * parse an HTML tag or attribute name, note that we convert it to lowercase
2277 * since HTML names are not case-sensitive.
2278 *
2279 * Returns the Tag Name parsed or NULL
2280 */
2281
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002282static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002283htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002284 int i = 0;
2285 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2286
William M. Brackd1757ab2004-10-02 22:07:48 +00002287 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002288 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002289
2290 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002291 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002292 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2293 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002294 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2295 else loc[i] = CUR;
2296 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002297
Owen Taylor3473f882001-02-23 17:55:21 +00002298 NEXT;
2299 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002300
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002301 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002302}
2303
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002304
2305/**
2306 * htmlParseHTMLName_nonInvasive:
2307 * @ctxt: an HTML parser context
2308 *
2309 * parse an HTML tag or attribute name, note that we convert it to lowercase
2310 * since HTML names are not case-sensitive, this doesn't consume the data
2311 * from the stream, it's a look-ahead
2312 *
2313 * Returns the Tag Name parsed or NULL
2314 */
2315
2316static const xmlChar *
2317htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2318 int i = 0;
2319 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2320
2321 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2322 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002323
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002324 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2325 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2326 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2327 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2328 else loc[i] = NXT(1+i);
2329 i++;
2330 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002331
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002332 return(xmlDictLookup(ctxt->dict, loc, i));
2333}
2334
2335
Owen Taylor3473f882001-02-23 17:55:21 +00002336/**
2337 * htmlParseName:
2338 * @ctxt: an HTML parser context
2339 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002340 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002341 *
2342 * Returns the Name parsed or NULL
2343 */
2344
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002345static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002346htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002347 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002348 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002349 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002350
2351 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002352
2353 /*
2354 * Accelerator for simple ASCII names
2355 */
2356 in = ctxt->input->cur;
2357 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2358 ((*in >= 0x41) && (*in <= 0x5A)) ||
2359 (*in == '_') || (*in == ':')) {
2360 in++;
2361 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2362 ((*in >= 0x41) && (*in <= 0x5A)) ||
2363 ((*in >= 0x30) && (*in <= 0x39)) ||
2364 (*in == '_') || (*in == '-') ||
2365 (*in == ':') || (*in == '.'))
2366 in++;
2367 if ((*in > 0) && (*in < 0x80)) {
2368 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002369 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002370 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002371 ctxt->nbChars += count;
2372 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002373 return(ret);
2374 }
2375 }
2376 return(htmlParseNameComplex(ctxt));
2377}
2378
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002379static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002380htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002381 int len = 0, l;
2382 int c;
2383 int count = 0;
2384
2385 /*
2386 * Handler for more complex cases
2387 */
2388 GROW;
2389 c = CUR_CHAR(l);
2390 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2391 (!IS_LETTER(c) && (c != '_') &&
2392 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002393 return(NULL);
2394 }
2395
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002396 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2397 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2398 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002399 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002400 (IS_COMBINING(c)) ||
2401 (IS_EXTENDER(c)))) {
2402 if (count++ > 100) {
2403 count = 0;
2404 GROW;
2405 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002406 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002407 NEXTL(l);
2408 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002409 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002410 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002411}
2412
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002413
Owen Taylor3473f882001-02-23 17:55:21 +00002414/**
2415 * htmlParseHTMLAttribute:
2416 * @ctxt: an HTML parser context
2417 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002418 *
Owen Taylor3473f882001-02-23 17:55:21 +00002419 * parse an HTML attribute value till the stop (quote), if
2420 * stop is 0 then it stops at the first space
2421 *
2422 * Returns the attribute parsed or NULL
2423 */
2424
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002425static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002426htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2427 xmlChar *buffer = NULL;
2428 int buffer_size = 0;
2429 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002430 const xmlChar *name = NULL;
2431 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002432 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002433
2434 /*
2435 * allocate a translation buffer.
2436 */
2437 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002438 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002439 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002440 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002441 return(NULL);
2442 }
2443 out = buffer;
2444
2445 /*
2446 * Ok loop until we reach one of the ending chars
2447 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002448 while ((CUR != 0) && (CUR != stop)) {
2449 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002450 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002451 if (CUR == '&') {
2452 if (NXT(1) == '#') {
2453 unsigned int c;
2454 int bits;
2455
2456 c = htmlParseCharRef(ctxt);
2457 if (c < 0x80)
2458 { *out++ = c; bits= -6; }
2459 else if (c < 0x800)
2460 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2461 else if (c < 0x10000)
2462 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002463 else
Owen Taylor3473f882001-02-23 17:55:21 +00002464 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002465
Owen Taylor3473f882001-02-23 17:55:21 +00002466 for ( ; bits >= 0; bits-= 6) {
2467 *out++ = ((c >> bits) & 0x3F) | 0x80;
2468 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002469
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002470 if (out - buffer > buffer_size - 100) {
2471 int indx = out - buffer;
2472
2473 growBuffer(buffer);
2474 out = &buffer[indx];
2475 }
Owen Taylor3473f882001-02-23 17:55:21 +00002476 } else {
2477 ent = htmlParseEntityRef(ctxt, &name);
2478 if (name == NULL) {
2479 *out++ = '&';
2480 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002481 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002482
2483 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002484 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002485 }
2486 } else if (ent == NULL) {
2487 *out++ = '&';
2488 cur = name;
2489 while (*cur != 0) {
2490 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002491 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002492
2493 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002494 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002495 }
2496 *out++ = *cur++;
2497 }
Owen Taylor3473f882001-02-23 17:55:21 +00002498 } else {
2499 unsigned int c;
2500 int bits;
2501
2502 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002503 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002504
2505 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002506 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002507 }
Daniel Veillard48519092006-10-17 15:56:35 +00002508 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002509 if (c < 0x80)
2510 { *out++ = c; bits= -6; }
2511 else if (c < 0x800)
2512 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2513 else if (c < 0x10000)
2514 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002515 else
Owen Taylor3473f882001-02-23 17:55:21 +00002516 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002517
Owen Taylor3473f882001-02-23 17:55:21 +00002518 for ( ; bits >= 0; bits-= 6) {
2519 *out++ = ((c >> bits) & 0x3F) | 0x80;
2520 }
Owen Taylor3473f882001-02-23 17:55:21 +00002521 }
2522 }
2523 } else {
2524 unsigned int c;
2525 int bits, l;
2526
2527 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002528 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002529
2530 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002531 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002532 }
2533 c = CUR_CHAR(l);
2534 if (c < 0x80)
2535 { *out++ = c; bits= -6; }
2536 else if (c < 0x800)
2537 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2538 else if (c < 0x10000)
2539 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002540 else
Owen Taylor3473f882001-02-23 17:55:21 +00002541 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002542
Owen Taylor3473f882001-02-23 17:55:21 +00002543 for ( ; bits >= 0; bits-= 6) {
2544 *out++ = ((c >> bits) & 0x3F) | 0x80;
2545 }
2546 NEXT;
2547 }
2548 }
2549 *out++ = 0;
2550 return(buffer);
2551}
2552
2553/**
Owen Taylor3473f882001-02-23 17:55:21 +00002554 * htmlParseEntityRef:
2555 * @ctxt: an HTML parser context
2556 * @str: location to store the entity name
2557 *
2558 * parse an HTML ENTITY references
2559 *
2560 * [68] EntityRef ::= '&' Name ';'
2561 *
2562 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2563 * if non-NULL *str will have to be freed by the caller.
2564 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002565const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002566htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2567 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002568 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002569
2570 if (str != NULL) *str = NULL;
2571 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002572
2573 if (CUR == '&') {
2574 NEXT;
2575 name = htmlParseName(ctxt);
2576 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002577 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2578 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002579 } else {
2580 GROW;
2581 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002582 if (str != NULL)
2583 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002584
2585 /*
2586 * Lookup the entity in the table.
2587 */
2588 ent = htmlEntityLookup(name);
2589 if (ent != NULL) /* OK that's ugly !!! */
2590 NEXT;
2591 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002592 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2593 "htmlParseEntityRef: expecting ';'\n",
2594 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002595 if (str != NULL)
2596 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002597 }
2598 }
2599 }
2600 return(ent);
2601}
2602
2603/**
2604 * htmlParseAttValue:
2605 * @ctxt: an HTML parser context
2606 *
2607 * parse a value for an attribute
2608 * Note: the parser won't do substitution of entities here, this
2609 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002610 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002611 *
2612 * Returns the AttValue parsed or NULL.
2613 */
2614
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002615static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002616htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2617 xmlChar *ret = NULL;
2618
2619 if (CUR == '"') {
2620 NEXT;
2621 ret = htmlParseHTMLAttribute(ctxt, '"');
2622 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002623 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2624 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002625 } else
2626 NEXT;
2627 } else if (CUR == '\'') {
2628 NEXT;
2629 ret = htmlParseHTMLAttribute(ctxt, '\'');
2630 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002631 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2632 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002633 } else
2634 NEXT;
2635 } else {
2636 /*
2637 * That's an HTMLism, the attribute value may not be quoted
2638 */
2639 ret = htmlParseHTMLAttribute(ctxt, 0);
2640 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002641 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2642 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002643 }
2644 }
2645 return(ret);
2646}
2647
2648/**
2649 * htmlParseSystemLiteral:
2650 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002651 *
Owen Taylor3473f882001-02-23 17:55:21 +00002652 * parse an HTML Literal
2653 *
2654 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2655 *
2656 * Returns the SystemLiteral parsed or NULL
2657 */
2658
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002659static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002660htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2661 const xmlChar *q;
2662 xmlChar *ret = NULL;
2663
2664 if (CUR == '"') {
2665 NEXT;
2666 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002667 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002668 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002669 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002670 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2671 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002672 } else {
2673 ret = xmlStrndup(q, CUR_PTR - q);
2674 NEXT;
2675 }
2676 } else if (CUR == '\'') {
2677 NEXT;
2678 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002679 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002680 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002681 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002682 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2683 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002684 } else {
2685 ret = xmlStrndup(q, CUR_PTR - q);
2686 NEXT;
2687 }
2688 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002689 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2690 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002691 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002692
Owen Taylor3473f882001-02-23 17:55:21 +00002693 return(ret);
2694}
2695
2696/**
2697 * htmlParsePubidLiteral:
2698 * @ctxt: an HTML parser context
2699 *
2700 * parse an HTML public literal
2701 *
2702 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2703 *
2704 * Returns the PubidLiteral parsed or NULL.
2705 */
2706
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002707static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002708htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2709 const xmlChar *q;
2710 xmlChar *ret = NULL;
2711 /*
2712 * Name ::= (Letter | '_') (NameChar)*
2713 */
2714 if (CUR == '"') {
2715 NEXT;
2716 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002717 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002718 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002719 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2720 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002721 } else {
2722 ret = xmlStrndup(q, CUR_PTR - q);
2723 NEXT;
2724 }
2725 } else if (CUR == '\'') {
2726 NEXT;
2727 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002728 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002729 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002730 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002731 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2732 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002733 } else {
2734 ret = xmlStrndup(q, CUR_PTR - q);
2735 NEXT;
2736 }
2737 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002738 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2739 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002740 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002741
Owen Taylor3473f882001-02-23 17:55:21 +00002742 return(ret);
2743}
2744
2745/**
2746 * htmlParseScript:
2747 * @ctxt: an HTML parser context
2748 *
2749 * parse the content of an HTML SCRIPT or STYLE element
2750 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2751 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2752 * http://www.w3.org/TR/html4/types.html#type-script
2753 * http://www.w3.org/TR/html4/types.html#h-6.15
2754 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2755 *
2756 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2757 * element and the value of intrinsic event attributes. User agents must
2758 * not evaluate script data as HTML markup but instead must pass it on as
2759 * data to a script engine.
2760 * NOTES:
2761 * - The content is passed like CDATA
2762 * - the attributes for style and scripting "onXXX" are also described
2763 * as CDATA but SGML allows entities references in attributes so their
2764 * processing is identical as other attributes
2765 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002766static void
Owen Taylor3473f882001-02-23 17:55:21 +00002767htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002768 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002769 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002770 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002771
2772 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002773 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002774 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002775 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002776 /*
2777 * One should break here, the specification is clear:
2778 * Authors should therefore escape "</" within the content.
2779 * Escape mechanisms are specific to each scripting or
2780 * style sheet language.
2781 *
2782 * In recovery mode, only break if end tag match the
2783 * current tag, effectively ignoring all tags inside the
2784 * script/style block and treating the entire block as
2785 * CDATA.
2786 */
2787 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002788 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2789 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002790 {
2791 break; /* while */
2792 } else {
2793 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002794 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002795 ctxt->name, NULL);
2796 }
2797 } else {
2798 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002799 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002800 {
2801 break; /* while */
2802 }
2803 }
Owen Taylor3473f882001-02-23 17:55:21 +00002804 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002805 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002806 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2807 if (ctxt->sax->cdataBlock!= NULL) {
2808 /*
2809 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2810 */
2811 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002812 } else if (ctxt->sax->characters != NULL) {
2813 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002814 }
2815 nbchar = 0;
2816 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002817 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002818 NEXTL(l);
2819 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002820 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002821
Daniel Veillard68716a72006-10-16 09:32:17 +00002822 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002823 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2824 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002825 NEXT;
2826 }
2827
2828 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2829 if (ctxt->sax->cdataBlock!= NULL) {
2830 /*
2831 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2832 */
2833 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002834 } else if (ctxt->sax->characters != NULL) {
2835 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002836 }
2837 }
2838}
2839
2840
2841/**
2842 * htmlParseCharData:
2843 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002844 *
2845 * parse a CharData section.
2846 * if we are within a CDATA section ']]>' marks an end of section.
2847 *
2848 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2849 */
2850
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002851static void
2852htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002853 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2854 int nbchar = 0;
2855 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002856 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002857
2858 SHRINK;
2859 cur = CUR_CHAR(l);
2860 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002861 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002862 (cur != 0)) {
2863 if (!(IS_CHAR(cur))) {
2864 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2865 "Invalid char in CDATA 0x%X\n", cur);
2866 } else {
2867 COPY_BUF(l,buf,nbchar,cur);
2868 }
Owen Taylor3473f882001-02-23 17:55:21 +00002869 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2870 /*
2871 * Ok the segment is to be consumed as chars.
2872 */
2873 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2874 if (areBlanks(ctxt, buf, nbchar)) {
2875 if (ctxt->sax->ignorableWhitespace != NULL)
2876 ctxt->sax->ignorableWhitespace(ctxt->userData,
2877 buf, nbchar);
2878 } else {
2879 htmlCheckParagraph(ctxt);
2880 if (ctxt->sax->characters != NULL)
2881 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2882 }
2883 }
2884 nbchar = 0;
2885 }
2886 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002887 chunk++;
2888 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2889 chunk = 0;
2890 SHRINK;
2891 GROW;
2892 }
Owen Taylor3473f882001-02-23 17:55:21 +00002893 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002894 if (cur == 0) {
2895 SHRINK;
2896 GROW;
2897 cur = CUR_CHAR(l);
2898 }
Owen Taylor3473f882001-02-23 17:55:21 +00002899 }
2900 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002901 buf[nbchar] = 0;
2902
Owen Taylor3473f882001-02-23 17:55:21 +00002903 /*
2904 * Ok the segment is to be consumed as chars.
2905 */
2906 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2907 if (areBlanks(ctxt, buf, nbchar)) {
2908 if (ctxt->sax->ignorableWhitespace != NULL)
2909 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2910 } else {
2911 htmlCheckParagraph(ctxt);
2912 if (ctxt->sax->characters != NULL)
2913 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2914 }
2915 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002916 } else {
2917 /*
2918 * Loop detection
2919 */
2920 if (cur == 0)
2921 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002922 }
2923}
2924
2925/**
2926 * htmlParseExternalID:
2927 * @ctxt: an HTML parser context
2928 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002929 *
2930 * Parse an External ID or a Public ID
2931 *
Owen Taylor3473f882001-02-23 17:55:21 +00002932 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2933 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2934 *
2935 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2936 *
2937 * Returns the function returns SystemLiteral and in the second
2938 * case publicID receives PubidLiteral, is strict is off
2939 * it is possible to return NULL and have publicID set.
2940 */
2941
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002942static xmlChar *
2943htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002944 xmlChar *URI = NULL;
2945
2946 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2947 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2948 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2949 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002950 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002951 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2952 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002953 }
2954 SKIP_BLANKS;
2955 URI = htmlParseSystemLiteral(ctxt);
2956 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002957 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2958 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002959 }
2960 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2961 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2962 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2963 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002964 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002965 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2966 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002967 }
2968 SKIP_BLANKS;
2969 *publicID = htmlParsePubidLiteral(ctxt);
2970 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002971 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2972 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2973 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002974 }
2975 SKIP_BLANKS;
2976 if ((CUR == '"') || (CUR == '\'')) {
2977 URI = htmlParseSystemLiteral(ctxt);
2978 }
2979 }
2980 return(URI);
2981}
2982
2983/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002984 * xmlParsePI:
2985 * @ctxt: an XML parser context
2986 *
2987 * parse an XML Processing Instruction.
2988 *
2989 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2990 */
2991static void
2992htmlParsePI(htmlParserCtxtPtr ctxt) {
2993 xmlChar *buf = NULL;
2994 int len = 0;
2995 int size = HTML_PARSER_BUFFER_SIZE;
2996 int cur, l;
2997 const xmlChar *target;
2998 xmlParserInputState state;
2999 int count = 0;
3000
3001 if ((RAW == '<') && (NXT(1) == '?')) {
3002 state = ctxt->instate;
3003 ctxt->instate = XML_PARSER_PI;
3004 /*
3005 * this is a Processing Instruction.
3006 */
3007 SKIP(2);
3008 SHRINK;
3009
3010 /*
3011 * Parse the target name and check for special support like
3012 * namespace.
3013 */
3014 target = htmlParseName(ctxt);
3015 if (target != NULL) {
3016 if (RAW == '>') {
3017 SKIP(1);
3018
3019 /*
3020 * SAX: PI detected.
3021 */
3022 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3023 (ctxt->sax->processingInstruction != NULL))
3024 ctxt->sax->processingInstruction(ctxt->userData,
3025 target, NULL);
3026 ctxt->instate = state;
3027 return;
3028 }
3029 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3030 if (buf == NULL) {
3031 htmlErrMemory(ctxt, NULL);
3032 ctxt->instate = state;
3033 return;
3034 }
3035 cur = CUR;
3036 if (!IS_BLANK(cur)) {
3037 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3038 "ParsePI: PI %s space expected\n", target, NULL);
3039 }
3040 SKIP_BLANKS;
3041 cur = CUR_CHAR(l);
3042 while (IS_CHAR(cur) && (cur != '>')) {
3043 if (len + 5 >= size) {
3044 xmlChar *tmp;
3045
3046 size *= 2;
3047 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3048 if (tmp == NULL) {
3049 htmlErrMemory(ctxt, NULL);
3050 xmlFree(buf);
3051 ctxt->instate = state;
3052 return;
3053 }
3054 buf = tmp;
3055 }
3056 count++;
3057 if (count > 50) {
3058 GROW;
3059 count = 0;
3060 }
3061 COPY_BUF(l,buf,len,cur);
3062 NEXTL(l);
3063 cur = CUR_CHAR(l);
3064 if (cur == 0) {
3065 SHRINK;
3066 GROW;
3067 cur = CUR_CHAR(l);
3068 }
3069 }
3070 buf[len] = 0;
3071 if (cur != '>') {
3072 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3073 "ParsePI: PI %s never end ...\n", target, NULL);
3074 } else {
3075 SKIP(1);
3076
3077 /*
3078 * SAX: PI detected.
3079 */
3080 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3081 (ctxt->sax->processingInstruction != NULL))
3082 ctxt->sax->processingInstruction(ctxt->userData,
3083 target, buf);
3084 }
3085 xmlFree(buf);
3086 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003087 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003088 "PI is not started correctly", NULL, NULL);
3089 }
3090 ctxt->instate = state;
3091 }
3092}
3093
3094/**
Owen Taylor3473f882001-02-23 17:55:21 +00003095 * htmlParseComment:
3096 * @ctxt: an HTML parser context
3097 *
3098 * Parse an XML (SGML) comment <!-- .... -->
3099 *
3100 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3101 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003102static void
Owen Taylor3473f882001-02-23 17:55:21 +00003103htmlParseComment(htmlParserCtxtPtr ctxt) {
3104 xmlChar *buf = NULL;
3105 int len;
3106 int size = HTML_PARSER_BUFFER_SIZE;
3107 int q, ql;
3108 int r, rl;
3109 int cur, l;
3110 xmlParserInputState state;
3111
3112 /*
3113 * Check that there is a comment right here.
3114 */
3115 if ((RAW != '<') || (NXT(1) != '!') ||
3116 (NXT(2) != '-') || (NXT(3) != '-')) return;
3117
3118 state = ctxt->instate;
3119 ctxt->instate = XML_PARSER_COMMENT;
3120 SHRINK;
3121 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003122 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003123 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003124 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003125 ctxt->instate = state;
3126 return;
3127 }
3128 q = CUR_CHAR(ql);
3129 NEXTL(ql);
3130 r = CUR_CHAR(rl);
3131 NEXTL(rl);
3132 cur = CUR_CHAR(l);
3133 len = 0;
3134 while (IS_CHAR(cur) &&
3135 ((cur != '>') ||
3136 (r != '-') || (q != '-'))) {
3137 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003138 xmlChar *tmp;
3139
Owen Taylor3473f882001-02-23 17:55:21 +00003140 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003141 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3142 if (tmp == NULL) {
3143 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003144 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003145 ctxt->instate = state;
3146 return;
3147 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003148 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003149 }
3150 COPY_BUF(ql,buf,len,q);
3151 q = r;
3152 ql = rl;
3153 r = cur;
3154 rl = l;
3155 NEXTL(l);
3156 cur = CUR_CHAR(l);
3157 if (cur == 0) {
3158 SHRINK;
3159 GROW;
3160 cur = CUR_CHAR(l);
3161 }
3162 }
3163 buf[len] = 0;
3164 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003165 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3166 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003167 xmlFree(buf);
3168 } else {
3169 NEXT;
3170 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3171 (!ctxt->disableSAX))
3172 ctxt->sax->comment(ctxt->userData, buf);
3173 xmlFree(buf);
3174 }
3175 ctxt->instate = state;
3176}
3177
3178/**
3179 * htmlParseCharRef:
3180 * @ctxt: an HTML parser context
3181 *
3182 * parse Reference declarations
3183 *
3184 * [66] CharRef ::= '&#' [0-9]+ ';' |
3185 * '&#x' [0-9a-fA-F]+ ';'
3186 *
3187 * Returns the value parsed (as an int)
3188 */
3189int
3190htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3191 int val = 0;
3192
Daniel Veillarda03e3652004-11-02 18:45:30 +00003193 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3194 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3195 "htmlParseCharRef: context error\n",
3196 NULL, NULL);
3197 return(0);
3198 }
Owen Taylor3473f882001-02-23 17:55:21 +00003199 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003200 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003201 SKIP(3);
3202 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003203 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003204 val = val * 16 + (CUR - '0');
3205 else if ((CUR >= 'a') && (CUR <= 'f'))
3206 val = val * 16 + (CUR - 'a') + 10;
3207 else if ((CUR >= 'A') && (CUR <= 'F'))
3208 val = val * 16 + (CUR - 'A') + 10;
3209 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003210 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003211 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003212 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003213 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003214 }
3215 NEXT;
3216 }
3217 if (CUR == ';')
3218 NEXT;
3219 } else if ((CUR == '&') && (NXT(1) == '#')) {
3220 SKIP(2);
3221 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003222 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003223 val = val * 10 + (CUR - '0');
3224 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003225 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003226 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003227 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003228 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003229 }
3230 NEXT;
3231 }
3232 if (CUR == ';')
3233 NEXT;
3234 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003235 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3236 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003237 }
3238 /*
3239 * Check the value IS_CHAR ...
3240 */
3241 if (IS_CHAR(val)) {
3242 return(val);
3243 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003244 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3245 "htmlParseCharRef: invalid xmlChar value %d\n",
3246 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003247 }
3248 return(0);
3249}
3250
3251
3252/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003253 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003254 * @ctxt: an HTML parser context
3255 *
3256 * parse a DOCTYPE declaration
3257 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003258 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003259 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3260 */
3261
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003262static void
Owen Taylor3473f882001-02-23 17:55:21 +00003263htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003264 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003265 xmlChar *ExternalID = NULL;
3266 xmlChar *URI = NULL;
3267
3268 /*
3269 * We know that '<!DOCTYPE' has been detected.
3270 */
3271 SKIP(9);
3272
3273 SKIP_BLANKS;
3274
3275 /*
3276 * Parse the DOCTYPE name.
3277 */
3278 name = htmlParseName(ctxt);
3279 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003280 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3281 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3282 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003283 }
3284 /*
3285 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3286 */
3287
3288 SKIP_BLANKS;
3289
3290 /*
3291 * Check for SystemID and ExternalID
3292 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003293 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003294 SKIP_BLANKS;
3295
3296 /*
3297 * We should be at the end of the DOCTYPE declaration.
3298 */
3299 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003300 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3301 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003302 /* We shouldn't try to resynchronize ... */
3303 }
3304 NEXT;
3305
3306 /*
3307 * Create or update the document accordingly to the DOCTYPE
3308 */
3309 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3310 (!ctxt->disableSAX))
3311 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3312
3313 /*
3314 * Cleanup, since we don't use all those identifiers
3315 */
3316 if (URI != NULL) xmlFree(URI);
3317 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003318}
3319
3320/**
3321 * htmlParseAttribute:
3322 * @ctxt: an HTML parser context
3323 * @value: a xmlChar ** used to store the value of the attribute
3324 *
3325 * parse an attribute
3326 *
3327 * [41] Attribute ::= Name Eq AttValue
3328 *
3329 * [25] Eq ::= S? '=' S?
3330 *
3331 * With namespace:
3332 *
3333 * [NS 11] Attribute ::= QName Eq AttValue
3334 *
3335 * Also the case QName == xmlns:??? is handled independently as a namespace
3336 * definition.
3337 *
3338 * Returns the attribute name, and the value in *value.
3339 */
3340
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003341static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003342htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003343 const xmlChar *name;
3344 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003345
3346 *value = NULL;
3347 name = htmlParseHTMLName(ctxt);
3348 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003349 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3350 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003351 return(NULL);
3352 }
3353
3354 /*
3355 * read the value
3356 */
3357 SKIP_BLANKS;
3358 if (CUR == '=') {
3359 NEXT;
3360 SKIP_BLANKS;
3361 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003362 } else if (htmlIsBooleanAttr(name)) {
3363 /*
3364 * assume a minimized attribute
3365 */
3366 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003367 }
3368
3369 *value = val;
3370 return(name);
3371}
3372
3373/**
3374 * htmlCheckEncoding:
3375 * @ctxt: an HTML parser context
3376 * @attvalue: the attribute value
3377 *
3378 * Checks an http-equiv attribute from a Meta tag to detect
3379 * the encoding
3380 * If a new encoding is detected the parser is switched to decode
3381 * it and pass UTF8
3382 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003383static void
Owen Taylor3473f882001-02-23 17:55:21 +00003384htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3385 const xmlChar *encoding;
3386
3387 if ((ctxt == NULL) || (attvalue == NULL))
3388 return;
3389
Daniel Veillarde77db162009-08-22 11:32:38 +02003390 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003391 if (ctxt->input->encoding != NULL)
3392 return;
3393
3394 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3395 if (encoding != NULL) {
3396 encoding += 8;
3397 } else {
3398 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3399 if (encoding != NULL)
3400 encoding += 9;
3401 }
3402 if (encoding != NULL) {
3403 xmlCharEncoding enc;
3404 xmlCharEncodingHandlerPtr handler;
3405
3406 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3407
3408 if (ctxt->input->encoding != NULL)
3409 xmlFree((xmlChar *) ctxt->input->encoding);
3410 ctxt->input->encoding = xmlStrdup(encoding);
3411
3412 enc = xmlParseCharEncoding((const char *) encoding);
3413 /*
3414 * registered set of known encodings
3415 */
3416 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003417 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003418 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3419 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3420 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3421 (ctxt->input->buf != NULL) &&
3422 (ctxt->input->buf->encoder == NULL)) {
3423 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3424 "htmlCheckEncoding: wrong encoding meta\n",
3425 NULL, NULL);
3426 } else {
3427 xmlSwitchEncoding(ctxt, enc);
3428 }
Owen Taylor3473f882001-02-23 17:55:21 +00003429 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3430 } else {
3431 /*
3432 * fallback for unknown encodings
3433 */
3434 handler = xmlFindCharEncodingHandler((const char *) encoding);
3435 if (handler != NULL) {
3436 xmlSwitchToEncoding(ctxt, handler);
3437 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3438 } else {
3439 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3440 }
3441 }
3442
3443 if ((ctxt->input->buf != NULL) &&
3444 (ctxt->input->buf->encoder != NULL) &&
3445 (ctxt->input->buf->raw != NULL) &&
3446 (ctxt->input->buf->buffer != NULL)) {
3447 int nbchars;
3448 int processed;
3449
3450 /*
3451 * convert as much as possible to the parser reading buffer.
3452 */
3453 processed = ctxt->input->cur - ctxt->input->base;
3454 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3455 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3456 ctxt->input->buf->buffer,
3457 ctxt->input->buf->raw);
3458 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003459 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3460 "htmlCheckEncoding: encoder error\n",
3461 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003462 }
3463 ctxt->input->base =
3464 ctxt->input->cur = ctxt->input->buf->buffer->content;
3465 }
3466 }
3467}
3468
3469/**
3470 * htmlCheckMeta:
3471 * @ctxt: an HTML parser context
3472 * @atts: the attributes values
3473 *
3474 * Checks an attributes from a Meta tag
3475 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003476static void
Owen Taylor3473f882001-02-23 17:55:21 +00003477htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3478 int i;
3479 const xmlChar *att, *value;
3480 int http = 0;
3481 const xmlChar *content = NULL;
3482
3483 if ((ctxt == NULL) || (atts == NULL))
3484 return;
3485
3486 i = 0;
3487 att = atts[i++];
3488 while (att != NULL) {
3489 value = atts[i++];
3490 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3491 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3492 http = 1;
3493 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3494 content = value;
3495 att = atts[i++];
3496 }
3497 if ((http) && (content != NULL))
3498 htmlCheckEncoding(ctxt, content);
3499
3500}
3501
3502/**
3503 * htmlParseStartTag:
3504 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003505 *
Owen Taylor3473f882001-02-23 17:55:21 +00003506 * parse a start of tag either for rule element or
3507 * EmptyElement. In both case we don't parse the tag closing chars.
3508 *
3509 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3510 *
3511 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3512 *
3513 * With namespace:
3514 *
3515 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3516 *
3517 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3518 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003519 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003520 */
3521
Daniel Veillard597f1c12005-07-03 23:00:18 +00003522static int
Owen Taylor3473f882001-02-23 17:55:21 +00003523htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003524 const xmlChar *name;
3525 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003526 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003527 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003528 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003529 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003530 int meta = 0;
3531 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003532 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003533
Daniel Veillarde77db162009-08-22 11:32:38 +02003534 if (ctxt->instate == XML_PARSER_EOF)
3535 return(-1);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003536 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3537 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3538 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003539 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003540 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003541 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003542 NEXT;
3543
Daniel Veillard30e76072006-03-09 14:13:55 +00003544 atts = ctxt->atts;
3545 maxatts = ctxt->maxatts;
3546
Owen Taylor3473f882001-02-23 17:55:21 +00003547 GROW;
3548 name = htmlParseHTMLName(ctxt);
3549 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003550 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3551 "htmlParseStartTag: invalid element name\n",
3552 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003553 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003554 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3555 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003556 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003557 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003558 }
3559 if (xmlStrEqual(name, BAD_CAST"meta"))
3560 meta = 1;
3561
3562 /*
3563 * Check for auto-closure of HTML elements.
3564 */
3565 htmlAutoClose(ctxt, name);
3566
3567 /*
3568 * Check for implied HTML elements.
3569 */
3570 htmlCheckImplied(ctxt, name);
3571
3572 /*
3573 * Avoid html at any level > 0, head at any level != 1
3574 * or any attempt to recurse body
3575 */
3576 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003577 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3578 "htmlParseStartTag: misplaced <html> tag\n",
3579 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003580 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003581 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003582 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003583 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003584 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003585 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3586 "htmlParseStartTag: misplaced <head> tag\n",
3587 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003588 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003589 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003590 }
3591 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003592 int indx;
3593 for (indx = 0;indx < ctxt->nameNr;indx++) {
3594 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003595 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3596 "htmlParseStartTag: misplaced <body> tag\n",
3597 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003598 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003599 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003600 }
3601 }
3602 }
3603
3604 /*
3605 * Now parse the attributes, it ends up with the ending
3606 *
3607 * (S Attribute)* S?
3608 */
3609 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003610 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003611 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003612 ((CUR != '/') || (NXT(1) != '>'))) {
3613 long cons = ctxt->nbChars;
3614
3615 GROW;
3616 attname = htmlParseAttribute(ctxt, &attvalue);
3617 if (attname != NULL) {
3618
3619 /*
3620 * Well formedness requires at most one declaration of an attribute
3621 */
3622 for (i = 0; i < nbatts;i += 2) {
3623 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003624 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3625 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003626 if (attvalue != NULL)
3627 xmlFree(attvalue);
3628 goto failed;
3629 }
3630 }
3631
3632 /*
3633 * Add the pair to atts
3634 */
3635 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003636 maxatts = 22; /* allow for 10 attrs by default */
3637 atts = (const xmlChar **)
3638 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003639 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003640 htmlErrMemory(ctxt, NULL);
3641 if (attvalue != NULL)
3642 xmlFree(attvalue);
3643 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003644 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003645 ctxt->atts = atts;
3646 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003647 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003648 const xmlChar **n;
3649
Owen Taylor3473f882001-02-23 17:55:21 +00003650 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003651 n = (const xmlChar **) xmlRealloc((void *) atts,
3652 maxatts * sizeof(const xmlChar *));
3653 if (n == NULL) {
3654 htmlErrMemory(ctxt, NULL);
3655 if (attvalue != NULL)
3656 xmlFree(attvalue);
3657 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003658 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003659 atts = n;
3660 ctxt->atts = atts;
3661 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003662 }
3663 atts[nbatts++] = attname;
3664 atts[nbatts++] = attvalue;
3665 atts[nbatts] = NULL;
3666 atts[nbatts + 1] = NULL;
3667 }
3668 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003669 if (attvalue != NULL)
3670 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003671 /* Dump the bogus attribute string up to the next blank or
3672 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003673 while ((IS_CHAR_CH(CUR)) &&
3674 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003675 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003676 NEXT;
3677 }
3678
3679failed:
3680 SKIP_BLANKS;
3681 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003682 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3683 "htmlParseStartTag: problem parsing attributes\n",
3684 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003685 break;
3686 }
3687 }
3688
3689 /*
3690 * Handle specific association to the META tag
3691 */
William M. Bracke978ae22007-03-21 06:16:02 +00003692 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003693 htmlCheckMeta(ctxt, atts);
3694
3695 /*
3696 * SAX: Start of Element !
3697 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003698 if (!discardtag) {
3699 htmlnamePush(ctxt, name);
3700 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3701 if (nbatts != 0)
3702 ctxt->sax->startElement(ctxt->userData, name, atts);
3703 else
3704 ctxt->sax->startElement(ctxt->userData, name, NULL);
3705 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003706 }
Owen Taylor3473f882001-02-23 17:55:21 +00003707
3708 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003709 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003710 if (atts[i] != NULL)
3711 xmlFree((xmlChar *) atts[i]);
3712 }
Owen Taylor3473f882001-02-23 17:55:21 +00003713 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003714
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003715 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003716}
3717
3718/**
3719 * htmlParseEndTag:
3720 * @ctxt: an HTML parser context
3721 *
3722 * parse an end of tag
3723 *
3724 * [42] ETag ::= '</' Name S? '>'
3725 *
3726 * With namespace
3727 *
3728 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003729 *
3730 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003731 */
3732
Daniel Veillardf420ac52001-07-04 16:04:09 +00003733static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003734htmlParseEndTag(htmlParserCtxtPtr ctxt)
3735{
3736 const xmlChar *name;
3737 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003738 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003739
3740 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003741 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3742 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003743 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003744 }
3745 SKIP(2);
3746
3747 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003748 if (name == NULL)
3749 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003750 /*
3751 * We should definitely be at the ending "S? '>'" part
3752 */
3753 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003754 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003755 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3756 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003757 if (ctxt->recovery) {
3758 /*
3759 * We're not at the ending > !!
3760 * Error, unless in recover mode where we search forwards
3761 * until we find a >
3762 */
3763 while (CUR != '\0' && CUR != '>') NEXT;
3764 NEXT;
3765 }
Owen Taylor3473f882001-02-23 17:55:21 +00003766 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003767 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003768
3769 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003770 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3771 * out now.
3772 */
3773 if ((ctxt->depth > 0) &&
3774 (xmlStrEqual(name, BAD_CAST "html") ||
3775 xmlStrEqual(name, BAD_CAST "body") ||
3776 xmlStrEqual(name, BAD_CAST "head"))) {
3777 ctxt->depth--;
3778 return (0);
3779 }
3780
3781 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003782 * If the name read is not one of the element in the parsing stack
3783 * then return, it's just an error.
3784 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003785 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3786 if (xmlStrEqual(name, ctxt->nameTab[i]))
3787 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003788 }
3789 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003790 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3791 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003792 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003793 }
3794
3795
3796 /*
3797 * Check for auto-closure of HTML elements.
3798 */
3799
3800 htmlAutoCloseOnClose(ctxt, name);
3801
3802 /*
3803 * Well formedness constraints, opening and closing must match.
3804 * With the exception that the autoclose may have popped stuff out
3805 * of the stack.
3806 */
3807 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003808 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003809 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3810 "Opening and ending tag mismatch: %s and %s\n",
3811 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003812 }
3813 }
3814
3815 /*
3816 * SAX: End of Tag
3817 */
3818 oldname = ctxt->name;
3819 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003820 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3821 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003822 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003823 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003824 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003825 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003826 }
3827
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003828 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003829}
3830
3831
3832/**
3833 * htmlParseReference:
3834 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003835 *
Owen Taylor3473f882001-02-23 17:55:21 +00003836 * parse and handle entity references in content,
3837 * this will end-up in a call to character() since this is either a
3838 * CharRef, or a predefined entity.
3839 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003840static void
Owen Taylor3473f882001-02-23 17:55:21 +00003841htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003842 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003843 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003844 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003845 if (CUR != '&') return;
3846
3847 if (NXT(1) == '#') {
3848 unsigned int c;
3849 int bits, i = 0;
3850
3851 c = htmlParseCharRef(ctxt);
3852 if (c == 0)
3853 return;
3854
3855 if (c < 0x80) { out[i++]= c; bits= -6; }
3856 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3857 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3858 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003859
Owen Taylor3473f882001-02-23 17:55:21 +00003860 for ( ; bits >= 0; bits-= 6) {
3861 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3862 }
3863 out[i] = 0;
3864
3865 htmlCheckParagraph(ctxt);
3866 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3867 ctxt->sax->characters(ctxt->userData, out, i);
3868 } else {
3869 ent = htmlParseEntityRef(ctxt, &name);
3870 if (name == NULL) {
3871 htmlCheckParagraph(ctxt);
3872 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3873 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3874 return;
3875 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003876 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003877 htmlCheckParagraph(ctxt);
3878 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3879 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3880 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3881 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3882 }
3883 } else {
3884 unsigned int c;
3885 int bits, i = 0;
3886
3887 c = ent->value;
3888 if (c < 0x80)
3889 { out[i++]= c; bits= -6; }
3890 else if (c < 0x800)
3891 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3892 else if (c < 0x10000)
3893 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003894 else
Owen Taylor3473f882001-02-23 17:55:21 +00003895 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003896
Owen Taylor3473f882001-02-23 17:55:21 +00003897 for ( ; bits >= 0; bits-= 6) {
3898 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3899 }
3900 out[i] = 0;
3901
3902 htmlCheckParagraph(ctxt);
3903 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3904 ctxt->sax->characters(ctxt->userData, out, i);
3905 }
Owen Taylor3473f882001-02-23 17:55:21 +00003906 }
3907}
3908
3909/**
3910 * htmlParseContent:
3911 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003912 *
3913 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003914 */
3915
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003916static void
Owen Taylor3473f882001-02-23 17:55:21 +00003917htmlParseContent(htmlParserCtxtPtr ctxt) {
3918 xmlChar *currentNode;
3919 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003920 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003921
3922 currentNode = xmlStrdup(ctxt->name);
3923 depth = ctxt->nameNr;
3924 while (1) {
3925 long cons = ctxt->nbChars;
3926
3927 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02003928
3929 if (ctxt->instate == XML_PARSER_EOF)
3930 break;
3931
Owen Taylor3473f882001-02-23 17:55:21 +00003932 /*
3933 * Our tag or one of it's parent or children is ending.
3934 */
3935 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003936 if (htmlParseEndTag(ctxt) &&
3937 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3938 if (currentNode != NULL)
3939 xmlFree(currentNode);
3940 return;
3941 }
3942 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003943 }
3944
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003945 else if ((CUR == '<') &&
3946 ((IS_ASCII_LETTER(NXT(1))) ||
3947 (NXT(1) == '_') || (NXT(1) == ':'))) {
3948 name = htmlParseHTMLName_nonInvasive(ctxt);
3949 if (name == NULL) {
3950 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3951 "htmlParseStartTag: invalid element name\n",
3952 NULL, NULL);
3953 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003954 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003955 NEXT;
3956
3957 if (currentNode != NULL)
3958 xmlFree(currentNode);
3959 return;
3960 }
3961
3962 if (ctxt->name != NULL) {
3963 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3964 htmlAutoClose(ctxt, name);
3965 continue;
3966 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003967 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003968 }
3969
Owen Taylor3473f882001-02-23 17:55:21 +00003970 /*
3971 * Has this node been popped out during parsing of
3972 * the next element
3973 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003974 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3975 (!xmlStrEqual(currentNode, ctxt->name)))
3976 {
Owen Taylor3473f882001-02-23 17:55:21 +00003977 if (currentNode != NULL) xmlFree(currentNode);
3978 return;
3979 }
3980
Daniel Veillardf9533d12001-03-03 10:04:57 +00003981 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3982 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003983 /*
3984 * Handle SCRIPT/STYLE separately
3985 */
3986 htmlParseScript(ctxt);
3987 } else {
3988 /*
3989 * Sometimes DOCTYPE arrives in the middle of the document
3990 */
3991 if ((CUR == '<') && (NXT(1) == '!') &&
3992 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3993 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3994 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3995 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003996 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3997 "Misplaced DOCTYPE declaration\n",
3998 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003999 htmlParseDocTypeDecl(ctxt);
4000 }
4001
4002 /*
4003 * First case : a comment
4004 */
4005 if ((CUR == '<') && (NXT(1) == '!') &&
4006 (NXT(2) == '-') && (NXT(3) == '-')) {
4007 htmlParseComment(ctxt);
4008 }
4009
4010 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004011 * Second case : a Processing Instruction.
4012 */
4013 else if ((CUR == '<') && (NXT(1) == '?')) {
4014 htmlParsePI(ctxt);
4015 }
4016
4017 /*
4018 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004019 */
4020 else if (CUR == '<') {
4021 htmlParseElement(ctxt);
4022 }
4023
4024 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004025 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004026 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004027 */
4028 else if (CUR == '&') {
4029 htmlParseReference(ctxt);
4030 }
4031
4032 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004033 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004034 */
4035 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004036 htmlAutoCloseOnEnd(ctxt);
4037 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004038 }
4039
4040 /*
4041 * Last case, text. Note that References are handled directly.
4042 */
4043 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004044 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004045 }
4046
4047 if (cons == ctxt->nbChars) {
4048 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004049 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4050 "detected an error in element content\n",
4051 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004052 }
4053 break;
4054 }
4055 }
4056 GROW;
4057 }
4058 if (currentNode != NULL) xmlFree(currentNode);
4059}
4060
4061/**
Daniel Veillard499cc922006-01-18 17:22:35 +00004062 * htmlParseContent:
4063 * @ctxt: an HTML parser context
4064 *
4065 * Parse a content: comment, sub-element, reference or text.
4066 */
4067
4068void
4069__htmlParseContent(void *ctxt) {
4070 if (ctxt != NULL)
4071 htmlParseContent((htmlParserCtxtPtr) ctxt);
4072}
4073
4074/**
Owen Taylor3473f882001-02-23 17:55:21 +00004075 * htmlParseElement:
4076 * @ctxt: an HTML parser context
4077 *
4078 * parse an HTML element, this is highly recursive
4079 *
4080 * [39] element ::= EmptyElemTag | STag content ETag
4081 *
4082 * [41] Attribute ::= Name Eq AttValue
4083 */
4084
4085void
4086htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004087 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004088 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004089 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004090 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004091 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004092 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004093 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004094
Daniel Veillarda03e3652004-11-02 18:45:30 +00004095 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4096 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004097 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004098 return;
4099 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004100
4101 if (ctxt->instate == XML_PARSER_EOF)
4102 return;
4103
Owen Taylor3473f882001-02-23 17:55:21 +00004104 /* Capture start position */
4105 if (ctxt->record_info) {
4106 node_info.begin_pos = ctxt->input->consumed +
4107 (CUR_PTR - ctxt->input->base);
4108 node_info.begin_line = ctxt->input->line;
4109 }
4110
Daniel Veillard597f1c12005-07-03 23:00:18 +00004111 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004112 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004113 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004114 if (CUR == '>')
4115 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004116 return;
4117 }
Owen Taylor3473f882001-02-23 17:55:21 +00004118
4119 /*
4120 * Lookup the info for that element.
4121 */
4122 info = htmlTagLookup(name);
4123 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004124 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4125 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004126 }
4127
4128 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004129 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004130 */
4131 if ((CUR == '/') && (NXT(1) == '>')) {
4132 SKIP(2);
4133 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4134 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004135 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004136 return;
4137 }
4138
4139 if (CUR == '>') {
4140 NEXT;
4141 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004142 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4143 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004144
4145 /*
4146 * end of parsing of this node.
4147 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004148 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004149 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004150 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004151 }
Owen Taylor3473f882001-02-23 17:55:21 +00004152
4153 /*
4154 * Capture end position and add node
4155 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004156 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004157 node_info.end_pos = ctxt->input->consumed +
4158 (CUR_PTR - ctxt->input->base);
4159 node_info.end_line = ctxt->input->line;
4160 node_info.node = ctxt->node;
4161 xmlParserAddNodeInfo(ctxt, &node_info);
4162 }
4163 return;
4164 }
4165
4166 /*
4167 * Check for an Empty Element from DTD definition
4168 */
4169 if ((info != NULL) && (info->empty)) {
4170 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4171 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004172 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004173 return;
4174 }
4175
4176 /*
4177 * Parse the content of the element:
4178 */
4179 currentNode = xmlStrdup(ctxt->name);
4180 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004181 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004182 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004183 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004184 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004185 if (ctxt->nameNr < depth) break;
4186 }
Owen Taylor3473f882001-02-23 17:55:21 +00004187
Owen Taylor3473f882001-02-23 17:55:21 +00004188 /*
4189 * Capture end position and add node
4190 */
4191 if ( currentNode != NULL && ctxt->record_info ) {
4192 node_info.end_pos = ctxt->input->consumed +
4193 (CUR_PTR - ctxt->input->base);
4194 node_info.end_line = ctxt->input->line;
4195 node_info.node = ctxt->node;
4196 xmlParserAddNodeInfo(ctxt, &node_info);
4197 }
William M. Brack76e95df2003-10-18 16:20:14 +00004198 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004199 htmlAutoCloseOnEnd(ctxt);
4200 }
4201
Owen Taylor3473f882001-02-23 17:55:21 +00004202 if (currentNode != NULL)
4203 xmlFree(currentNode);
4204}
4205
4206/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004207 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004208 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004209 *
Owen Taylor3473f882001-02-23 17:55:21 +00004210 * parse an HTML document (and build a tree if using the standard SAX
4211 * interface).
4212 *
4213 * Returns 0, -1 in case of error. the parser context is augmented
4214 * as a result of the parsing.
4215 */
4216
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004217int
Owen Taylor3473f882001-02-23 17:55:21 +00004218htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004219 xmlChar start[4];
4220 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004221 xmlDtdPtr dtd;
4222
Daniel Veillardd0463562001-10-13 09:15:48 +00004223 xmlInitParser();
4224
Owen Taylor3473f882001-02-23 17:55:21 +00004225 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004226
Daniel Veillarda03e3652004-11-02 18:45:30 +00004227 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4228 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4229 "htmlParseDocument: context error\n", NULL, NULL);
4230 return(XML_ERR_INTERNAL_ERROR);
4231 }
4232 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004233 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004234 GROW;
4235 /*
4236 * SAX: beginning of the document processing.
4237 */
4238 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4239 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4240
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004241 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4242 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4243 /*
4244 * Get the 4 first bytes and decode the charset
4245 * if enc != XML_CHAR_ENCODING_NONE
4246 * plug some encoding conversion routines.
4247 */
4248 start[0] = RAW;
4249 start[1] = NXT(1);
4250 start[2] = NXT(2);
4251 start[3] = NXT(3);
4252 enc = xmlDetectCharEncoding(&start[0], 4);
4253 if (enc != XML_CHAR_ENCODING_NONE) {
4254 xmlSwitchEncoding(ctxt, enc);
4255 }
4256 }
4257
Owen Taylor3473f882001-02-23 17:55:21 +00004258 /*
4259 * Wipe out everything which is before the first '<'
4260 */
4261 SKIP_BLANKS;
4262 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004263 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004264 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004265 }
4266
4267 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4268 ctxt->sax->startDocument(ctxt->userData);
4269
4270
4271 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004272 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004273 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004274 while (((CUR == '<') && (NXT(1) == '!') &&
4275 (NXT(2) == '-') && (NXT(3) == '-')) ||
4276 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004277 htmlParseComment(ctxt);
4278 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004279 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004280 }
Owen Taylor3473f882001-02-23 17:55:21 +00004281
4282
4283 /*
4284 * Then possibly doc type declaration(s) and more Misc
4285 * (doctypedecl Misc*)?
4286 */
4287 if ((CUR == '<') && (NXT(1) == '!') &&
4288 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4289 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4290 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4291 (UPP(8) == 'E')) {
4292 htmlParseDocTypeDecl(ctxt);
4293 }
4294 SKIP_BLANKS;
4295
4296 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004297 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004298 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004299 while (((CUR == '<') && (NXT(1) == '!') &&
4300 (NXT(2) == '-') && (NXT(3) == '-')) ||
4301 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004302 htmlParseComment(ctxt);
4303 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004304 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004305 }
Owen Taylor3473f882001-02-23 17:55:21 +00004306
4307 /*
4308 * Time to start parsing the tree itself
4309 */
4310 htmlParseContent(ctxt);
4311
4312 /*
4313 * autoclose
4314 */
4315 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004316 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004317
4318
4319 /*
4320 * SAX: end of the document processing.
4321 */
4322 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4323 ctxt->sax->endDocument(ctxt->userData);
4324
4325 if (ctxt->myDoc != NULL) {
4326 dtd = xmlGetIntSubset(ctxt->myDoc);
4327 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004328 ctxt->myDoc->intSubset =
4329 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004330 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4331 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4332 }
4333 if (! ctxt->wellFormed) return(-1);
4334 return(0);
4335}
4336
4337
4338/************************************************************************
4339 * *
4340 * Parser contexts handling *
4341 * *
4342 ************************************************************************/
4343
4344/**
William M. Brackedb65a72004-02-06 07:36:04 +00004345 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004346 * @ctxt: an HTML parser context
4347 *
4348 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004349 *
4350 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004351 */
4352
Daniel Veillardf403d292003-10-05 13:51:35 +00004353static int
Owen Taylor3473f882001-02-23 17:55:21 +00004354htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4355{
4356 htmlSAXHandler *sax;
4357
Daniel Veillardf403d292003-10-05 13:51:35 +00004358 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004359 memset(ctxt, 0, sizeof(htmlParserCtxt));
4360
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004361 ctxt->dict = xmlDictCreate();
4362 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004363 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4364 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004365 }
Owen Taylor3473f882001-02-23 17:55:21 +00004366 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4367 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004368 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4369 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004370 }
4371 else
4372 memset(sax, 0, sizeof(htmlSAXHandler));
4373
4374 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004375 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004376 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4377 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004378 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004379 ctxt->inputNr = 0;
4380 ctxt->inputMax = 0;
4381 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004382 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004383 }
4384 ctxt->inputNr = 0;
4385 ctxt->inputMax = 5;
4386 ctxt->input = NULL;
4387 ctxt->version = NULL;
4388 ctxt->encoding = NULL;
4389 ctxt->standalone = -1;
4390 ctxt->instate = XML_PARSER_START;
4391
4392 /* Allocate the Node stack */
4393 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4394 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004395 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004396 ctxt->nodeNr = 0;
4397 ctxt->nodeMax = 0;
4398 ctxt->node = NULL;
4399 ctxt->inputNr = 0;
4400 ctxt->inputMax = 0;
4401 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004402 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004403 }
4404 ctxt->nodeNr = 0;
4405 ctxt->nodeMax = 10;
4406 ctxt->node = NULL;
4407
4408 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004409 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004410 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004411 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004412 ctxt->nameNr = 0;
4413 ctxt->nameMax = 10;
4414 ctxt->name = NULL;
4415 ctxt->nodeNr = 0;
4416 ctxt->nodeMax = 0;
4417 ctxt->node = NULL;
4418 ctxt->inputNr = 0;
4419 ctxt->inputMax = 0;
4420 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004421 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004422 }
4423 ctxt->nameNr = 0;
4424 ctxt->nameMax = 10;
4425 ctxt->name = NULL;
4426
Daniel Veillard092643b2003-09-25 14:29:29 +00004427 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004428 else {
4429 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004430 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004431 }
4432 ctxt->userData = ctxt;
4433 ctxt->myDoc = NULL;
4434 ctxt->wellFormed = 1;
4435 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004436 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004437 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004438 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004439 ctxt->vctxt.userData = ctxt;
4440 ctxt->vctxt.error = xmlParserValidityError;
4441 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004442 ctxt->record_info = 0;
4443 ctxt->validate = 0;
4444 ctxt->nbChars = 0;
4445 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004446 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004447 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004448 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004449}
4450
4451/**
4452 * htmlFreeParserCtxt:
4453 * @ctxt: an HTML parser context
4454 *
4455 * Free all the memory used by a parser context. However the parsed
4456 * document in ctxt->myDoc is not freed.
4457 */
4458
4459void
4460htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4461{
4462 xmlFreeParserCtxt(ctxt);
4463}
4464
4465/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004466 * htmlNewParserCtxt:
4467 *
4468 * Allocate and initialize a new parser context.
4469 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004470 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004471 */
4472
Daniel Veillard34c647c2006-09-21 06:53:59 +00004473htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004474htmlNewParserCtxt(void)
4475{
4476 xmlParserCtxtPtr ctxt;
4477
4478 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4479 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004480 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004481 return(NULL);
4482 }
4483 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004484 if (htmlInitParserCtxt(ctxt) < 0) {
4485 htmlFreeParserCtxt(ctxt);
4486 return(NULL);
4487 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004488 return(ctxt);
4489}
4490
4491/**
4492 * htmlCreateMemoryParserCtxt:
4493 * @buffer: a pointer to a char array
4494 * @size: the size of the array
4495 *
4496 * Create a parser context for an HTML in-memory document.
4497 *
4498 * Returns the new parser context or NULL
4499 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004500htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004501htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4502 xmlParserCtxtPtr ctxt;
4503 xmlParserInputPtr input;
4504 xmlParserInputBufferPtr buf;
4505
4506 if (buffer == NULL)
4507 return(NULL);
4508 if (size <= 0)
4509 return(NULL);
4510
4511 ctxt = htmlNewParserCtxt();
4512 if (ctxt == NULL)
4513 return(NULL);
4514
4515 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4516 if (buf == NULL) return(NULL);
4517
4518 input = xmlNewInputStream(ctxt);
4519 if (input == NULL) {
4520 xmlFreeParserCtxt(ctxt);
4521 return(NULL);
4522 }
4523
4524 input->filename = NULL;
4525 input->buf = buf;
4526 input->base = input->buf->buffer->content;
4527 input->cur = input->buf->buffer->content;
4528 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4529
4530 inputPush(ctxt, input);
4531 return(ctxt);
4532}
4533
4534/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004535 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004536 * @cur: a pointer to an array of xmlChar
4537 * @encoding: a free form C string describing the HTML document encoding, or NULL
4538 *
4539 * Create a parser context for an HTML document.
4540 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004541 * TODO: check the need to add encoding handling there
4542 *
Owen Taylor3473f882001-02-23 17:55:21 +00004543 * Returns the new parser context or NULL
4544 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004545static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004546htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004547 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004548 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004549
Daniel Veillard1d995272002-07-22 16:43:32 +00004550 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004551 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004552 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004553 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004554 if (ctxt == NULL)
4555 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004556
4557 if (encoding != NULL) {
4558 xmlCharEncoding enc;
4559 xmlCharEncodingHandlerPtr handler;
4560
4561 if (ctxt->input->encoding != NULL)
4562 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004563 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004564
4565 enc = xmlParseCharEncoding(encoding);
4566 /*
4567 * registered set of known encodings
4568 */
4569 if (enc != XML_CHAR_ENCODING_ERROR) {
4570 xmlSwitchEncoding(ctxt, enc);
4571 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004572 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02004573 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00004574 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004575 }
4576 } else {
4577 /*
4578 * fallback for unknown encodings
4579 */
4580 handler = xmlFindCharEncodingHandler((const char *) encoding);
4581 if (handler != NULL) {
4582 xmlSwitchToEncoding(ctxt, handler);
4583 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004584 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4585 "Unsupported encoding %s\n",
4586 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004587 }
4588 }
4589 }
4590 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004591}
4592
Daniel Veillard73b013f2003-09-30 12:36:01 +00004593#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004594/************************************************************************
4595 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02004596 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00004597 * *
4598 ************************************************************************/
4599
4600/**
4601 * htmlParseLookupSequence:
4602 * @ctxt: an HTML parser context
4603 * @first: the first char to lookup
4604 * @next: the next char to lookup or zero
4605 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004606 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004607 *
4608 * Try to find if a sequence (first, next, third) or just (first next) or
4609 * (first) is available in the input stream.
4610 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4611 * to avoid rescanning sequences of bytes, it DOES change the state of the
4612 * parser, do not use liberally.
4613 * This is basically similar to xmlParseLookupSequence()
4614 *
4615 * Returns the index to the current parsing point if the full sequence
4616 * is available, -1 otherwise.
4617 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004618static int
Owen Taylor3473f882001-02-23 17:55:21 +00004619htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02004620 xmlChar next, xmlChar third, int iscomment,
4621 int ignoreattrval) {
Owen Taylor3473f882001-02-23 17:55:21 +00004622 int base, len;
4623 htmlParserInputPtr in;
4624 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004625 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02004626 int invalue = 0;
4627 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00004628
4629 in = ctxt->input;
4630 if (in == NULL) return(-1);
4631 base = in->cur - in->base;
4632 if (base < 0) return(-1);
4633 if (ctxt->checkIndex > base)
4634 base = ctxt->checkIndex;
4635 if (in->buf == NULL) {
4636 buf = in->base;
4637 len = in->length;
4638 } else {
4639 buf = in->buf->buffer->content;
4640 len = in->buf->buffer->use;
4641 }
4642 /* take into account the sequence length */
4643 if (third) len -= 2;
4644 else if (next) len --;
4645 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004646 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004647 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4648 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4649 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004650 /* do not increment past <! - some people use <!--> */
4651 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004652 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004653 }
Jiri Netolicky446e1262009-08-07 17:05:36 +02004654 if (ignoreattrval) {
4655 if (buf[base] == '"' || buf[base] == '\'') {
4656 if (invalue) {
4657 if (buf[base] == valdellim) {
4658 invalue = 0;
4659 continue;
4660 }
4661 } else {
4662 valdellim = buf[base];
4663 invalue = 1;
4664 continue;
4665 }
4666 } else if (invalue) {
4667 continue;
4668 }
4669 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004670 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004671 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004672 return(-1);
4673 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4674 (buf[base + 2] == '>')) {
4675 incomment = 0;
4676 base += 2;
4677 }
4678 continue;
4679 }
Owen Taylor3473f882001-02-23 17:55:21 +00004680 if (buf[base] == first) {
4681 if (third != 0) {
4682 if ((buf[base + 1] != next) ||
4683 (buf[base + 2] != third)) continue;
4684 } else if (next != 0) {
4685 if (buf[base + 1] != next) continue;
4686 }
4687 ctxt->checkIndex = 0;
4688#ifdef DEBUG_PUSH
4689 if (next == 0)
4690 xmlGenericError(xmlGenericErrorContext,
4691 "HPP: lookup '%c' found at %d\n",
4692 first, base);
4693 else if (third == 0)
4694 xmlGenericError(xmlGenericErrorContext,
4695 "HPP: lookup '%c%c' found at %d\n",
4696 first, next, base);
Daniel Veillarde77db162009-08-22 11:32:38 +02004697 else
Owen Taylor3473f882001-02-23 17:55:21 +00004698 xmlGenericError(xmlGenericErrorContext,
4699 "HPP: lookup '%c%c%c' found at %d\n",
4700 first, next, third, base);
4701#endif
4702 return(base - (in->cur - in->base));
4703 }
4704 }
4705 ctxt->checkIndex = base;
4706#ifdef DEBUG_PUSH
4707 if (next == 0)
4708 xmlGenericError(xmlGenericErrorContext,
4709 "HPP: lookup '%c' failed\n", first);
4710 else if (third == 0)
4711 xmlGenericError(xmlGenericErrorContext,
4712 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02004713 else
Owen Taylor3473f882001-02-23 17:55:21 +00004714 xmlGenericError(xmlGenericErrorContext,
4715 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4716#endif
4717 return(-1);
4718}
4719
4720/**
4721 * htmlParseTryOrFinish:
4722 * @ctxt: an HTML parser context
4723 * @terminate: last chunk indicator
4724 *
4725 * Try to progress on parsing
4726 *
4727 * Returns zero if no parsing was possible
4728 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004729static int
Owen Taylor3473f882001-02-23 17:55:21 +00004730htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4731 int ret = 0;
4732 htmlParserInputPtr in;
4733 int avail = 0;
4734 xmlChar cur, next;
4735
4736#ifdef DEBUG_PUSH
4737 switch (ctxt->instate) {
4738 case XML_PARSER_EOF:
4739 xmlGenericError(xmlGenericErrorContext,
4740 "HPP: try EOF\n"); break;
4741 case XML_PARSER_START:
4742 xmlGenericError(xmlGenericErrorContext,
4743 "HPP: try START\n"); break;
4744 case XML_PARSER_MISC:
4745 xmlGenericError(xmlGenericErrorContext,
4746 "HPP: try MISC\n");break;
4747 case XML_PARSER_COMMENT:
4748 xmlGenericError(xmlGenericErrorContext,
4749 "HPP: try COMMENT\n");break;
4750 case XML_PARSER_PROLOG:
4751 xmlGenericError(xmlGenericErrorContext,
4752 "HPP: try PROLOG\n");break;
4753 case XML_PARSER_START_TAG:
4754 xmlGenericError(xmlGenericErrorContext,
4755 "HPP: try START_TAG\n");break;
4756 case XML_PARSER_CONTENT:
4757 xmlGenericError(xmlGenericErrorContext,
4758 "HPP: try CONTENT\n");break;
4759 case XML_PARSER_CDATA_SECTION:
4760 xmlGenericError(xmlGenericErrorContext,
4761 "HPP: try CDATA_SECTION\n");break;
4762 case XML_PARSER_END_TAG:
4763 xmlGenericError(xmlGenericErrorContext,
4764 "HPP: try END_TAG\n");break;
4765 case XML_PARSER_ENTITY_DECL:
4766 xmlGenericError(xmlGenericErrorContext,
4767 "HPP: try ENTITY_DECL\n");break;
4768 case XML_PARSER_ENTITY_VALUE:
4769 xmlGenericError(xmlGenericErrorContext,
4770 "HPP: try ENTITY_VALUE\n");break;
4771 case XML_PARSER_ATTRIBUTE_VALUE:
4772 xmlGenericError(xmlGenericErrorContext,
4773 "HPP: try ATTRIBUTE_VALUE\n");break;
4774 case XML_PARSER_DTD:
4775 xmlGenericError(xmlGenericErrorContext,
4776 "HPP: try DTD\n");break;
4777 case XML_PARSER_EPILOG:
4778 xmlGenericError(xmlGenericErrorContext,
4779 "HPP: try EPILOG\n");break;
4780 case XML_PARSER_PI:
4781 xmlGenericError(xmlGenericErrorContext,
4782 "HPP: try PI\n");break;
4783 case XML_PARSER_SYSTEM_LITERAL:
4784 xmlGenericError(xmlGenericErrorContext,
4785 "HPP: try SYSTEM_LITERAL\n");break;
4786 }
4787#endif
4788
4789 while (1) {
4790
4791 in = ctxt->input;
4792 if (in == NULL) break;
4793 if (in->buf == NULL)
4794 avail = in->length - (in->cur - in->base);
4795 else
4796 avail = in->buf->buffer->use - (in->cur - in->base);
4797 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004798 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004799 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004800 /*
4801 * SAX: end of the document processing.
4802 */
4803 ctxt->instate = XML_PARSER_EOF;
4804 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4805 ctxt->sax->endDocument(ctxt->userData);
4806 }
4807 }
4808 if (avail < 1)
4809 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004810 cur = in->cur[0];
4811 if (cur == 0) {
4812 SKIP(1);
4813 continue;
4814 }
4815
Owen Taylor3473f882001-02-23 17:55:21 +00004816 switch (ctxt->instate) {
4817 case XML_PARSER_EOF:
4818 /*
4819 * Document parsing is done !
4820 */
4821 goto done;
4822 case XML_PARSER_START:
4823 /*
4824 * Very first chars read from the document flow.
4825 */
4826 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004827 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004828 SKIP_BLANKS;
4829 if (in->buf == NULL)
4830 avail = in->length - (in->cur - in->base);
4831 else
4832 avail = in->buf->buffer->use - (in->cur - in->base);
4833 }
4834 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4835 ctxt->sax->setDocumentLocator(ctxt->userData,
4836 &xmlDefaultSAXLocator);
4837 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4838 (!ctxt->disableSAX))
4839 ctxt->sax->startDocument(ctxt->userData);
4840
4841 cur = in->cur[0];
4842 next = in->cur[1];
4843 if ((cur == '<') && (next == '!') &&
4844 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4845 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4846 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4847 (UPP(8) == 'E')) {
4848 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004849 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004850 goto done;
4851#ifdef DEBUG_PUSH
4852 xmlGenericError(xmlGenericErrorContext,
4853 "HPP: Parsing internal subset\n");
4854#endif
4855 htmlParseDocTypeDecl(ctxt);
4856 ctxt->instate = XML_PARSER_PROLOG;
4857#ifdef DEBUG_PUSH
4858 xmlGenericError(xmlGenericErrorContext,
4859 "HPP: entering PROLOG\n");
4860#endif
4861 } else {
4862 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004863#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004864 xmlGenericError(xmlGenericErrorContext,
4865 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004866#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004867 }
Owen Taylor3473f882001-02-23 17:55:21 +00004868 break;
4869 case XML_PARSER_MISC:
4870 SKIP_BLANKS;
4871 if (in->buf == NULL)
4872 avail = in->length - (in->cur - in->base);
4873 else
4874 avail = in->buf->buffer->use - (in->cur - in->base);
4875 if (avail < 2)
4876 goto done;
4877 cur = in->cur[0];
4878 next = in->cur[1];
4879 if ((cur == '<') && (next == '!') &&
4880 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4881 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004882 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004883 goto done;
4884#ifdef DEBUG_PUSH
4885 xmlGenericError(xmlGenericErrorContext,
4886 "HPP: Parsing Comment\n");
4887#endif
4888 htmlParseComment(ctxt);
4889 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004890 } else if ((cur == '<') && (next == '?')) {
4891 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004892 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004893 goto done;
4894#ifdef DEBUG_PUSH
4895 xmlGenericError(xmlGenericErrorContext,
4896 "HPP: Parsing PI\n");
4897#endif
4898 htmlParsePI(ctxt);
4899 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004900 } else if ((cur == '<') && (next == '!') &&
4901 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4902 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4903 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4904 (UPP(8) == 'E')) {
4905 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004906 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004907 goto done;
4908#ifdef DEBUG_PUSH
4909 xmlGenericError(xmlGenericErrorContext,
4910 "HPP: Parsing internal subset\n");
4911#endif
4912 htmlParseDocTypeDecl(ctxt);
4913 ctxt->instate = XML_PARSER_PROLOG;
4914#ifdef DEBUG_PUSH
4915 xmlGenericError(xmlGenericErrorContext,
4916 "HPP: entering PROLOG\n");
4917#endif
4918 } else if ((cur == '<') && (next == '!') &&
4919 (avail < 9)) {
4920 goto done;
4921 } else {
4922 ctxt->instate = XML_PARSER_START_TAG;
4923#ifdef DEBUG_PUSH
4924 xmlGenericError(xmlGenericErrorContext,
4925 "HPP: entering START_TAG\n");
4926#endif
4927 }
4928 break;
4929 case XML_PARSER_PROLOG:
4930 SKIP_BLANKS;
4931 if (in->buf == NULL)
4932 avail = in->length - (in->cur - in->base);
4933 else
4934 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02004935 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00004936 goto done;
4937 cur = in->cur[0];
4938 next = in->cur[1];
4939 if ((cur == '<') && (next == '!') &&
4940 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4941 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004942 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004943 goto done;
4944#ifdef DEBUG_PUSH
4945 xmlGenericError(xmlGenericErrorContext,
4946 "HPP: Parsing Comment\n");
4947#endif
4948 htmlParseComment(ctxt);
4949 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004950 } else if ((cur == '<') && (next == '?')) {
4951 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004952 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004953 goto done;
4954#ifdef DEBUG_PUSH
4955 xmlGenericError(xmlGenericErrorContext,
4956 "HPP: Parsing PI\n");
4957#endif
4958 htmlParsePI(ctxt);
4959 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004960 } else if ((cur == '<') && (next == '!') &&
4961 (avail < 4)) {
4962 goto done;
4963 } else {
4964 ctxt->instate = XML_PARSER_START_TAG;
4965#ifdef DEBUG_PUSH
4966 xmlGenericError(xmlGenericErrorContext,
4967 "HPP: entering START_TAG\n");
4968#endif
4969 }
4970 break;
4971 case XML_PARSER_EPILOG:
4972 if (in->buf == NULL)
4973 avail = in->length - (in->cur - in->base);
4974 else
4975 avail = in->buf->buffer->use - (in->cur - in->base);
4976 if (avail < 1)
4977 goto done;
4978 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004979 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004980 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004981 goto done;
4982 }
4983 if (avail < 2)
4984 goto done;
4985 next = in->cur[1];
4986 if ((cur == '<') && (next == '!') &&
4987 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4988 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004989 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004990 goto done;
4991#ifdef DEBUG_PUSH
4992 xmlGenericError(xmlGenericErrorContext,
4993 "HPP: Parsing Comment\n");
4994#endif
4995 htmlParseComment(ctxt);
4996 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004997 } else if ((cur == '<') && (next == '?')) {
4998 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004999 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005000 goto done;
5001#ifdef DEBUG_PUSH
5002 xmlGenericError(xmlGenericErrorContext,
5003 "HPP: Parsing PI\n");
5004#endif
5005 htmlParsePI(ctxt);
5006 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005007 } else if ((cur == '<') && (next == '!') &&
5008 (avail < 4)) {
5009 goto done;
5010 } else {
5011 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005012 ctxt->wellFormed = 0;
5013 ctxt->instate = XML_PARSER_EOF;
5014#ifdef DEBUG_PUSH
5015 xmlGenericError(xmlGenericErrorContext,
5016 "HPP: entering EOF\n");
5017#endif
5018 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5019 ctxt->sax->endDocument(ctxt->userData);
5020 goto done;
5021 }
5022 break;
5023 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005024 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005025 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005026 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005027
5028 if (avail < 2)
5029 goto done;
5030 cur = in->cur[0];
5031 if (cur != '<') {
5032 ctxt->instate = XML_PARSER_CONTENT;
5033#ifdef DEBUG_PUSH
5034 xmlGenericError(xmlGenericErrorContext,
5035 "HPP: entering CONTENT\n");
5036#endif
5037 break;
5038 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005039 if (in->cur[1] == '/') {
5040 ctxt->instate = XML_PARSER_END_TAG;
5041 ctxt->checkIndex = 0;
5042#ifdef DEBUG_PUSH
5043 xmlGenericError(xmlGenericErrorContext,
5044 "HPP: entering END_TAG\n");
5045#endif
5046 break;
5047 }
Owen Taylor3473f882001-02-23 17:55:21 +00005048 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005049 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005050 goto done;
5051
Daniel Veillard597f1c12005-07-03 23:00:18 +00005052 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005053 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005054 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005055 (name == NULL)) {
5056 if (CUR == '>')
5057 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005058 break;
5059 }
Owen Taylor3473f882001-02-23 17:55:21 +00005060
5061 /*
5062 * Lookup the info for that element.
5063 */
5064 info = htmlTagLookup(name);
5065 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005066 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5067 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005068 }
5069
5070 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005071 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005072 */
5073 if ((CUR == '/') && (NXT(1) == '>')) {
5074 SKIP(2);
5075 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5076 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005077 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005078 ctxt->instate = XML_PARSER_CONTENT;
5079#ifdef DEBUG_PUSH
5080 xmlGenericError(xmlGenericErrorContext,
5081 "HPP: entering CONTENT\n");
5082#endif
5083 break;
5084 }
5085
5086 if (CUR == '>') {
5087 NEXT;
5088 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005089 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5090 "Couldn't find end of Start Tag %s\n",
5091 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005092
5093 /*
5094 * end of parsing of this node.
5095 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005096 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005097 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005098 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005099 }
Owen Taylor3473f882001-02-23 17:55:21 +00005100
5101 ctxt->instate = XML_PARSER_CONTENT;
5102#ifdef DEBUG_PUSH
5103 xmlGenericError(xmlGenericErrorContext,
5104 "HPP: entering CONTENT\n");
5105#endif
5106 break;
5107 }
5108
5109 /*
5110 * Check for an Empty Element from DTD definition
5111 */
5112 if ((info != NULL) && (info->empty)) {
5113 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5114 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005115 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005116 }
5117 ctxt->instate = XML_PARSER_CONTENT;
5118#ifdef DEBUG_PUSH
5119 xmlGenericError(xmlGenericErrorContext,
5120 "HPP: entering CONTENT\n");
5121#endif
5122 break;
5123 }
5124 case XML_PARSER_CONTENT: {
5125 long cons;
5126 /*
5127 * Handle preparsed entities and charRef
5128 */
5129 if (ctxt->token != 0) {
5130 xmlChar chr[2] = { 0 , 0 } ;
5131
5132 chr[0] = (xmlChar) ctxt->token;
5133 htmlCheckParagraph(ctxt);
5134 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5135 ctxt->sax->characters(ctxt->userData, chr, 1);
5136 ctxt->token = 0;
5137 ctxt->checkIndex = 0;
5138 }
5139 if ((avail == 1) && (terminate)) {
5140 cur = in->cur[0];
5141 if ((cur != '<') && (cur != '&')) {
5142 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005143 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005144 if (ctxt->sax->ignorableWhitespace != NULL)
5145 ctxt->sax->ignorableWhitespace(
5146 ctxt->userData, &cur, 1);
5147 } else {
5148 htmlCheckParagraph(ctxt);
5149 if (ctxt->sax->characters != NULL)
5150 ctxt->sax->characters(
5151 ctxt->userData, &cur, 1);
5152 }
5153 }
5154 ctxt->token = 0;
5155 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005156 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005157 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005158 }
Owen Taylor3473f882001-02-23 17:55:21 +00005159 }
5160 if (avail < 2)
5161 goto done;
5162 cur = in->cur[0];
5163 next = in->cur[1];
5164 cons = ctxt->nbChars;
5165 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5166 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5167 /*
5168 * Handle SCRIPT/STYLE separately
5169 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005170 if (!terminate) {
5171 int idx;
5172 xmlChar val;
5173
Jiri Netolicky446e1262009-08-07 17:05:36 +02005174 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
Daniel Veillard68716a72006-10-16 09:32:17 +00005175 if (idx < 0)
5176 goto done;
5177 val = in->cur[idx + 2];
5178 if (val == 0) /* bad cut of input */
5179 goto done;
5180 }
Owen Taylor3473f882001-02-23 17:55:21 +00005181 htmlParseScript(ctxt);
5182 if ((cur == '<') && (next == '/')) {
5183 ctxt->instate = XML_PARSER_END_TAG;
5184 ctxt->checkIndex = 0;
5185#ifdef DEBUG_PUSH
5186 xmlGenericError(xmlGenericErrorContext,
5187 "HPP: entering END_TAG\n");
5188#endif
5189 break;
5190 }
5191 } else {
5192 /*
5193 * Sometimes DOCTYPE arrives in the middle of the document
5194 */
5195 if ((cur == '<') && (next == '!') &&
5196 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5197 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5198 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5199 (UPP(8) == 'E')) {
5200 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005201 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005202 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005203 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5204 "Misplaced DOCTYPE declaration\n",
5205 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005206 htmlParseDocTypeDecl(ctxt);
5207 } else if ((cur == '<') && (next == '!') &&
5208 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5209 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005210 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005211 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005212 goto done;
5213#ifdef DEBUG_PUSH
5214 xmlGenericError(xmlGenericErrorContext,
5215 "HPP: Parsing Comment\n");
5216#endif
5217 htmlParseComment(ctxt);
5218 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005219 } else if ((cur == '<') && (next == '?')) {
5220 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005221 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005222 goto done;
5223#ifdef DEBUG_PUSH
5224 xmlGenericError(xmlGenericErrorContext,
5225 "HPP: Parsing PI\n");
5226#endif
5227 htmlParsePI(ctxt);
5228 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005229 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5230 goto done;
5231 } else if ((cur == '<') && (next == '/')) {
5232 ctxt->instate = XML_PARSER_END_TAG;
5233 ctxt->checkIndex = 0;
5234#ifdef DEBUG_PUSH
5235 xmlGenericError(xmlGenericErrorContext,
5236 "HPP: entering END_TAG\n");
5237#endif
5238 break;
5239 } else if (cur == '<') {
5240 ctxt->instate = XML_PARSER_START_TAG;
5241 ctxt->checkIndex = 0;
5242#ifdef DEBUG_PUSH
5243 xmlGenericError(xmlGenericErrorContext,
5244 "HPP: entering START_TAG\n");
5245#endif
5246 break;
5247 } else if (cur == '&') {
5248 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005249 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005250 goto done;
5251#ifdef DEBUG_PUSH
5252 xmlGenericError(xmlGenericErrorContext,
5253 "HPP: Parsing Reference\n");
5254#endif
5255 /* TODO: check generation of subtrees if noent !!! */
5256 htmlParseReference(ctxt);
5257 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005258 /*
5259 * check that the text sequence is complete
5260 * before handing out the data to the parser
5261 * to avoid problems with erroneous end of
5262 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005263 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005264 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005265 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0, 1) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005266 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005267 ctxt->checkIndex = 0;
5268#ifdef DEBUG_PUSH
5269 xmlGenericError(xmlGenericErrorContext,
5270 "HPP: Parsing char data\n");
5271#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005272 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005273 }
5274 }
5275 if (cons == ctxt->nbChars) {
5276 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005277 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5278 "detected an error in element content\n",
5279 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005280 }
5281 NEXT;
5282 break;
5283 }
5284
5285 break;
5286 }
5287 case XML_PARSER_END_TAG:
5288 if (avail < 2)
5289 goto done;
5290 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005291 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005292 goto done;
5293 htmlParseEndTag(ctxt);
5294 if (ctxt->nameNr == 0) {
5295 ctxt->instate = XML_PARSER_EPILOG;
5296 } else {
5297 ctxt->instate = XML_PARSER_CONTENT;
5298 }
5299 ctxt->checkIndex = 0;
5300#ifdef DEBUG_PUSH
5301 xmlGenericError(xmlGenericErrorContext,
5302 "HPP: entering CONTENT\n");
5303#endif
5304 break;
5305 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005306 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5307 "HPP: internal error, state == CDATA\n",
5308 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005309 ctxt->instate = XML_PARSER_CONTENT;
5310 ctxt->checkIndex = 0;
5311#ifdef DEBUG_PUSH
5312 xmlGenericError(xmlGenericErrorContext,
5313 "HPP: entering CONTENT\n");
5314#endif
5315 break;
5316 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005317 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5318 "HPP: internal error, state == DTD\n",
5319 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005320 ctxt->instate = XML_PARSER_CONTENT;
5321 ctxt->checkIndex = 0;
5322#ifdef DEBUG_PUSH
5323 xmlGenericError(xmlGenericErrorContext,
5324 "HPP: entering CONTENT\n");
5325#endif
5326 break;
5327 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005328 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5329 "HPP: internal error, state == COMMENT\n",
5330 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005331 ctxt->instate = XML_PARSER_CONTENT;
5332 ctxt->checkIndex = 0;
5333#ifdef DEBUG_PUSH
5334 xmlGenericError(xmlGenericErrorContext,
5335 "HPP: entering CONTENT\n");
5336#endif
5337 break;
5338 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005339 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5340 "HPP: internal error, state == PI\n",
5341 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005342 ctxt->instate = XML_PARSER_CONTENT;
5343 ctxt->checkIndex = 0;
5344#ifdef DEBUG_PUSH
5345 xmlGenericError(xmlGenericErrorContext,
5346 "HPP: entering CONTENT\n");
5347#endif
5348 break;
5349 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005350 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5351 "HPP: internal error, state == ENTITY_DECL\n",
5352 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005353 ctxt->instate = XML_PARSER_CONTENT;
5354 ctxt->checkIndex = 0;
5355#ifdef DEBUG_PUSH
5356 xmlGenericError(xmlGenericErrorContext,
5357 "HPP: entering CONTENT\n");
5358#endif
5359 break;
5360 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005361 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5362 "HPP: internal error, state == ENTITY_VALUE\n",
5363 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005364 ctxt->instate = XML_PARSER_CONTENT;
5365 ctxt->checkIndex = 0;
5366#ifdef DEBUG_PUSH
5367 xmlGenericError(xmlGenericErrorContext,
5368 "HPP: entering DTD\n");
5369#endif
5370 break;
5371 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005372 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5373 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5374 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005375 ctxt->instate = XML_PARSER_START_TAG;
5376 ctxt->checkIndex = 0;
5377#ifdef DEBUG_PUSH
5378 xmlGenericError(xmlGenericErrorContext,
5379 "HPP: entering START_TAG\n");
5380#endif
5381 break;
5382 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005383 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5384 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5385 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005386 ctxt->instate = XML_PARSER_CONTENT;
5387 ctxt->checkIndex = 0;
5388#ifdef DEBUG_PUSH
5389 xmlGenericError(xmlGenericErrorContext,
5390 "HPP: entering CONTENT\n");
5391#endif
5392 break;
5393 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005394 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5395 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5396 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005397 ctxt->instate = XML_PARSER_CONTENT;
5398 ctxt->checkIndex = 0;
5399#ifdef DEBUG_PUSH
5400 xmlGenericError(xmlGenericErrorContext,
5401 "HPP: entering CONTENT\n");
5402#endif
5403 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005404 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005405 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5406 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5407 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005408 ctxt->instate = XML_PARSER_CONTENT;
5409 ctxt->checkIndex = 0;
5410#ifdef DEBUG_PUSH
5411 xmlGenericError(xmlGenericErrorContext,
5412 "HPP: entering CONTENT\n");
5413#endif
5414 break;
5415
Owen Taylor3473f882001-02-23 17:55:21 +00005416 }
5417 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005418done:
Owen Taylor3473f882001-02-23 17:55:21 +00005419 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005420 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005421 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005422 /*
5423 * SAX: end of the document processing.
5424 */
5425 ctxt->instate = XML_PARSER_EOF;
5426 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5427 ctxt->sax->endDocument(ctxt->userData);
5428 }
5429 }
5430 if ((ctxt->myDoc != NULL) &&
5431 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5432 (ctxt->instate == XML_PARSER_EPILOG))) {
5433 xmlDtdPtr dtd;
5434 dtd = xmlGetIntSubset(ctxt->myDoc);
5435 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02005436 ctxt->myDoc->intSubset =
5437 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005438 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5439 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5440 }
5441#ifdef DEBUG_PUSH
5442 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5443#endif
5444 return(ret);
5445}
5446
5447/**
Owen Taylor3473f882001-02-23 17:55:21 +00005448 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005449 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005450 * @chunk: an char array
5451 * @size: the size in byte of the chunk
5452 * @terminate: last chunk indicator
5453 *
5454 * Parse a Chunk of memory
5455 *
5456 * Returns zero if no error, the xmlParserErrors otherwise.
5457 */
5458int
5459htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5460 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005461 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5462 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5463 "htmlParseChunk: context error\n", NULL, NULL);
5464 return(XML_ERR_INTERNAL_ERROR);
5465 }
Owen Taylor3473f882001-02-23 17:55:21 +00005466 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5467 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5468 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5469 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005470 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02005471
5472 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00005473 if (res < 0) {
5474 ctxt->errNo = XML_PARSER_EOF;
5475 ctxt->disableSAX = 1;
5476 return (XML_PARSER_EOF);
5477 }
Owen Taylor3473f882001-02-23 17:55:21 +00005478 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5479 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005480 ctxt->input->end =
5481 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005482#ifdef DEBUG_PUSH
5483 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5484#endif
5485
Daniel Veillard14f752c2003-08-09 11:44:50 +00005486#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005487 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5488 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005489#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005490 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005491 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5492 xmlParserInputBufferPtr in = ctxt->input->buf;
5493 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5494 (in->raw != NULL)) {
5495 int nbchars;
Daniel Veillarde77db162009-08-22 11:32:38 +02005496
Daniel Veillard14f752c2003-08-09 11:44:50 +00005497 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5498 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005499 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5500 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005501 return(XML_ERR_INVALID_ENCODING);
5502 }
5503 }
5504 }
Owen Taylor3473f882001-02-23 17:55:21 +00005505 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005506 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005507 if (terminate) {
5508 if ((ctxt->instate != XML_PARSER_EOF) &&
5509 (ctxt->instate != XML_PARSER_EPILOG) &&
5510 (ctxt->instate != XML_PARSER_MISC)) {
5511 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005512 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02005513 }
Owen Taylor3473f882001-02-23 17:55:21 +00005514 if (ctxt->instate != XML_PARSER_EOF) {
5515 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5516 ctxt->sax->endDocument(ctxt->userData);
5517 }
5518 ctxt->instate = XML_PARSER_EOF;
5519 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005520 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00005521}
5522
5523/************************************************************************
5524 * *
5525 * User entry points *
5526 * *
5527 ************************************************************************/
5528
5529/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005530 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005531 * @sax: a SAX handler
5532 * @user_data: The user data returned on SAX callbacks
5533 * @chunk: a pointer to an array of chars
5534 * @size: number of chars in the array
5535 * @filename: an optional file name or URI
5536 * @enc: an optional encoding
5537 *
5538 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005539 * The value of @filename is used for fetching external entities
5540 * and error/warning reports.
5541 *
5542 * Returns the new parser context or NULL
5543 */
5544htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005545htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00005546 const char *chunk, int size, const char *filename,
5547 xmlCharEncoding enc) {
5548 htmlParserCtxtPtr ctxt;
5549 htmlParserInputPtr inputStream;
5550 xmlParserInputBufferPtr buf;
5551
Daniel Veillardd0463562001-10-13 09:15:48 +00005552 xmlInitParser();
5553
Owen Taylor3473f882001-02-23 17:55:21 +00005554 buf = xmlAllocParserInputBuffer(enc);
5555 if (buf == NULL) return(NULL);
5556
Daniel Veillardf403d292003-10-05 13:51:35 +00005557 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005558 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005559 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005560 return(NULL);
5561 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005562 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5563 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005564 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005565 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005566 xmlFree(ctxt->sax);
5567 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5568 if (ctxt->sax == NULL) {
5569 xmlFree(buf);
5570 xmlFree(ctxt);
5571 return(NULL);
5572 }
5573 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5574 if (user_data != NULL)
5575 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02005576 }
Owen Taylor3473f882001-02-23 17:55:21 +00005577 if (filename == NULL) {
5578 ctxt->directory = NULL;
5579 } else {
5580 ctxt->directory = xmlParserGetDirectory(filename);
5581 }
5582
5583 inputStream = htmlNewInputStream(ctxt);
5584 if (inputStream == NULL) {
5585 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005586 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005587 return(NULL);
5588 }
5589
5590 if (filename == NULL)
5591 inputStream->filename = NULL;
5592 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005593 inputStream->filename = (char *)
5594 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005595 inputStream->buf = buf;
5596 inputStream->base = inputStream->buf->buffer->content;
5597 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillarde77db162009-08-22 11:32:38 +02005598 inputStream->end =
Daniel Veillard5f704af2003-03-05 10:01:43 +00005599 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005600
5601 inputPush(ctxt, inputStream);
5602
5603 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02005604 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005605 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5606 int cur = ctxt->input->cur - ctxt->input->base;
5607
Daniel Veillarde77db162009-08-22 11:32:38 +02005608 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005609
5610 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5611 ctxt->input->cur = ctxt->input->base + cur;
5612 ctxt->input->end =
5613 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005614#ifdef DEBUG_PUSH
5615 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5616#endif
5617 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005618 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005619
5620 return(ctxt);
5621}
William M. Brack21e4ef22005-01-02 09:53:13 +00005622#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005623
5624/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005625 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005626 * @cur: a pointer to an array of xmlChar
5627 * @encoding: a free form C string describing the HTML document encoding, or NULL
5628 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005629 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005630 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005631 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5632 * to handle parse events. If sax is NULL, fallback to the default DOM
5633 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005634 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005635 * Returns the resulting document tree unless SAX is NULL or the document is
5636 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005637 */
5638
5639htmlDocPtr
5640htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5641 htmlDocPtr ret;
5642 htmlParserCtxtPtr ctxt;
5643
Daniel Veillardd0463562001-10-13 09:15:48 +00005644 xmlInitParser();
5645
Owen Taylor3473f882001-02-23 17:55:21 +00005646 if (cur == NULL) return(NULL);
5647
5648
5649 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5650 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02005651 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005652 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005653 ctxt->sax = sax;
5654 ctxt->userData = userData;
5655 }
5656
5657 htmlParseDocument(ctxt);
5658 ret = ctxt->myDoc;
5659 if (sax != NULL) {
5660 ctxt->sax = NULL;
5661 ctxt->userData = NULL;
5662 }
5663 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005664
Owen Taylor3473f882001-02-23 17:55:21 +00005665 return(ret);
5666}
5667
5668/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005669 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005670 * @cur: a pointer to an array of xmlChar
5671 * @encoding: a free form C string describing the HTML document encoding, or NULL
5672 *
5673 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005674 *
Owen Taylor3473f882001-02-23 17:55:21 +00005675 * Returns the resulting document tree
5676 */
5677
5678htmlDocPtr
5679htmlParseDoc(xmlChar *cur, const char *encoding) {
5680 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5681}
5682
5683
5684/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005685 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005686 * @filename: the filename
5687 * @encoding: a free form C string describing the HTML document encoding, or NULL
5688 *
Daniel Veillarde77db162009-08-22 11:32:38 +02005689 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00005690 * Automatic support for ZLIB/Compress compressed document is provided
5691 * by default if found at compile-time.
5692 *
5693 * Returns the new parser context or NULL
5694 */
5695htmlParserCtxtPtr
5696htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5697{
5698 htmlParserCtxtPtr ctxt;
5699 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005700 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005701 /* htmlCharEncoding enc; */
5702 xmlChar *content, *content_line = (xmlChar *) "charset=";
5703
Daniel Veillarda03e3652004-11-02 18:45:30 +00005704 if (filename == NULL)
5705 return(NULL);
5706
Daniel Veillardf403d292003-10-05 13:51:35 +00005707 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005708 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005709 return(NULL);
5710 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005711 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5712 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005713#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005714 if (xmlDefaultSAXHandler.error != NULL) {
5715 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5716 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005717#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005718 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005719 return(NULL);
5720 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005721
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005722 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5723 xmlFree(canonicFilename);
5724 if (inputStream == NULL) {
5725 xmlFreeParserCtxt(ctxt);
5726 return(NULL);
5727 }
Owen Taylor3473f882001-02-23 17:55:21 +00005728
5729 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005730
Owen Taylor3473f882001-02-23 17:55:21 +00005731 /* set encoding */
5732 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005733 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Daniel Veillarde77db162009-08-22 11:32:38 +02005734 if (content) {
Owen Taylor3473f882001-02-23 17:55:21 +00005735 strcpy ((char *)content, (char *)content_line);
5736 strcat ((char *)content, (char *)encoding);
5737 htmlCheckEncoding (ctxt, content);
5738 xmlFree (content);
5739 }
5740 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005741
Owen Taylor3473f882001-02-23 17:55:21 +00005742 return(ctxt);
5743}
5744
5745/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005746 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005747 * @filename: the filename
5748 * @encoding: a free form C string describing the HTML document encoding, or NULL
5749 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005750 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005751 *
5752 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5753 * compressed document is provided by default if found at compile-time.
5754 * It use the given SAX function block to handle the parsing callback.
5755 * If sax is NULL, fallback to the default DOM tree building routines.
5756 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005757 * Returns the resulting document tree unless SAX is NULL or the document is
5758 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005759 */
5760
5761htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005762htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00005763 void *userData) {
5764 htmlDocPtr ret;
5765 htmlParserCtxtPtr ctxt;
5766 htmlSAXHandlerPtr oldsax = NULL;
5767
Daniel Veillardd0463562001-10-13 09:15:48 +00005768 xmlInitParser();
5769
Owen Taylor3473f882001-02-23 17:55:21 +00005770 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5771 if (ctxt == NULL) return(NULL);
5772 if (sax != NULL) {
5773 oldsax = ctxt->sax;
5774 ctxt->sax = sax;
5775 ctxt->userData = userData;
5776 }
5777
5778 htmlParseDocument(ctxt);
5779
5780 ret = ctxt->myDoc;
5781 if (sax != NULL) {
5782 ctxt->sax = oldsax;
5783 ctxt->userData = NULL;
5784 }
5785 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005786
Owen Taylor3473f882001-02-23 17:55:21 +00005787 return(ret);
5788}
5789
5790/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005791 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005792 * @filename: the filename
5793 * @encoding: a free form C string describing the HTML document encoding, or NULL
5794 *
5795 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5796 * compressed document is provided by default if found at compile-time.
5797 *
5798 * Returns the resulting document tree
5799 */
5800
5801htmlDocPtr
5802htmlParseFile(const char *filename, const char *encoding) {
5803 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5804}
5805
5806/**
5807 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02005808 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00005809 *
5810 * Set and return the previous value for handling HTML omitted tags.
5811 *
5812 * Returns the last value for 0 for no handling, 1 for auto insertion.
5813 */
5814
5815int
5816htmlHandleOmittedElem(int val) {
5817 int old = htmlOmittedDefaultValue;
5818
5819 htmlOmittedDefaultValue = val;
5820 return(old);
5821}
5822
Daniel Veillard930dfb62003-02-05 10:17:38 +00005823/**
5824 * htmlElementAllowedHere:
5825 * @parent: HTML parent element
5826 * @elt: HTML element
5827 *
5828 * Checks whether an HTML element may be a direct child of a parent element.
5829 * Note - doesn't check for deprecated elements
5830 *
5831 * Returns 1 if allowed; 0 otherwise.
5832 */
5833int
5834htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5835 const char** p ;
5836
5837 if ( ! elt || ! parent || ! parent->subelts )
5838 return 0 ;
5839
5840 for ( p = parent->subelts; *p; ++p )
5841 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5842 return 1 ;
5843
5844 return 0 ;
5845}
5846/**
5847 * htmlElementStatusHere:
5848 * @parent: HTML parent element
5849 * @elt: HTML element
5850 *
5851 * Checks whether an HTML element may be a direct child of a parent element.
5852 * and if so whether it is valid or deprecated.
5853 *
5854 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5855 */
5856htmlStatus
5857htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5858 if ( ! parent || ! elt )
5859 return HTML_INVALID ;
5860 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5861 return HTML_INVALID ;
5862
5863 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5864}
5865/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005866 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005867 * @elt: HTML element
5868 * @attr: HTML attribute
5869 * @legacy: whether to allow deprecated attributes
5870 *
5871 * Checks whether an attribute is valid for an element
5872 * Has full knowledge of Required and Deprecated attributes
5873 *
5874 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5875 */
5876htmlStatus
5877htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5878 const char** p ;
5879
5880 if ( !elt || ! attr )
5881 return HTML_INVALID ;
5882
5883 if ( elt->attrs_req )
5884 for ( p = elt->attrs_req; *p; ++p)
5885 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5886 return HTML_REQUIRED ;
5887
5888 if ( elt->attrs_opt )
5889 for ( p = elt->attrs_opt; *p; ++p)
5890 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5891 return HTML_VALID ;
5892
5893 if ( legacy && elt->attrs_depr )
5894 for ( p = elt->attrs_depr; *p; ++p)
5895 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5896 return HTML_DEPRECATED ;
5897
5898 return HTML_INVALID ;
5899}
5900/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005901 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005902 * @node: an htmlNodePtr in a tree
5903 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005904 * for Element nodes)
5905 *
5906 * Checks whether the tree node is valid. Experimental (the author
5907 * only uses the HTML enhancements in a SAX parser)
5908 *
5909 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5910 * legacy allowed) or htmlElementStatusHere (otherwise).
5911 * for Attribute nodes, a return from htmlAttrAllowed
5912 * for other nodes, HTML_NA (no checks performed)
5913 */
5914htmlStatus
5915htmlNodeStatus(const htmlNodePtr node, int legacy) {
5916 if ( ! node )
5917 return HTML_INVALID ;
5918
5919 switch ( node->type ) {
5920 case XML_ELEMENT_NODE:
5921 return legacy
5922 ? ( htmlElementAllowedHere (
5923 htmlTagLookup(node->parent->name) , node->name
5924 ) ? HTML_VALID : HTML_INVALID )
5925 : htmlElementStatusHere(
5926 htmlTagLookup(node->parent->name) ,
5927 htmlTagLookup(node->name) )
5928 ;
5929 case XML_ATTRIBUTE_NODE:
5930 return htmlAttrAllowed(
5931 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5932 default: return HTML_NA ;
5933 }
5934}
Daniel Veillard9475a352003-09-26 12:47:50 +00005935/************************************************************************
5936 * *
5937 * New set (2.6.0) of simpler and more flexible APIs *
5938 * *
5939 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is expanded. Wrapped in do { } while (0) so that the expansion is a
 * single statement and cannot capture a following else.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5951
5952/**
5953 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005954 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005955 *
5956 * Reset a parser context
5957 */
5958void
5959htmlCtxtReset(htmlParserCtxtPtr ctxt)
5960{
5961 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005962 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02005963
Daniel Veillarda03e3652004-11-02 18:45:30 +00005964 if (ctxt == NULL)
5965 return;
5966
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005967 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00005968 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005969
5970 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5971 xmlFreeInputStream(input);
5972 }
5973 ctxt->inputNr = 0;
5974 ctxt->input = NULL;
5975
5976 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005977 if (ctxt->spaceTab != NULL) {
5978 ctxt->spaceTab[0] = -1;
5979 ctxt->space = &ctxt->spaceTab[0];
5980 } else {
5981 ctxt->space = NULL;
5982 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005983
5984
5985 ctxt->nodeNr = 0;
5986 ctxt->node = NULL;
5987
5988 ctxt->nameNr = 0;
5989 ctxt->name = NULL;
5990
5991 DICT_FREE(ctxt->version);
5992 ctxt->version = NULL;
5993 DICT_FREE(ctxt->encoding);
5994 ctxt->encoding = NULL;
5995 DICT_FREE(ctxt->directory);
5996 ctxt->directory = NULL;
5997 DICT_FREE(ctxt->extSubURI);
5998 ctxt->extSubURI = NULL;
5999 DICT_FREE(ctxt->extSubSystem);
6000 ctxt->extSubSystem = NULL;
6001 if (ctxt->myDoc != NULL)
6002 xmlFreeDoc(ctxt->myDoc);
6003 ctxt->myDoc = NULL;
6004
6005 ctxt->standalone = -1;
6006 ctxt->hasExternalSubset = 0;
6007 ctxt->hasPErefs = 0;
6008 ctxt->html = 1;
6009 ctxt->external = 0;
6010 ctxt->instate = XML_PARSER_START;
6011 ctxt->token = 0;
6012
6013 ctxt->wellFormed = 1;
6014 ctxt->nsWellFormed = 1;
6015 ctxt->valid = 1;
6016 ctxt->vctxt.userData = ctxt;
6017 ctxt->vctxt.error = xmlParserValidityError;
6018 ctxt->vctxt.warning = xmlParserValidityWarning;
6019 ctxt->record_info = 0;
6020 ctxt->nbChars = 0;
6021 ctxt->checkIndex = 0;
6022 ctxt->inSubset = 0;
6023 ctxt->errNo = XML_ERR_OK;
6024 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006025 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006026 ctxt->catalogs = NULL;
6027 xmlInitNodeInfoSeq(&ctxt->node_seq);
6028
6029 if (ctxt->attsDefault != NULL) {
6030 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6031 ctxt->attsDefault = NULL;
6032 }
6033 if (ctxt->attsSpecial != NULL) {
6034 xmlHashFree(ctxt->attsSpecial, NULL);
6035 ctxt->attsSpecial = NULL;
6036 }
6037}
6038
6039/**
6040 * htmlCtxtUseOptions:
6041 * @ctxt: an HTML parser context
6042 * @options: a combination of htmlParserOption(s)
6043 *
6044 * Applies the options to the parser context
6045 *
6046 * Returns 0 in case of success, the set of unknown or unimplemented options
6047 * in case of error.
6048 */
6049int
6050htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6051{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006052 if (ctxt == NULL)
6053 return(-1);
6054
Daniel Veillard9475a352003-09-26 12:47:50 +00006055 if (options & HTML_PARSE_NOWARNING) {
6056 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006057 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006058 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006059 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006060 }
6061 if (options & HTML_PARSE_NOERROR) {
6062 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006063 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006064 ctxt->sax->fatalError = NULL;
6065 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006066 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006067 }
6068 if (options & HTML_PARSE_PEDANTIC) {
6069 ctxt->pedantic = 1;
6070 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006071 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006072 } else
6073 ctxt->pedantic = 0;
6074 if (options & XML_PARSE_NOBLANKS) {
6075 ctxt->keepBlanks = 0;
6076 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6077 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006078 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006079 } else
6080 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006081 if (options & HTML_PARSE_RECOVER) {
6082 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006083 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006084 } else
6085 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006086 if (options & HTML_PARSE_COMPACT) {
6087 ctxt->options |= HTML_PARSE_COMPACT;
6088 options -= HTML_PARSE_COMPACT;
6089 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006090 if (options & XML_PARSE_HUGE) {
6091 ctxt->options |= XML_PARSE_HUGE;
6092 options -= XML_PARSE_HUGE;
6093 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006094 ctxt->dictNames = 0;
6095 return (options);
6096}
6097
6098/**
6099 * htmlDoRead:
6100 * @ctxt: an HTML parser context
6101 * @URL: the base URL to use for the document
6102 * @encoding: the document encoding, or NULL
6103 * @options: a combination of htmlParserOption(s)
6104 * @reuse: keep the context for reuse
6105 *
6106 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006107 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006108 * Returns the resulting document tree or NULL
6109 */
6110static htmlDocPtr
6111htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6112 int options, int reuse)
6113{
6114 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006115
Daniel Veillard9475a352003-09-26 12:47:50 +00006116 htmlCtxtUseOptions(ctxt, options);
6117 ctxt->html = 1;
6118 if (encoding != NULL) {
6119 xmlCharEncodingHandlerPtr hdlr;
6120
6121 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006122 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006123 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006124 if (ctxt->input->encoding != NULL)
6125 xmlFree((xmlChar *) ctxt->input->encoding);
6126 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6127 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006128 }
6129 if ((URL != NULL) && (ctxt->input != NULL) &&
6130 (ctxt->input->filename == NULL))
6131 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6132 htmlParseDocument(ctxt);
6133 ret = ctxt->myDoc;
6134 ctxt->myDoc = NULL;
6135 if (!reuse) {
6136 if ((ctxt->dictNames) &&
6137 (ret != NULL) &&
6138 (ret->dict == ctxt->dict))
6139 ctxt->dict = NULL;
6140 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006141 }
6142 return (ret);
6143}
6144
6145/**
6146 * htmlReadDoc:
6147 * @cur: a pointer to a zero terminated string
6148 * @URL: the base URL to use for the document
6149 * @encoding: the document encoding, or NULL
6150 * @options: a combination of htmlParserOption(s)
6151 *
6152 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006153 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006154 * Returns the resulting document tree
6155 */
6156htmlDocPtr
6157htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6158{
6159 htmlParserCtxtPtr ctxt;
6160
6161 if (cur == NULL)
6162 return (NULL);
6163
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006164 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006165 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006166 if (ctxt == NULL)
6167 return (NULL);
6168 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6169}
6170
6171/**
6172 * htmlReadFile:
6173 * @filename: a file or URL
6174 * @encoding: the document encoding, or NULL
6175 * @options: a combination of htmlParserOption(s)
6176 *
6177 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006178 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006179 * Returns the resulting document tree
6180 */
6181htmlDocPtr
6182htmlReadFile(const char *filename, const char *encoding, int options)
6183{
6184 htmlParserCtxtPtr ctxt;
6185
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006186 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006187 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6188 if (ctxt == NULL)
6189 return (NULL);
6190 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6191}
6192
6193/**
6194 * htmlReadMemory:
6195 * @buffer: a pointer to a char array
6196 * @size: the size of the array
6197 * @URL: the base URL to use for the document
6198 * @encoding: the document encoding, or NULL
6199 * @options: a combination of htmlParserOption(s)
6200 *
6201 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006202 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006203 * Returns the resulting document tree
6204 */
6205htmlDocPtr
6206htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6207{
6208 htmlParserCtxtPtr ctxt;
6209
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006210 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006211 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6212 if (ctxt == NULL)
6213 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006214 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006215 if (ctxt->sax != NULL)
6216 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006217 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6218}
6219
6220/**
6221 * htmlReadFd:
6222 * @fd: an open file descriptor
6223 * @URL: the base URL to use for the document
6224 * @encoding: the document encoding, or NULL
6225 * @options: a combination of htmlParserOption(s)
6226 *
6227 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006228 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006229 * Returns the resulting document tree
6230 */
6231htmlDocPtr
6232htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6233{
6234 htmlParserCtxtPtr ctxt;
6235 xmlParserInputBufferPtr input;
6236 xmlParserInputPtr stream;
6237
6238 if (fd < 0)
6239 return (NULL);
6240
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006241 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006242 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6243 if (input == NULL)
6244 return (NULL);
6245 ctxt = xmlNewParserCtxt();
6246 if (ctxt == NULL) {
6247 xmlFreeParserInputBuffer(input);
6248 return (NULL);
6249 }
6250 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6251 if (stream == NULL) {
6252 xmlFreeParserInputBuffer(input);
6253 xmlFreeParserCtxt(ctxt);
6254 return (NULL);
6255 }
6256 inputPush(ctxt, stream);
6257 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6258}
6259
6260/**
6261 * htmlReadIO:
6262 * @ioread: an I/O read function
6263 * @ioclose: an I/O close function
6264 * @ioctx: an I/O handler
6265 * @URL: the base URL to use for the document
6266 * @encoding: the document encoding, or NULL
6267 * @options: a combination of htmlParserOption(s)
6268 *
6269 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006270 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006271 * Returns the resulting document tree
6272 */
6273htmlDocPtr
6274htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6275 void *ioctx, const char *URL, const char *encoding, int options)
6276{
6277 htmlParserCtxtPtr ctxt;
6278 xmlParserInputBufferPtr input;
6279 xmlParserInputPtr stream;
6280
6281 if (ioread == NULL)
6282 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006283 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006284
6285 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6286 XML_CHAR_ENCODING_NONE);
6287 if (input == NULL)
6288 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006289 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006290 if (ctxt == NULL) {
6291 xmlFreeParserInputBuffer(input);
6292 return (NULL);
6293 }
6294 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6295 if (stream == NULL) {
6296 xmlFreeParserInputBuffer(input);
6297 xmlFreeParserCtxt(ctxt);
6298 return (NULL);
6299 }
6300 inputPush(ctxt, stream);
6301 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6302}
6303
6304/**
6305 * htmlCtxtReadDoc:
6306 * @ctxt: an HTML parser context
6307 * @cur: a pointer to a zero terminated string
6308 * @URL: the base URL to use for the document
6309 * @encoding: the document encoding, or NULL
6310 * @options: a combination of htmlParserOption(s)
6311 *
6312 * parse an XML in-memory document and build a tree.
6313 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006314 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006315 * Returns the resulting document tree
6316 */
6317htmlDocPtr
6318htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6319 const char *URL, const char *encoding, int options)
6320{
6321 xmlParserInputPtr stream;
6322
6323 if (cur == NULL)
6324 return (NULL);
6325 if (ctxt == NULL)
6326 return (NULL);
6327
6328 htmlCtxtReset(ctxt);
6329
6330 stream = xmlNewStringInputStream(ctxt, cur);
6331 if (stream == NULL) {
6332 return (NULL);
6333 }
6334 inputPush(ctxt, stream);
6335 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6336}
6337
6338/**
6339 * htmlCtxtReadFile:
6340 * @ctxt: an HTML parser context
6341 * @filename: a file or URL
6342 * @encoding: the document encoding, or NULL
6343 * @options: a combination of htmlParserOption(s)
6344 *
6345 * parse an XML file from the filesystem or the network.
6346 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006347 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006348 * Returns the resulting document tree
6349 */
6350htmlDocPtr
6351htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6352 const char *encoding, int options)
6353{
6354 xmlParserInputPtr stream;
6355
6356 if (filename == NULL)
6357 return (NULL);
6358 if (ctxt == NULL)
6359 return (NULL);
6360
6361 htmlCtxtReset(ctxt);
6362
Daniel Veillard29614c72004-11-26 10:47:26 +00006363 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006364 if (stream == NULL) {
6365 return (NULL);
6366 }
6367 inputPush(ctxt, stream);
6368 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6369}
6370
6371/**
6372 * htmlCtxtReadMemory:
6373 * @ctxt: an HTML parser context
6374 * @buffer: a pointer to a char array
6375 * @size: the size of the array
6376 * @URL: the base URL to use for the document
6377 * @encoding: the document encoding, or NULL
6378 * @options: a combination of htmlParserOption(s)
6379 *
6380 * parse an XML in-memory document and build a tree.
6381 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006382 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006383 * Returns the resulting document tree
6384 */
6385htmlDocPtr
6386htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6387 const char *URL, const char *encoding, int options)
6388{
6389 xmlParserInputBufferPtr input;
6390 xmlParserInputPtr stream;
6391
6392 if (ctxt == NULL)
6393 return (NULL);
6394 if (buffer == NULL)
6395 return (NULL);
6396
6397 htmlCtxtReset(ctxt);
6398
6399 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6400 if (input == NULL) {
6401 return(NULL);
6402 }
6403
6404 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6405 if (stream == NULL) {
6406 xmlFreeParserInputBuffer(input);
6407 return(NULL);
6408 }
6409
6410 inputPush(ctxt, stream);
6411 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6412}
6413
6414/**
6415 * htmlCtxtReadFd:
6416 * @ctxt: an HTML parser context
6417 * @fd: an open file descriptor
6418 * @URL: the base URL to use for the document
6419 * @encoding: the document encoding, or NULL
6420 * @options: a combination of htmlParserOption(s)
6421 *
6422 * parse an XML from a file descriptor and build a tree.
6423 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006424 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006425 * Returns the resulting document tree
6426 */
6427htmlDocPtr
6428htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6429 const char *URL, const char *encoding, int options)
6430{
6431 xmlParserInputBufferPtr input;
6432 xmlParserInputPtr stream;
6433
6434 if (fd < 0)
6435 return (NULL);
6436 if (ctxt == NULL)
6437 return (NULL);
6438
6439 htmlCtxtReset(ctxt);
6440
6441
6442 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6443 if (input == NULL)
6444 return (NULL);
6445 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6446 if (stream == NULL) {
6447 xmlFreeParserInputBuffer(input);
6448 return (NULL);
6449 }
6450 inputPush(ctxt, stream);
6451 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6452}
6453
6454/**
6455 * htmlCtxtReadIO:
6456 * @ctxt: an HTML parser context
6457 * @ioread: an I/O read function
6458 * @ioclose: an I/O close function
6459 * @ioctx: an I/O handler
6460 * @URL: the base URL to use for the document
6461 * @encoding: the document encoding, or NULL
6462 * @options: a combination of htmlParserOption(s)
6463 *
6464 * parse an HTML document from I/O functions and source and build a tree.
6465 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006466 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006467 * Returns the resulting document tree
6468 */
6469htmlDocPtr
6470htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6471 xmlInputCloseCallback ioclose, void *ioctx,
6472 const char *URL,
6473 const char *encoding, int options)
6474{
6475 xmlParserInputBufferPtr input;
6476 xmlParserInputPtr stream;
6477
6478 if (ioread == NULL)
6479 return (NULL);
6480 if (ctxt == NULL)
6481 return (NULL);
6482
6483 htmlCtxtReset(ctxt);
6484
6485 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6486 XML_CHAR_ENCODING_NONE);
6487 if (input == NULL)
6488 return (NULL);
6489 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6490 if (stream == NULL) {
6491 xmlFreeParserInputBuffer(input);
6492 return (NULL);
6493 }
6494 inputPush(ctxt, stream);
6495 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6496}
6497
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006498#define bottom_HTMLparser
6499#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006500#endif /* LIBXML_HTML_ENABLED */