blob: d3c09d6095ad157ff74cd716365c97f111b3fdd4 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020062 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000063 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200150 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167 ctxt->html = 3;
168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 if (ctxt->nameNr >= ctxt->nameMax) {
171 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000172 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000173 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 ctxt->nameMax *
175 sizeof(ctxt->nameTab[0]));
176 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000177 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000178 return (0);
179 }
180 }
181 ctxt->nameTab[ctxt->nameNr] = value;
182 ctxt->name = value;
183 return (ctxt->nameNr++);
184}
185/**
186 * htmlnamePop:
187 * @ctxt: an HTML parser context
188 *
189 * Pops the top element name from the name stack
190 *
191 * Returns the name just removed
192 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000193static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194htmlnamePop(htmlParserCtxtPtr ctxt)
195{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000197
Daniel Veillard1c732d22002-11-30 11:22:59 +0000198 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000199 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000200 ctxt->nameNr--;
201 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 if (ctxt->nameNr > 0)
204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205 else
206 ctxt->name = NULL;
207 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000208 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000209 return (ret);
210}
Owen Taylor3473f882001-02-23 17:55:21 +0000211
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a parser context variable named
 * "ctxt" to be in scope where they are used.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them. They look at raw xmlChar values and should only be used to compare
 * against ASCII values:
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the current xmlChar value, as an int.
 *   CURRENT  same value as CUR.
 *   RAW      the current xmlChar value, or -1 while ctxt->token is set.
 *   NXT(n)   the n'th next xmlChar.
 *   UPPER    the current xmlChar converted to uppercase.
 *   UPP(n)   the n'th next xmlChar converted to uppercase.
 *   SKIP(n)  skip n xmlChars; it only advances the column counter, so it
 *            must only be used to skip ASCII strings without newlines.
 *
 * Character oriented macros:
 *
 *   CUR_CHAR(l)        htmlCurrentChar() on the context, length stored in l.
 *   CUR_SCHAR(s, l)    xmlStringCurrentChar() on string s, length in l.
 *   NEXT               skip to the next character using xmlNextChar().
 *   NEXTL(l)           skip the current character of l xmlChars long,
 *                      keeping line and column up to date and clearing
 *                      ctxt->token.
 *   COPY_BUF(l,b,i,v)  store the character v of encoded length l at b[i]
 *                      and advance i accordingly.
 *   SKIP_BLANKS        htmlSkipBlankChars() on the context.
 *
 * Buffer management:
 *
 *   SHRINK   call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *            bytes were consumed and fewer than 2 * INPUT_CHUNK remain.
 *   GROW     call xmlParserInputGrow() when not in progressive mode and
 *            fewer than INPUT_CHUNK bytes remain.
 *
 * The statement-like macros are wrapped in do { } while (0) so that they
 * behave as a single statement, including in front of an "else".
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), \
     ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
        xmlParserInputShrink(ctxt->input);				\
  } while (0)

#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
293
294/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200295 * htmlFindEncoding:
296 * @the HTML parser context
297 *
298 * Ty to find and encoding in the current data available in the input
299 * buffer this is needed to try to switch to the proper encoding when
300 * one face a character error.
301 * That's an heuristic, since it's operating outside of parsing it could
302 * try to use a meta which had been commented out, that's the reason it
303 * should only be used in case of error, not as a default.
304 *
305 * Returns an encoding string or NULL if not found, the string need to
306 * be freed
307 */
308static xmlChar *
309htmlFindEncoding(xmlParserCtxtPtr ctxt) {
310 const xmlChar *start, *cur, *end;
311
312 if ((ctxt == NULL) || (ctxt->input == NULL) ||
313 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
314 (ctxt->input->buf->encoder != NULL))
315 return(NULL);
316 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
317 return(NULL);
318
319 start = ctxt->input->cur;
320 end = ctxt->input->end;
321 /* we also expect the input buffer to be zero terminated */
322 if (*end != 0)
323 return(NULL);
324
325 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
326 if (cur == NULL)
327 return(NULL);
328 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
329 if (cur == NULL)
330 return(NULL);
331 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
332 if (cur == NULL)
333 return(NULL);
334 cur += 8;
335 start = cur;
336 while (((*cur >= 'A') && (*cur <= 'Z')) ||
337 ((*cur >= 'a') && (*cur <= 'z')) ||
338 ((*cur >= '0') && (*cur <= '9')) ||
339 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
340 cur++;
341 if (cur == start)
342 return(NULL);
343 return(xmlStrndup(start, cur - start));
344}
345
346/**
Owen Taylor3473f882001-02-23 17:55:21 +0000347 * htmlCurrentChar:
348 * @ctxt: the HTML parser context
349 * @len: pointer to the length of the char read
350 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000351 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000352 * bytes in the input buffer. Implement the end of line normalization:
353 * 2.11 End-of-Line Handling
354 * If the encoding is unspecified, in the case we find an ISO-Latin-1
355 * char, then the encoding converter is plugged in automatically.
356 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000357 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000358 */
359
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000360static int
Owen Taylor3473f882001-02-23 17:55:21 +0000361htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
362 if (ctxt->instate == XML_PARSER_EOF)
363 return(0);
364
365 if (ctxt->token != 0) {
366 *len = 0;
367 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200368 }
Owen Taylor3473f882001-02-23 17:55:21 +0000369 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
370 /*
371 * We are supposed to handle UTF8, check it's valid
372 * From rfc2044: encoding of the Unicode values on UTF-8:
373 *
374 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
375 * 0000 0000-0000 007F 0xxxxxxx
376 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200377 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000378 *
379 * Check for the 0x110000 limit too
380 */
381 const unsigned char *cur = ctxt->input->cur;
382 unsigned char c;
383 unsigned int val;
384
385 c = *cur;
386 if (c & 0x80) {
387 if (cur[1] == 0)
388 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
389 if ((cur[1] & 0xc0) != 0x80)
390 goto encoding_error;
391 if ((c & 0xe0) == 0xe0) {
392
393 if (cur[2] == 0)
394 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
395 if ((cur[2] & 0xc0) != 0x80)
396 goto encoding_error;
397 if ((c & 0xf0) == 0xf0) {
398 if (cur[3] == 0)
399 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
400 if (((c & 0xf8) != 0xf0) ||
401 ((cur[3] & 0xc0) != 0x80))
402 goto encoding_error;
403 /* 4-byte code */
404 *len = 4;
405 val = (cur[0] & 0x7) << 18;
406 val |= (cur[1] & 0x3f) << 12;
407 val |= (cur[2] & 0x3f) << 6;
408 val |= cur[3] & 0x3f;
409 } else {
410 /* 3-byte code */
411 *len = 3;
412 val = (cur[0] & 0xf) << 12;
413 val |= (cur[1] & 0x3f) << 6;
414 val |= cur[2] & 0x3f;
415 }
416 } else {
417 /* 2-byte code */
418 *len = 2;
419 val = (cur[0] & 0x1f) << 6;
420 val |= cur[1] & 0x3f;
421 }
422 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000423 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
424 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200425 }
Owen Taylor3473f882001-02-23 17:55:21 +0000426 return(val);
427 } else {
Daniel Veillard856c6682009-08-24 18:16:56 +0200428 if ((*ctxt->input->cur == 0) &&
429 (ctxt->input->cur < ctxt->input->end)) {
430 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
431 "Char 0x%X out of allowed range\n", 0);
432 *len = 1;
433 return(' ');
434 }
Owen Taylor3473f882001-02-23 17:55:21 +0000435 /* 1-byte code */
436 *len = 1;
437 return((int) *ctxt->input->cur);
438 }
439 }
440 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000441 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000442 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000443 * XML constructs only use < 128 chars
444 */
445 *len = 1;
446 if ((int) *ctxt->input->cur < 0x80)
447 return((int) *ctxt->input->cur);
448
449 /*
450 * Humm this is bad, do an automatic flow conversion
451 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200452 {
453 xmlChar * guess;
454 xmlCharEncodingHandlerPtr handler;
455
456 guess = htmlFindEncoding(ctxt);
457 if (guess == NULL) {
458 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
459 } else {
460 if (ctxt->input->encoding != NULL)
461 xmlFree((xmlChar *) ctxt->input->encoding);
462 ctxt->input->encoding = guess;
463 handler = xmlFindCharEncodingHandler((const char *) guess);
464 if (handler != NULL) {
465 xmlSwitchToEncoding(ctxt, handler);
466 } else {
467 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
468 "Unsupported encoding %s", guess, NULL);
469 }
470 }
471 ctxt->charset = XML_CHAR_ENCODING_UTF8;
472 }
473
Owen Taylor3473f882001-02-23 17:55:21 +0000474 return(xmlCurrentChar(ctxt, len));
475
476encoding_error:
477 /*
478 * If we detect an UTF8 error that probably mean that the
479 * input encoding didn't get properly advertized in the
480 * declaration header. Report the error and switch the encoding
481 * to ISO-Latin-1 (if you don't like this policy, just declare the
482 * encoding !)
483 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000484 {
485 char buffer[150];
486
Daniel Veillard861101d2007-06-12 08:38:57 +0000487 if (ctxt->input->end - ctxt->input->cur >= 4) {
488 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
489 ctxt->input->cur[0], ctxt->input->cur[1],
490 ctxt->input->cur[2], ctxt->input->cur[3]);
491 } else {
492 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
493 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000494 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
495 "Input is not proper UTF-8, indicate encoding !\n",
496 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000497 }
498
Daniel Veillarde77db162009-08-22 11:32:38 +0200499 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000500 *len = 1;
501 return((int) *ctxt->input->cur);
502}
503
504/**
Owen Taylor3473f882001-02-23 17:55:21 +0000505 * htmlSkipBlankChars:
506 * @ctxt: the HTML parser context
507 *
508 * skip all blanks character found at that point in the input streams.
509 *
510 * Returns the number of space chars skipped
511 */
512
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000513static int
Owen Taylor3473f882001-02-23 17:55:21 +0000514htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
515 int res = 0;
516
William M. Brack76e95df2003-10-18 16:20:14 +0000517 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000518 if ((*ctxt->input->cur == 0) &&
519 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
520 xmlPopInput(ctxt);
521 } else {
522 if (*(ctxt->input->cur) == '\n') {
523 ctxt->input->line++; ctxt->input->col = 1;
524 } else ctxt->input->col++;
525 ctxt->input->cur++;
526 ctxt->nbChars++;
527 if (*ctxt->input->cur == 0)
528 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
529 }
530 res++;
531 }
532 return(res);
533}
534
535
536
537/************************************************************************
538 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200539 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000540 * *
541 ************************************************************************/
542
543/*
 * Start Tag: 1 means the start tag can be omitted
 * End Tag:   1 means the end tag can be omitted
546 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000547 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000548 * Depr: this element is deprecated
549 * DTD: 1 means that this element is valid only in the Loose DTD
550 * 2 means that this element is valid only in the Frameset DTD
551 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000552 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000553 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000554 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000555
556/* Definitions and a couple of vars for HTML Elements */
557
/*
 * Element name lists. Each list macro expands to a comma separated
 * sequence of string literals WITHOUT a trailing comma, and the matching
 * NB_ macro gives the number of names in it. PCDATA and MODIFIER are
 * empty. Lists must always be joined with an explicit comma, otherwise
 * the neighbouring string literals get concatenated into a single bogus
 * name.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow and inline element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
589
590
591/* ... and for HTML Attributes */
592
/*
 * Attribute name lists. Each list macro expands to a comma separated
 * sequence of string literals, and the matching NB_ macro gives the
 * number of names in it.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated attribute lists shared by several elements */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000610
611
612/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000613static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000614 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
615 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000616static const char* const target_attr[] = { "target", NULL } ;
617static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
618static const char* const alt_attr[] = { "alt", NULL } ;
619static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
620static const char* const href_attrs[] = { "href", NULL } ;
621static const char* const clear_attrs[] = { "clear", NULL } ;
622static const char* const inline_p[] = { INLINE, "p", NULL } ;
623
624static const char* const flow_param[] = { FLOW, "param", NULL } ;
625static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000626 "archive", "alt", "name", "height", "width", "align",
627 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000628static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000629 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000630static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000631 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000632static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
633static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
634static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
635static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000636 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000637static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000638 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
639
640
Daniel Veillard065abe82006-07-03 08:55:04 +0000641static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
642static const char* const col_elt[] = { "col", NULL } ;
643static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
644static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
645static const char* const dl_contents[] = { "dt", "dd", NULL } ;
646static const char* const compact_attr[] = { "compact", NULL } ;
647static const char* const label_attr[] = { "label", NULL } ;
648static const char* const fieldset_contents[] = { FLOW, "legend" } ;
649static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
650static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
651static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
652static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
653static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
654static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
655static const char* const head_attrs[] = { I18N, "profile", NULL } ;
656static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
657static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
658static const char* const version_attr[] = { "version", NULL } ;
659static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
660static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
661static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000662static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000663static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
664static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
665static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
666static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
667static const char* const align_attr[] = { "align", NULL } ;
668static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
669static const char* const map_contents[] = { BLOCK, "area", NULL } ;
670static const char* const name_attr[] = { "name", NULL } ;
671static const char* const action_attr[] = { "action", NULL } ;
672static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
673static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
674static const char* const content_attr[] = { "content", NULL } ;
675static const char* const type_attr[] = { "type", NULL } ;
676static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
677static const char* const object_contents[] = { FLOW, "param", NULL } ;
678static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
679static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
680static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
681static const char* const option_elt[] = { "option", NULL } ;
682static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
683static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
684static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
685static const char* const width_attr[] = { "width", NULL } ;
686static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
687static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
688static const char* const language_attr[] = { "language", NULL } ;
689static const char* const select_content[] = { "optgroup", "option", NULL } ;
690static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
691static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200692static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000693static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
694static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
695static const char* const tr_elt[] = { "tr", NULL } ;
696static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
697static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
698static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
699static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
700static const char* const tr_contents[] = { "th", "td", NULL } ;
701static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
702static const char* const li_elt[] = { "li", NULL } ;
703static const char* const ul_depr[] = { "type", "compact", NULL} ;
704static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000705
706#define DECL (const char**)
707
Daniel Veillard22090732001-07-16 00:06:07 +0000708static const htmlElemDesc
709html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000710{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
711 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
712},
713{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
714 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
715},
716{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
717 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
718},
719{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
720 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
721},
722{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
723 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
724},
725{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
726 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
727},
728{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
729 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
730},
731{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
732 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
733},
734{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
735 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
736},
737{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
738 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
739},
740{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
741 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
742},
743{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
744 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
745},
746{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
747 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
748},
749{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
750 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
751},
752{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
753 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
754},
755{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
756 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
757},
758{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
759 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
760},
761{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
762 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
763},
764{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
765 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
766},
767{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
768 EMPTY , NULL , DECL col_attrs , NULL, NULL
769},
770{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
771 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
772},
773{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
774 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
775},
776{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
777 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
778},
779{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
780 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
781},
782{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
783 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
784},
785{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
786 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
787},
788{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000789 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000790},
791{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
792 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
793},
794{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
795 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
796},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000797{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000798 EMPTY, NULL, DECL embed_attrs, NULL, NULL
799},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000800{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
801 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
802},
803{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
804 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
805},
806{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
807 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
808},
809{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
810 EMPTY, NULL, NULL, DECL frame_attrs, NULL
811},
812{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
813 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
814},
815{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
816 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
817},
818{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
819 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
820},
821{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
822 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
823},
824{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
825 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
826},
827{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
828 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
829},
830{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
831 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
832},
833{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
834 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
835},
836{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
837 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
838},
839{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
840 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
841},
842{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
843 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
844},
845{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
846 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
847},
848{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000849 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000850},
851{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
852 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
853},
854{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
855 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
856},
857{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
858 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
859},
860{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
861 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
862},
863{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
864 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
865},
866{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
867 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
868},
869{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
870 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
871},
872{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
873 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
874},
875{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000876 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000877},
878{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
879 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
880},
881{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
882 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
883},
884{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
885 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
886},
887{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
888 DECL html_flow, "div", DECL html_attrs, NULL, NULL
889},
890{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
891 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
892},
893{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
894 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
895},
896{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000897 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000898},
899{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
900 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
901},
902{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
903 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
904},
905{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000906 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000907},
908{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
909 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
910},
911{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
912 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
913},
914{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
915 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
916},
917{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
918 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
919},
920{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
921 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
922},
923{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
924 DECL select_content, NULL, DECL select_attrs, NULL, NULL
925},
926{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
927 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
928},
929{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
930 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
931},
932{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
933 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
934},
935{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
936 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
937},
938{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
939 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
940},
941{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
942 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
943},
944{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
945 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
946},
947{ "table", 0, 0, 0, 0, 0, 0, 0, "",
948 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
949},
950{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
951 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
952},
953{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
954 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
955},
956{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
957 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
958},
959{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
960 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
961},
962{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
963 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
964},
965{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
966 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
967},
968{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
969 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
970},
971{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
972 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
973},
974{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
975 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
976},
977{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
978 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
979},
980{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
981 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
982},
983{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
984 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
985}
Owen Taylor3473f882001-02-23 17:55:21 +0000986};
987
988/*
Owen Taylor3473f882001-02-23 17:55:21 +0000989 * start tags that imply the end of current element
990 */
Daniel Veillard065abe82006-07-03 08:55:04 +0000991static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000992"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
993 "dl", "ul", "ol", "menu", "dir", "address", "pre",
994 "listing", "xmp", "head", NULL,
995"head", "p", NULL,
996"title", "p", NULL,
997"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +0000998"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +0000999"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1000 "pre", "listing", "xmp", "head", "li", NULL,
1001"hr", "p", "head", NULL,
1002"h1", "p", "head", NULL,
1003"h2", "p", "head", NULL,
1004"h3", "p", "head", NULL,
1005"h4", "p", "head", NULL,
1006"h5", "p", "head", NULL,
1007"h6", "p", "head", NULL,
1008"dir", "p", "head", NULL,
1009"address", "p", "head", "ul", NULL,
1010"pre", "p", "head", "ul", NULL,
1011"listing", "p", "head", NULL,
1012"xmp", "p", "head", NULL,
1013"blockquote", "p", "head", NULL,
1014"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1015 "xmp", "head", NULL,
1016"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1017 "head", "dd", NULL,
1018"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1019 "head", "dt", NULL,
1020"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1021 "listing", "xmp", NULL,
1022"ol", "p", "head", "ul", NULL,
1023"menu", "p", "head", "ul", NULL,
Daniel Veillard6339c1a2009-08-24 11:59:51 +02001024"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001025"div", "p", "head", NULL,
1026"noscript", "p", "head", NULL,
1027"center", "font", "b", "i", "p", "head", NULL,
1028"a", "a", NULL,
1029"caption", "p", NULL,
1030"colgroup", "caption", "colgroup", "col", "p", NULL,
1031"col", "caption", "col", "p", NULL,
1032"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1033 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001034"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Daniel Veillarde77db162009-08-22 11:32:38 +02001035"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001036"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1037"thead", "caption", "col", "colgroup", NULL,
1038"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1039 "tbody", "p", NULL,
1040"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1041 "tfoot", "tbody", "p", NULL,
1042"optgroup", "option", NULL,
1043"option", "option", NULL,
1044"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1045 "pre", "listing", "xmp", "a", NULL,
1046NULL
1047};
1048
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly and for which a p element gets implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1061
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array carries no NULL terminator, so its length has to be derived
 * with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", which matches no attribute */
    "onchange",
    "onselect"
};
1087
/*
 * End-tag priorities used when dealing with broken HTML pages.
 * Giving different priorities to different elements lets the parser
 * decide what to do with stray end tags: an end tag may only close
 * open elements whose priority is lower than or equal to its own.
 */

/* One (element name, priority) pair of the priority table. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The table is terminated by an entry whose name is NULL; the priority
 * of that entry is the default for every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001115
/* Pointers to the first string of each htmlStartClose row; unused slots are NULL. */
static const char **htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index above has been filled. */
static int htmlStartCloseIndexinitialized = 0;
1118
1119/************************************************************************
1120 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001121 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001122 * *
1123 ************************************************************************/
1124
1125/**
1126 * htmlInitAutoClose:
1127 *
1128 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1129 * This is not reentrant. Call xmlInitParser() once before processing in
1130 * case of use in multithreaded programs.
1131 */
1132void
1133htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001134 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001135
1136 if (htmlStartCloseIndexinitialized) return;
1137
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001138 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1139 indx = 0;
1140 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001141 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001142 while (htmlStartClose[i] != NULL) i++;
1143 i++;
1144 }
1145 htmlStartCloseIndexinitialized = 1;
1146}
1147
1148/**
1149 * htmlTagLookup:
1150 * @tag: The tag name in lowercase
1151 *
1152 * Lookup the HTML tag in the ElementTable
1153 *
1154 * Returns the related htmlElemDescPtr or NULL if not found.
1155 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001156const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001157htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001158 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001159
1160 for (i = 0; i < (sizeof(html40ElementTable) /
1161 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001162 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001163 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001164 }
1165 return(NULL);
1166}
1167
1168/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001169 * htmlGetEndPriority:
1170 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001171 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001172 * Return value: The "endtag" priority.
1173 **/
1174static int
1175htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001176 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001177
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001178 while ((htmlEndPriority[i].name != NULL) &&
1179 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1180 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001181
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001182 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001183}
1184
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001185
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001186/**
Owen Taylor3473f882001-02-23 17:55:21 +00001187 * htmlCheckAutoClose:
1188 * @newtag: The new tag name
1189 * @oldtag: The old tag name
1190 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001191 * Checks whether the new tag is one of the registered valid tags for
1192 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001193 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1194 *
1195 * Returns 0 if no, 1 if yes.
1196 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001197static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001198htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1199{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001200 int i, indx;
1201 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001202
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001203 if (htmlStartCloseIndexinitialized == 0)
1204 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001205
1206 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001207 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001208 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001209 if (closed == NULL)
1210 return (0);
1211 if (xmlStrEqual(BAD_CAST * closed, newtag))
1212 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001213 }
1214
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001215 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001216 i++;
1217 while (htmlStartClose[i] != NULL) {
1218 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001219 return (1);
1220 }
1221 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001222 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001223 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001224}
1225
1226/**
1227 * htmlAutoCloseOnClose:
1228 * @ctxt: an HTML parser context
1229 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001230 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001231 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001232 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001233 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001234static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001235htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1236{
1237 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001238 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001239
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001240 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001241
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001242 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001243
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001244 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1245 break;
1246 /*
1247 * A missplaced endtag can only close elements with lower
1248 * or equal priority, so if we find an element with higher
1249 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001250 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001251 */
1252 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1253 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001254 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001255 if (i < 0)
1256 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001257
1258 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001259 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001260 if ((info != NULL) && (info->endTag == 3)) {
1261 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1262 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001263 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001264 }
1265 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1266 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001267 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001268 }
1269}
1270
1271/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001272 * htmlAutoCloseOnEnd:
1273 * @ctxt: an HTML parser context
1274 *
1275 * Close all remaining tags at the end of the stream
1276 */
1277static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001278htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1279{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001280 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001281
William M. Brack899e64a2003-09-26 18:03:42 +00001282 if (ctxt->nameNr == 0)
1283 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001284 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001285 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1286 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001287 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001288 }
1289}
1290
1291/**
Owen Taylor3473f882001-02-23 17:55:21 +00001292 * htmlAutoClose:
1293 * @ctxt: an HTML parser context
1294 * @newtag: The new tag name or NULL
1295 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001296 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001297 * The list is kept in htmlStartClose array. This function is
1298 * called when a new tag has been detected and generates the
1299 * appropriates closes if possible/needed.
1300 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001301 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001302 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001303static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001304htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1305{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001306 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001307 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001308 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1309 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001310 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001311 }
1312 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001313 htmlAutoCloseOnEnd(ctxt);
1314 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001315 }
1316 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001317 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1318 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1319 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001320 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1321 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001322 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001323 }
Owen Taylor3473f882001-02-23 17:55:21 +00001324}
1325
1326/**
1327 * htmlAutoCloseTag:
1328 * @doc: the HTML document
1329 * @name: The tag name
1330 * @elem: the HTML element
1331 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001332 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001333 * The list is kept in htmlStartClose array. This function checks
1334 * if the element or one of it's children would autoclose the
1335 * given tag.
1336 *
1337 * Returns 1 if autoclose, 0 otherwise
1338 */
1339int
1340htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1341 htmlNodePtr child;
1342
1343 if (elem == NULL) return(1);
1344 if (xmlStrEqual(name, elem->name)) return(0);
1345 if (htmlCheckAutoClose(elem->name, name)) return(1);
1346 child = elem->children;
1347 while (child != NULL) {
1348 if (htmlAutoCloseTag(doc, name, child)) return(1);
1349 child = child->next;
1350 }
1351 return(0);
1352}
1353
1354/**
1355 * htmlIsAutoClosed:
1356 * @doc: the HTML document
1357 * @elem: the HTML element
1358 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001359 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001360 * The list is kept in htmlStartClose array. This function checks
1361 * if a tag is autoclosed by one of it's child
1362 *
1363 * Returns 1 if autoclosed, 0 otherwise
1364 */
1365int
1366htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1367 htmlNodePtr child;
1368
1369 if (elem == NULL) return(1);
1370 child = elem->children;
1371 while (child != NULL) {
1372 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1373 child = child->next;
1374 }
1375 return(0);
1376}
1377
1378/**
1379 * htmlCheckImplied:
1380 * @ctxt: an HTML parser context
1381 * @newtag: The new tag name
1382 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001383 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001384 * called when a new tag has been detected and generates the
1385 * appropriates implicit tags if missing
1386 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001387static void
Owen Taylor3473f882001-02-23 17:55:21 +00001388htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1389 if (!htmlOmittedDefaultValue)
1390 return;
1391 if (xmlStrEqual(newtag, BAD_CAST"html"))
1392 return;
1393 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001394 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001395 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1396 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1397 }
1398 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1399 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001400 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001401 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1402 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1403 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1404 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1405 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1406 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001407 if (ctxt->html >= 3) {
1408 /* we already saw or generated an <head> before */
1409 return;
1410 }
1411 /*
1412 * dropped OBJECT ... i you put it first BODY will be
1413 * assumed !
1414 */
1415 htmlnamePush(ctxt, BAD_CAST"head");
1416 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1417 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001418 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1419 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1420 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001421 if (ctxt->html >= 10) {
1422 /* we already saw or generated a <body> before */
1423 return;
1424 }
Owen Taylor3473f882001-02-23 17:55:21 +00001425 int i;
1426 for (i = 0;i < ctxt->nameNr;i++) {
1427 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1428 return;
1429 }
1430 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1431 return;
1432 }
1433 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001434
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001435 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001436 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1437 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1438 }
1439}
1440
1441/**
1442 * htmlCheckParagraph
1443 * @ctxt: an HTML parser context
1444 *
1445 * Check whether a p element need to be implied before inserting
1446 * characters in the current element.
1447 *
1448 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1449 * in case of error.
1450 */
1451
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001452static int
Owen Taylor3473f882001-02-23 17:55:21 +00001453htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1454 const xmlChar *tag;
1455 int i;
1456
1457 if (ctxt == NULL)
1458 return(-1);
1459 tag = ctxt->name;
1460 if (tag == NULL) {
1461 htmlAutoClose(ctxt, BAD_CAST"p");
1462 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001463 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001464 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1465 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1466 return(1);
1467 }
1468 if (!htmlOmittedDefaultValue)
1469 return(0);
1470 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1471 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001472 htmlAutoClose(ctxt, BAD_CAST"p");
1473 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001474 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001475 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1476 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1477 return(1);
1478 }
1479 }
1480 return(0);
1481}
1482
1483/**
1484 * htmlIsScriptAttribute:
1485 * @name: an attribute name
1486 *
1487 * Check if an attribute is of content type Script
1488 *
1489 * Returns 1 is the attribute is a script 0 otherwise
1490 */
1491int
1492htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001493 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001494
1495 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001496 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001497 /*
1498 * all script attributes start with 'on'
1499 */
1500 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001501 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001502 for (i = 0;
1503 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1504 i++) {
1505 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1506 return(1);
1507 }
1508 return(0);
1509}
1510
1511/************************************************************************
1512 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001513 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001514 * *
1515 ************************************************************************/
1516
1517
Daniel Veillard22090732001-07-16 00:06:07 +00001518static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001519/*
1520 * the 4 absolute ones, plus apostrophe.
1521 */
1522{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1523{ 38, "amp", "ampersand, U+0026 ISOnum" },
1524{ 39, "apos", "single quote" },
1525{ 60, "lt", "less-than sign, U+003C ISOnum" },
1526{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1527
1528/*
1529 * A bunch still in the 128-255 range
1530 * Replacing them depend really on the charset used.
1531 */
1532{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1533{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1534{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1535{ 163, "pound","pound sign, U+00A3 ISOnum" },
1536{ 164, "curren","currency sign, U+00A4 ISOnum" },
1537{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1538{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1539{ 167, "sect", "section sign, U+00A7 ISOnum" },
1540{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1541{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1542{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1543{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1544{ 172, "not", "not sign, U+00AC ISOnum" },
1545{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1546{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1547{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1548{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1549{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1550{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1551{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1552{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1553{ 181, "micro","micro sign, U+00B5 ISOnum" },
1554{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1555{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1556{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1557{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1558{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1559{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1560{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1561{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1562{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1563{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1564{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1565{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1566{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1567{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1568{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1569{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1570{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1571{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1572{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1573{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1574{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1575{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1576{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1577{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1578{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1579{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1580{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1581{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1582{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1583{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1584{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1585{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1586{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1587{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1588{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1589{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1590{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1591{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1592{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1593{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1594{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1595{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1596{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1597{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1598{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1599{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1600{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1601{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1602{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1603{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1604{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1605{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1606{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1607{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1608{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1609{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1610{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1611{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1612{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1613{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1614{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1615{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1616{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1617{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1618{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1619{ 247, "divide","division sign, U+00F7 ISOnum" },
1620{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1621{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1622{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1623{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1624{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1625{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1626{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1627{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1628
1629{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1630{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1631{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1632{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1633{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1634
1635/*
1636 * Anything below should really be kept as entities references
1637 */
1638{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1639
1640{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1641{ 732, "tilde","small tilde, U+02DC ISOdia" },
1642
1643{ 913, "Alpha","greek capital letter alpha, U+0391" },
1644{ 914, "Beta", "greek capital letter beta, U+0392" },
1645{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1646{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1647{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1648{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1649{ 919, "Eta", "greek capital letter eta, U+0397" },
1650{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1651{ 921, "Iota", "greek capital letter iota, U+0399" },
1652{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001653{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001654{ 924, "Mu", "greek capital letter mu, U+039C" },
1655{ 925, "Nu", "greek capital letter nu, U+039D" },
1656{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1657{ 927, "Omicron","greek capital letter omicron, U+039F" },
1658{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1659{ 929, "Rho", "greek capital letter rho, U+03A1" },
1660{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1661{ 932, "Tau", "greek capital letter tau, U+03A4" },
1662{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1663{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1664{ 935, "Chi", "greek capital letter chi, U+03A7" },
1665{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1666{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1667
1668{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1669{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1670{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1671{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1672{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1673{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1674{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1675{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1676{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1677{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1678{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1679{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1680{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1681{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1682{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1683{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1684{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1685{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1686{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1687{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1688{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1689{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1690{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1691{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1692{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1693{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1694{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1695{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1696
1697{ 8194, "ensp", "en space, U+2002 ISOpub" },
1698{ 8195, "emsp", "em space, U+2003 ISOpub" },
1699{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1700{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1701{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1702{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1703{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1704{ 8211, "ndash","en dash, U+2013 ISOpub" },
1705{ 8212, "mdash","em dash, U+2014 ISOpub" },
1706{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1707{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1708{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1709{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1710{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1711{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1712{ 8224, "dagger","dagger, U+2020 ISOpub" },
1713{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1714
1715{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1716{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1717
1718{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1719
1720{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1721{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1722
1723{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1724{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1725
1726{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1727{ 8260, "frasl","fraction slash, U+2044 NEW" },
1728
1729{ 8364, "euro", "euro sign, U+20AC NEW" },
1730
1731{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1732{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1733{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1734{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1735{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1736{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1737{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1738{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1739{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1740{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1741{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1742{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1743{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1744{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1745{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1746{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1747
1748{ 8704, "forall","for all, U+2200 ISOtech" },
1749{ 8706, "part", "partial differential, U+2202 ISOtech" },
1750{ 8707, "exist","there exists, U+2203 ISOtech" },
1751{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1752{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1753{ 8712, "isin", "element of, U+2208 ISOtech" },
1754{ 8713, "notin","not an element of, U+2209 ISOtech" },
1755{ 8715, "ni", "contains as member, U+220B ISOtech" },
1756{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001757{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001758{ 8722, "minus","minus sign, U+2212 ISOtech" },
1759{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1760{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1761{ 8733, "prop", "proportional to, U+221D ISOtech" },
1762{ 8734, "infin","infinity, U+221E ISOtech" },
1763{ 8736, "ang", "angle, U+2220 ISOamso" },
1764{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1765{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1766{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1767{ 8746, "cup", "union = cup, U+222A ISOtech" },
1768{ 8747, "int", "integral, U+222B ISOtech" },
1769{ 8756, "there4","therefore, U+2234 ISOtech" },
1770{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1771{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1772{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1773{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1774{ 8801, "equiv","identical to, U+2261 ISOtech" },
1775{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1776{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1777{ 8834, "sub", "subset of, U+2282 ISOtech" },
1778{ 8835, "sup", "superset of, U+2283 ISOtech" },
1779{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1780{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1781{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1782{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1783{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1784{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1785{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1786{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1787{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1788{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1789{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1790{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1791{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1792{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1793
1794{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1795{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1796{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1797{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1798
1799};
1800
1801/************************************************************************
1802 * *
1803 * Commodity functions to handle entities *
1804 * *
1805 ************************************************************************/
1806
/*
 * Macro used to grow the current buffer: the size kept in the companion
 * variable buffer##_size is doubled and the buffer reallocated to it.
 * The macro expects a variable named ctxt to be in scope for error
 * reporting. If the reallocation fails, the error is reported, the old
 * buffer is freed and the enclosing function returns NULL, so it can
 * only be used inside functions returning a pointer.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {                                                  \
        buffer = tmp;                                                   \
    } else {                                                            \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
}
1821
1822/**
1823 * htmlEntityLookup:
1824 * @name: the entity name
1825 *
1826 * Lookup the given entity in EntitiesTable
1827 *
1828 * TODO: the linear scan is really ugly, an hash table is really needed.
1829 *
1830 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1831 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001832const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001833htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001834 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001835
1836 for (i = 0;i < (sizeof(html40EntitiesTable)/
1837 sizeof(html40EntitiesTable[0]));i++) {
1838 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001839 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001840 }
1841 }
1842 return(NULL);
1843}
1844
1845/**
1846 * htmlEntityValueLookup:
1847 * @value: the entity's unicode value
1848 *
1849 * Lookup the given entity in EntitiesTable
1850 *
1851 * TODO: the linear scan is really ugly, an hash table is really needed.
1852 *
1853 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1854 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001855const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001856htmlEntityValueLookup(unsigned int value) {
1857 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001858
1859 for (i = 0;i < (sizeof(html40EntitiesTable)/
1860 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001861 if (html40EntitiesTable[i].value >= value) {
1862 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001863 break;
William M. Brack78637da2003-07-31 14:47:38 +00001864 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001865 }
Owen Taylor3473f882001-02-23 17:55:21 +00001866 }
1867 return(NULL);
1868}
1869
1870/**
1871 * UTF8ToHtml:
1872 * @out: a pointer to an array of bytes to store the result
1873 * @outlen: the length of @out
1874 * @in: a pointer to an array of UTF-8 chars
1875 * @inlen: the length of @in
1876 *
1877 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1878 * plus HTML entities block of chars out.
1879 *
1880 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1881 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001882 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001883 * The value of @outlen after return is the number of octets consumed.
1884 */
1885int
1886UTF8ToHtml(unsigned char* out, int *outlen,
1887 const unsigned char* in, int *inlen) {
1888 const unsigned char* processed = in;
1889 const unsigned char* outend;
1890 const unsigned char* outstart = out;
1891 const unsigned char* instart = in;
1892 const unsigned char* inend;
1893 unsigned int c, d;
1894 int trailing;
1895
Daniel Veillardce682bc2004-11-05 17:22:25 +00001896 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001897 if (in == NULL) {
1898 /*
1899 * initialization nothing to do
1900 */
1901 *outlen = 0;
1902 *inlen = 0;
1903 return(0);
1904 }
1905 inend = in + (*inlen);
1906 outend = out + (*outlen);
1907 while (in < inend) {
1908 d = *in++;
1909 if (d < 0x80) { c= d; trailing= 0; }
1910 else if (d < 0xC0) {
1911 /* trailing byte in leading position */
1912 *outlen = out - outstart;
1913 *inlen = processed - instart;
1914 return(-2);
1915 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1916 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1917 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1918 else {
1919 /* no chance for this in Ascii */
1920 *outlen = out - outstart;
1921 *inlen = processed - instart;
1922 return(-2);
1923 }
1924
1925 if (inend - in < trailing) {
1926 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02001927 }
Owen Taylor3473f882001-02-23 17:55:21 +00001928
1929 for ( ; trailing; trailing--) {
1930 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1931 break;
1932 c <<= 6;
1933 c |= d & 0x3F;
1934 }
1935
1936 /* assertion: c is a single UTF-4 value */
1937 if (c < 0x80) {
1938 if (out + 1 >= outend)
1939 break;
1940 *out++ = c;
1941 } else {
1942 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001943 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001944 const char *cp;
1945 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001946
1947 /*
1948 * Try to lookup a predefined HTML entity for it
1949 */
1950
1951 ent = htmlEntityValueLookup(c);
1952 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001953 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1954 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001955 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001956 else
1957 cp = ent->name;
1958 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001959 if (out + 2 + len >= outend)
1960 break;
1961 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001962 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001963 out += len;
1964 *out++ = ';';
1965 }
1966 processed = in;
1967 }
1968 *outlen = out - outstart;
1969 *inlen = processed - instart;
1970 return(0);
1971}
1972
1973/**
1974 * htmlEncodeEntities:
1975 * @out: a pointer to an array of bytes to store the result
1976 * @outlen: the length of @out
1977 * @in: a pointer to an array of UTF-8 chars
1978 * @inlen: the length of @in
1979 * @quoteChar: the quote character to escape (' or ") or zero.
1980 *
1981 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1982 * plus HTML entities block of chars out.
1983 *
1984 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1985 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001986 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001987 * The value of @outlen after return is the number of octets consumed.
1988 */
1989int
1990htmlEncodeEntities(unsigned char* out, int *outlen,
1991 const unsigned char* in, int *inlen, int quoteChar) {
1992 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001993 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001994 const unsigned char* outstart = out;
1995 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001996 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001997 unsigned int c, d;
1998 int trailing;
1999
Daniel Veillardce682bc2004-11-05 17:22:25 +00002000 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2001 return(-1);
2002 outend = out + (*outlen);
2003 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00002004 while (in < inend) {
2005 d = *in++;
2006 if (d < 0x80) { c= d; trailing= 0; }
2007 else if (d < 0xC0) {
2008 /* trailing byte in leading position */
2009 *outlen = out - outstart;
2010 *inlen = processed - instart;
2011 return(-2);
2012 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2013 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2014 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2015 else {
2016 /* no chance for this in Ascii */
2017 *outlen = out - outstart;
2018 *inlen = processed - instart;
2019 return(-2);
2020 }
2021
2022 if (inend - in < trailing)
2023 break;
2024
2025 while (trailing--) {
2026 if (((d= *in++) & 0xC0) != 0x80) {
2027 *outlen = out - outstart;
2028 *inlen = processed - instart;
2029 return(-2);
2030 }
2031 c <<= 6;
2032 c |= d & 0x3F;
2033 }
2034
2035 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002036 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2037 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002038 if (out >= outend)
2039 break;
2040 *out++ = c;
2041 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002042 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002043 const char *cp;
2044 char nbuf[16];
2045 int len;
2046
2047 /*
2048 * Try to lookup a predefined HTML entity for it
2049 */
2050 ent = htmlEntityValueLookup(c);
2051 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002052 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002053 cp = nbuf;
2054 }
2055 else
2056 cp = ent->name;
2057 len = strlen(cp);
2058 if (out + 2 + len > outend)
2059 break;
2060 *out++ = '&';
2061 memcpy(out, cp, len);
2062 out += len;
2063 *out++ = ';';
2064 }
2065 processed = in;
2066 }
2067 *outlen = out - outstart;
2068 *inlen = processed - instart;
2069 return(0);
2070}
2071
Owen Taylor3473f882001-02-23 17:55:21 +00002072/************************************************************************
2073 * *
2074 * Commodity functions to handle streams *
2075 * *
2076 ************************************************************************/
2077
2078/**
Owen Taylor3473f882001-02-23 17:55:21 +00002079 * htmlNewInputStream:
2080 * @ctxt: an HTML parser context
2081 *
2082 * Create a new input stream structure
2083 * Returns the new input stream or NULL
2084 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002085static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002086htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2087 htmlParserInputPtr input;
2088
2089 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2090 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002091 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002092 return(NULL);
2093 }
2094 memset(input, 0, sizeof(htmlParserInput));
2095 input->filename = NULL;
2096 input->directory = NULL;
2097 input->base = NULL;
2098 input->cur = NULL;
2099 input->buf = NULL;
2100 input->line = 1;
2101 input->col = 1;
2102 input->buf = NULL;
2103 input->free = NULL;
2104 input->version = NULL;
2105 input->consumed = 0;
2106 input->length = 0;
2107 return(input);
2108}
2109
2110
2111/************************************************************************
2112 * *
2113 * Commodity functions, cleanup needed ? *
2114 * *
2115 ************************************************************************/
/*
 * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002130
2131/**
2132 * areBlanks:
2133 * @ctxt: an HTML parser context
2134 * @str: a xmlChar *
2135 * @len: the size of @str
2136 *
2137 * Is this a sequence of blank chars that one can ignore ?
2138 *
2139 * Returns 1 if ignorable 0 otherwise.
2140 */
2141
2142static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002143 unsigned int i;
2144 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002145 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002146 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002147
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002148 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002149 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002150
2151 if (CUR == 0) return(1);
2152 if (CUR != '<') return(0);
2153 if (ctxt->name == NULL)
2154 return(1);
2155 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2156 return(1);
2157 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2158 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002159
2160 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2161 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2162 dtd = xmlGetIntSubset(ctxt->myDoc);
2163 if (dtd != NULL && dtd->ExternalID != NULL) {
2164 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2165 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2166 return(1);
2167 }
2168 }
2169
Owen Taylor3473f882001-02-23 17:55:21 +00002170 if (ctxt->node == NULL) return(0);
2171 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002172 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2173 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002174 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002175 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2176 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002177 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002178 for all tags "b" allowing PCDATA */
2179 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2180 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2181 return(0);
2182 }
2183 }
Owen Taylor3473f882001-02-23 17:55:21 +00002184 } else if (xmlNodeIsText(lastChild)) {
2185 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002186 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002187 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002188 for all tags "p" allowing PCDATA */
2189 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2190 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2191 return(0);
2192 }
2193 }
Owen Taylor3473f882001-02-23 17:55:21 +00002194 }
2195 return(1);
2196}
2197
2198/**
Owen Taylor3473f882001-02-23 17:55:21 +00002199 * htmlNewDocNoDtD:
2200 * @URI: URI for the dtd, or NULL
2201 * @ExternalID: the external ID of the DTD, or NULL
2202 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002203 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2204 * are NULL
2205 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002206 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002207 */
2208htmlDocPtr
2209htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2210 xmlDocPtr cur;
2211
2212 /*
2213 * Allocate a new document and fill the fields.
2214 */
2215 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2216 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002217 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002218 return(NULL);
2219 }
2220 memset(cur, 0, sizeof(xmlDoc));
2221
2222 cur->type = XML_HTML_DOCUMENT_NODE;
2223 cur->version = NULL;
2224 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002225 cur->doc = cur;
2226 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002227 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002228 cur->extSubset = NULL;
2229 cur->oldNs = NULL;
2230 cur->encoding = NULL;
2231 cur->standalone = 1;
2232 cur->compression = 0;
2233 cur->ids = NULL;
2234 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002235 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002236 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002237 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002238 if ((ExternalID != NULL) ||
2239 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002240 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002241 return(cur);
2242}
2243
2244/**
2245 * htmlNewDoc:
2246 * @URI: URI for the dtd, or NULL
2247 * @ExternalID: the external ID of the DTD, or NULL
2248 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002249 * Creates a new HTML document
2250 *
Owen Taylor3473f882001-02-23 17:55:21 +00002251 * Returns a new document
2252 */
2253htmlDocPtr
2254htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2255 if ((URI == NULL) && (ExternalID == NULL))
2256 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002257 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2258 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002259
2260 return(htmlNewDocNoDtD(URI, ExternalID));
2261}
2262
2263
2264/************************************************************************
2265 * *
2266 * The parser itself *
2267 * Relates to http://www.w3.org/TR/html40 *
2268 * *
2269 ************************************************************************/
2270
2271/************************************************************************
2272 * *
2273 * The parser itself *
2274 * *
2275 ************************************************************************/
2276
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002277static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002278
Owen Taylor3473f882001-02-23 17:55:21 +00002279/**
2280 * htmlParseHTMLName:
2281 * @ctxt: an HTML parser context
2282 *
2283 * parse an HTML tag or attribute name, note that we convert it to lowercase
2284 * since HTML names are not case-sensitive.
2285 *
2286 * Returns the Tag Name parsed or NULL
2287 */
2288
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002289static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002290htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002291 int i = 0;
2292 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2293
William M. Brackd1757ab2004-10-02 22:07:48 +00002294 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002295 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002296
2297 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002298 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002299 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2300 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002301 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2302 else loc[i] = CUR;
2303 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002304
Owen Taylor3473f882001-02-23 17:55:21 +00002305 NEXT;
2306 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002307
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002308 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002309}
2310
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002311
2312/**
2313 * htmlParseHTMLName_nonInvasive:
2314 * @ctxt: an HTML parser context
2315 *
2316 * parse an HTML tag or attribute name, note that we convert it to lowercase
2317 * since HTML names are not case-sensitive, this doesn't consume the data
2318 * from the stream, it's a look-ahead
2319 *
2320 * Returns the Tag Name parsed or NULL
2321 */
2322
2323static const xmlChar *
2324htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2325 int i = 0;
2326 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2327
2328 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2329 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002330
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002331 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2332 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2333 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2334 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2335 else loc[i] = NXT(1+i);
2336 i++;
2337 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002338
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002339 return(xmlDictLookup(ctxt->dict, loc, i));
2340}
2341
2342
Owen Taylor3473f882001-02-23 17:55:21 +00002343/**
2344 * htmlParseName:
2345 * @ctxt: an HTML parser context
2346 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002347 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002348 *
2349 * Returns the Name parsed or NULL
2350 */
2351
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002352static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002353htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002354 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002355 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002356 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002357
2358 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002359
2360 /*
2361 * Accelerator for simple ASCII names
2362 */
2363 in = ctxt->input->cur;
2364 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2365 ((*in >= 0x41) && (*in <= 0x5A)) ||
2366 (*in == '_') || (*in == ':')) {
2367 in++;
2368 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2369 ((*in >= 0x41) && (*in <= 0x5A)) ||
2370 ((*in >= 0x30) && (*in <= 0x39)) ||
2371 (*in == '_') || (*in == '-') ||
2372 (*in == ':') || (*in == '.'))
2373 in++;
2374 if ((*in > 0) && (*in < 0x80)) {
2375 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002376 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002377 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002378 ctxt->nbChars += count;
2379 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002380 return(ret);
2381 }
2382 }
2383 return(htmlParseNameComplex(ctxt));
2384}
2385
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002386static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002387htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002388 int len = 0, l;
2389 int c;
2390 int count = 0;
2391
2392 /*
2393 * Handler for more complex cases
2394 */
2395 GROW;
2396 c = CUR_CHAR(l);
2397 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2398 (!IS_LETTER(c) && (c != '_') &&
2399 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002400 return(NULL);
2401 }
2402
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002403 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2404 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2405 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002406 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002407 (IS_COMBINING(c)) ||
2408 (IS_EXTENDER(c)))) {
2409 if (count++ > 100) {
2410 count = 0;
2411 GROW;
2412 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002413 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002414 NEXTL(l);
2415 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002416 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002417 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002418}
2419
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002420
Owen Taylor3473f882001-02-23 17:55:21 +00002421/**
2422 * htmlParseHTMLAttribute:
2423 * @ctxt: an HTML parser context
2424 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002425 *
Owen Taylor3473f882001-02-23 17:55:21 +00002426 * parse an HTML attribute value till the stop (quote), if
2427 * stop is 0 then it stops at the first space
2428 *
2429 * Returns the attribute parsed or NULL
2430 */
2431
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002432static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002433htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2434 xmlChar *buffer = NULL;
2435 int buffer_size = 0;
2436 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002437 const xmlChar *name = NULL;
2438 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002439 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002440
2441 /*
2442 * allocate a translation buffer.
2443 */
2444 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002445 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002446 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002447 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002448 return(NULL);
2449 }
2450 out = buffer;
2451
2452 /*
2453 * Ok loop until we reach one of the ending chars
2454 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002455 while ((CUR != 0) && (CUR != stop)) {
2456 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002457 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002458 if (CUR == '&') {
2459 if (NXT(1) == '#') {
2460 unsigned int c;
2461 int bits;
2462
2463 c = htmlParseCharRef(ctxt);
2464 if (c < 0x80)
2465 { *out++ = c; bits= -6; }
2466 else if (c < 0x800)
2467 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2468 else if (c < 0x10000)
2469 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002470 else
Owen Taylor3473f882001-02-23 17:55:21 +00002471 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002472
Owen Taylor3473f882001-02-23 17:55:21 +00002473 for ( ; bits >= 0; bits-= 6) {
2474 *out++ = ((c >> bits) & 0x3F) | 0x80;
2475 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002476
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002477 if (out - buffer > buffer_size - 100) {
2478 int indx = out - buffer;
2479
2480 growBuffer(buffer);
2481 out = &buffer[indx];
2482 }
Owen Taylor3473f882001-02-23 17:55:21 +00002483 } else {
2484 ent = htmlParseEntityRef(ctxt, &name);
2485 if (name == NULL) {
2486 *out++ = '&';
2487 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002488 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002489
2490 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002491 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002492 }
2493 } else if (ent == NULL) {
2494 *out++ = '&';
2495 cur = name;
2496 while (*cur != 0) {
2497 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002498 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002499
2500 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002501 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002502 }
2503 *out++ = *cur++;
2504 }
Owen Taylor3473f882001-02-23 17:55:21 +00002505 } else {
2506 unsigned int c;
2507 int bits;
2508
2509 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002510 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002511
2512 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002513 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002514 }
Daniel Veillard48519092006-10-17 15:56:35 +00002515 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002516 if (c < 0x80)
2517 { *out++ = c; bits= -6; }
2518 else if (c < 0x800)
2519 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2520 else if (c < 0x10000)
2521 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002522 else
Owen Taylor3473f882001-02-23 17:55:21 +00002523 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002524
Owen Taylor3473f882001-02-23 17:55:21 +00002525 for ( ; bits >= 0; bits-= 6) {
2526 *out++ = ((c >> bits) & 0x3F) | 0x80;
2527 }
Owen Taylor3473f882001-02-23 17:55:21 +00002528 }
2529 }
2530 } else {
2531 unsigned int c;
2532 int bits, l;
2533
2534 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002535 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002536
2537 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002538 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002539 }
2540 c = CUR_CHAR(l);
2541 if (c < 0x80)
2542 { *out++ = c; bits= -6; }
2543 else if (c < 0x800)
2544 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2545 else if (c < 0x10000)
2546 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002547 else
Owen Taylor3473f882001-02-23 17:55:21 +00002548 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002549
Owen Taylor3473f882001-02-23 17:55:21 +00002550 for ( ; bits >= 0; bits-= 6) {
2551 *out++ = ((c >> bits) & 0x3F) | 0x80;
2552 }
2553 NEXT;
2554 }
2555 }
2556 *out++ = 0;
2557 return(buffer);
2558}
2559
2560/**
Owen Taylor3473f882001-02-23 17:55:21 +00002561 * htmlParseEntityRef:
2562 * @ctxt: an HTML parser context
2563 * @str: location to store the entity name
2564 *
2565 * parse an HTML ENTITY references
2566 *
2567 * [68] EntityRef ::= '&' Name ';'
2568 *
2569 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2570 * if non-NULL *str will have to be freed by the caller.
2571 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002572const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002573htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2574 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002575 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002576
2577 if (str != NULL) *str = NULL;
2578 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002579
2580 if (CUR == '&') {
2581 NEXT;
2582 name = htmlParseName(ctxt);
2583 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002584 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2585 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002586 } else {
2587 GROW;
2588 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002589 if (str != NULL)
2590 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002591
2592 /*
2593 * Lookup the entity in the table.
2594 */
2595 ent = htmlEntityLookup(name);
2596 if (ent != NULL) /* OK that's ugly !!! */
2597 NEXT;
2598 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002599 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2600 "htmlParseEntityRef: expecting ';'\n",
2601 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002602 if (str != NULL)
2603 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002604 }
2605 }
2606 }
2607 return(ent);
2608}
2609
2610/**
2611 * htmlParseAttValue:
2612 * @ctxt: an HTML parser context
2613 *
2614 * parse a value for an attribute
2615 * Note: the parser won't do substitution of entities here, this
2616 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002617 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002618 *
2619 * Returns the AttValue parsed or NULL.
2620 */
2621
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002622static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002623htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2624 xmlChar *ret = NULL;
2625
2626 if (CUR == '"') {
2627 NEXT;
2628 ret = htmlParseHTMLAttribute(ctxt, '"');
2629 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002630 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2631 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002632 } else
2633 NEXT;
2634 } else if (CUR == '\'') {
2635 NEXT;
2636 ret = htmlParseHTMLAttribute(ctxt, '\'');
2637 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002638 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2639 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002640 } else
2641 NEXT;
2642 } else {
2643 /*
2644 * That's an HTMLism, the attribute value may not be quoted
2645 */
2646 ret = htmlParseHTMLAttribute(ctxt, 0);
2647 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002648 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2649 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002650 }
2651 }
2652 return(ret);
2653}
2654
2655/**
2656 * htmlParseSystemLiteral:
2657 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002658 *
Owen Taylor3473f882001-02-23 17:55:21 +00002659 * parse an HTML Literal
2660 *
2661 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2662 *
2663 * Returns the SystemLiteral parsed or NULL
2664 */
2665
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002666static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002667htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2668 const xmlChar *q;
2669 xmlChar *ret = NULL;
2670
2671 if (CUR == '"') {
2672 NEXT;
2673 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002674 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002675 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002676 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002677 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2678 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002679 } else {
2680 ret = xmlStrndup(q, CUR_PTR - q);
2681 NEXT;
2682 }
2683 } else if (CUR == '\'') {
2684 NEXT;
2685 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002686 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002687 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002688 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002689 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2690 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002691 } else {
2692 ret = xmlStrndup(q, CUR_PTR - q);
2693 NEXT;
2694 }
2695 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002696 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2697 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002698 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002699
Owen Taylor3473f882001-02-23 17:55:21 +00002700 return(ret);
2701}
2702
2703/**
2704 * htmlParsePubidLiteral:
2705 * @ctxt: an HTML parser context
2706 *
2707 * parse an HTML public literal
2708 *
2709 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2710 *
2711 * Returns the PubidLiteral parsed or NULL.
2712 */
2713
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002714static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002715htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2716 const xmlChar *q;
2717 xmlChar *ret = NULL;
2718 /*
2719 * Name ::= (Letter | '_') (NameChar)*
2720 */
2721 if (CUR == '"') {
2722 NEXT;
2723 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002724 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002725 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002726 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2727 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002728 } else {
2729 ret = xmlStrndup(q, CUR_PTR - q);
2730 NEXT;
2731 }
2732 } else if (CUR == '\'') {
2733 NEXT;
2734 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002735 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002736 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002737 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002738 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2739 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002740 } else {
2741 ret = xmlStrndup(q, CUR_PTR - q);
2742 NEXT;
2743 }
2744 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002745 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2746 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002747 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002748
Owen Taylor3473f882001-02-23 17:55:21 +00002749 return(ret);
2750}
2751
2752/**
2753 * htmlParseScript:
2754 * @ctxt: an HTML parser context
2755 *
2756 * parse the content of an HTML SCRIPT or STYLE element
2757 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2758 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2759 * http://www.w3.org/TR/html4/types.html#type-script
2760 * http://www.w3.org/TR/html4/types.html#h-6.15
2761 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2762 *
2763 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2764 * element and the value of intrinsic event attributes. User agents must
2765 * not evaluate script data as HTML markup but instead must pass it on as
2766 * data to a script engine.
2767 * NOTES:
2768 * - The content is passed like CDATA
2769 * - the attributes for style and scripting "onXXX" are also described
2770 * as CDATA but SGML allows entities references in attributes so their
2771 * processing is identical as other attributes
2772 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002773static void
Owen Taylor3473f882001-02-23 17:55:21 +00002774htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002775 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002776 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002777 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002778
2779 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002780 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002781 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002782 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002783 /*
2784 * One should break here, the specification is clear:
2785 * Authors should therefore escape "</" within the content.
2786 * Escape mechanisms are specific to each scripting or
2787 * style sheet language.
2788 *
2789 * In recovery mode, only break if end tag match the
2790 * current tag, effectively ignoring all tags inside the
2791 * script/style block and treating the entire block as
2792 * CDATA.
2793 */
2794 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002795 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2796 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002797 {
2798 break; /* while */
2799 } else {
2800 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002801 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002802 ctxt->name, NULL);
2803 }
2804 } else {
2805 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002806 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002807 {
2808 break; /* while */
2809 }
2810 }
Owen Taylor3473f882001-02-23 17:55:21 +00002811 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002812 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002813 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2814 if (ctxt->sax->cdataBlock!= NULL) {
2815 /*
2816 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2817 */
2818 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002819 } else if (ctxt->sax->characters != NULL) {
2820 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002821 }
2822 nbchar = 0;
2823 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002824 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002825 NEXTL(l);
2826 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002827 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002828
Daniel Veillard68716a72006-10-16 09:32:17 +00002829 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002830 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2831 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002832 NEXT;
2833 }
2834
2835 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2836 if (ctxt->sax->cdataBlock!= NULL) {
2837 /*
2838 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2839 */
2840 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002841 } else if (ctxt->sax->characters != NULL) {
2842 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002843 }
2844 }
2845}
2846
2847
2848/**
2849 * htmlParseCharData:
2850 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002851 *
2852 * parse a CharData section.
2853 * if we are within a CDATA section ']]>' marks an end of section.
2854 *
2855 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2856 */
2857
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002858static void
2859htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002860 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2861 int nbchar = 0;
2862 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002863 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002864
2865 SHRINK;
2866 cur = CUR_CHAR(l);
2867 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002868 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002869 (cur != 0)) {
2870 if (!(IS_CHAR(cur))) {
2871 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2872 "Invalid char in CDATA 0x%X\n", cur);
2873 } else {
2874 COPY_BUF(l,buf,nbchar,cur);
2875 }
Owen Taylor3473f882001-02-23 17:55:21 +00002876 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2877 /*
2878 * Ok the segment is to be consumed as chars.
2879 */
2880 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2881 if (areBlanks(ctxt, buf, nbchar)) {
2882 if (ctxt->sax->ignorableWhitespace != NULL)
2883 ctxt->sax->ignorableWhitespace(ctxt->userData,
2884 buf, nbchar);
2885 } else {
2886 htmlCheckParagraph(ctxt);
2887 if (ctxt->sax->characters != NULL)
2888 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2889 }
2890 }
2891 nbchar = 0;
2892 }
2893 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002894 chunk++;
2895 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2896 chunk = 0;
2897 SHRINK;
2898 GROW;
2899 }
Owen Taylor3473f882001-02-23 17:55:21 +00002900 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002901 if (cur == 0) {
2902 SHRINK;
2903 GROW;
2904 cur = CUR_CHAR(l);
2905 }
Owen Taylor3473f882001-02-23 17:55:21 +00002906 }
2907 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002908 buf[nbchar] = 0;
2909
Owen Taylor3473f882001-02-23 17:55:21 +00002910 /*
2911 * Ok the segment is to be consumed as chars.
2912 */
2913 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2914 if (areBlanks(ctxt, buf, nbchar)) {
2915 if (ctxt->sax->ignorableWhitespace != NULL)
2916 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2917 } else {
2918 htmlCheckParagraph(ctxt);
2919 if (ctxt->sax->characters != NULL)
2920 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2921 }
2922 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002923 } else {
2924 /*
2925 * Loop detection
2926 */
2927 if (cur == 0)
2928 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002929 }
2930}
2931
2932/**
2933 * htmlParseExternalID:
2934 * @ctxt: an HTML parser context
2935 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002936 *
2937 * Parse an External ID or a Public ID
2938 *
Owen Taylor3473f882001-02-23 17:55:21 +00002939 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2940 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2941 *
2942 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2943 *
2944 * Returns the function returns SystemLiteral and in the second
2945 * case publicID receives PubidLiteral, is strict is off
2946 * it is possible to return NULL and have publicID set.
2947 */
2948
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002949static xmlChar *
2950htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002951 xmlChar *URI = NULL;
2952
2953 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2954 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2955 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2956 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002957 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002958 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2959 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002960 }
2961 SKIP_BLANKS;
2962 URI = htmlParseSystemLiteral(ctxt);
2963 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002964 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2965 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002966 }
2967 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2968 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2969 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2970 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002971 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002972 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2973 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002974 }
2975 SKIP_BLANKS;
2976 *publicID = htmlParsePubidLiteral(ctxt);
2977 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002978 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2979 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2980 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002981 }
2982 SKIP_BLANKS;
2983 if ((CUR == '"') || (CUR == '\'')) {
2984 URI = htmlParseSystemLiteral(ctxt);
2985 }
2986 }
2987 return(URI);
2988}
2989
2990/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002991 * xmlParsePI:
2992 * @ctxt: an XML parser context
2993 *
2994 * parse an XML Processing Instruction.
2995 *
2996 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2997 */
2998static void
2999htmlParsePI(htmlParserCtxtPtr ctxt) {
3000 xmlChar *buf = NULL;
3001 int len = 0;
3002 int size = HTML_PARSER_BUFFER_SIZE;
3003 int cur, l;
3004 const xmlChar *target;
3005 xmlParserInputState state;
3006 int count = 0;
3007
3008 if ((RAW == '<') && (NXT(1) == '?')) {
3009 state = ctxt->instate;
3010 ctxt->instate = XML_PARSER_PI;
3011 /*
3012 * this is a Processing Instruction.
3013 */
3014 SKIP(2);
3015 SHRINK;
3016
3017 /*
3018 * Parse the target name and check for special support like
3019 * namespace.
3020 */
3021 target = htmlParseName(ctxt);
3022 if (target != NULL) {
3023 if (RAW == '>') {
3024 SKIP(1);
3025
3026 /*
3027 * SAX: PI detected.
3028 */
3029 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3030 (ctxt->sax->processingInstruction != NULL))
3031 ctxt->sax->processingInstruction(ctxt->userData,
3032 target, NULL);
3033 ctxt->instate = state;
3034 return;
3035 }
3036 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3037 if (buf == NULL) {
3038 htmlErrMemory(ctxt, NULL);
3039 ctxt->instate = state;
3040 return;
3041 }
3042 cur = CUR;
3043 if (!IS_BLANK(cur)) {
3044 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3045 "ParsePI: PI %s space expected\n", target, NULL);
3046 }
3047 SKIP_BLANKS;
3048 cur = CUR_CHAR(l);
3049 while (IS_CHAR(cur) && (cur != '>')) {
3050 if (len + 5 >= size) {
3051 xmlChar *tmp;
3052
3053 size *= 2;
3054 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3055 if (tmp == NULL) {
3056 htmlErrMemory(ctxt, NULL);
3057 xmlFree(buf);
3058 ctxt->instate = state;
3059 return;
3060 }
3061 buf = tmp;
3062 }
3063 count++;
3064 if (count > 50) {
3065 GROW;
3066 count = 0;
3067 }
3068 COPY_BUF(l,buf,len,cur);
3069 NEXTL(l);
3070 cur = CUR_CHAR(l);
3071 if (cur == 0) {
3072 SHRINK;
3073 GROW;
3074 cur = CUR_CHAR(l);
3075 }
3076 }
3077 buf[len] = 0;
3078 if (cur != '>') {
3079 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3080 "ParsePI: PI %s never end ...\n", target, NULL);
3081 } else {
3082 SKIP(1);
3083
3084 /*
3085 * SAX: PI detected.
3086 */
3087 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3088 (ctxt->sax->processingInstruction != NULL))
3089 ctxt->sax->processingInstruction(ctxt->userData,
3090 target, buf);
3091 }
3092 xmlFree(buf);
3093 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003094 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003095 "PI is not started correctly", NULL, NULL);
3096 }
3097 ctxt->instate = state;
3098 }
3099}
3100
3101/**
Owen Taylor3473f882001-02-23 17:55:21 +00003102 * htmlParseComment:
3103 * @ctxt: an HTML parser context
3104 *
3105 * Parse an XML (SGML) comment <!-- .... -->
3106 *
3107 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3108 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003109static void
Owen Taylor3473f882001-02-23 17:55:21 +00003110htmlParseComment(htmlParserCtxtPtr ctxt) {
3111 xmlChar *buf = NULL;
3112 int len;
3113 int size = HTML_PARSER_BUFFER_SIZE;
3114 int q, ql;
3115 int r, rl;
3116 int cur, l;
3117 xmlParserInputState state;
3118
3119 /*
3120 * Check that there is a comment right here.
3121 */
3122 if ((RAW != '<') || (NXT(1) != '!') ||
3123 (NXT(2) != '-') || (NXT(3) != '-')) return;
3124
3125 state = ctxt->instate;
3126 ctxt->instate = XML_PARSER_COMMENT;
3127 SHRINK;
3128 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003129 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003130 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003131 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003132 ctxt->instate = state;
3133 return;
3134 }
3135 q = CUR_CHAR(ql);
3136 NEXTL(ql);
3137 r = CUR_CHAR(rl);
3138 NEXTL(rl);
3139 cur = CUR_CHAR(l);
3140 len = 0;
3141 while (IS_CHAR(cur) &&
3142 ((cur != '>') ||
3143 (r != '-') || (q != '-'))) {
3144 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003145 xmlChar *tmp;
3146
Owen Taylor3473f882001-02-23 17:55:21 +00003147 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003148 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3149 if (tmp == NULL) {
3150 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003151 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003152 ctxt->instate = state;
3153 return;
3154 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003155 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003156 }
3157 COPY_BUF(ql,buf,len,q);
3158 q = r;
3159 ql = rl;
3160 r = cur;
3161 rl = l;
3162 NEXTL(l);
3163 cur = CUR_CHAR(l);
3164 if (cur == 0) {
3165 SHRINK;
3166 GROW;
3167 cur = CUR_CHAR(l);
3168 }
3169 }
3170 buf[len] = 0;
3171 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003172 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3173 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003174 xmlFree(buf);
3175 } else {
3176 NEXT;
3177 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3178 (!ctxt->disableSAX))
3179 ctxt->sax->comment(ctxt->userData, buf);
3180 xmlFree(buf);
3181 }
3182 ctxt->instate = state;
3183}
3184
3185/**
3186 * htmlParseCharRef:
3187 * @ctxt: an HTML parser context
3188 *
3189 * parse Reference declarations
3190 *
3191 * [66] CharRef ::= '&#' [0-9]+ ';' |
3192 * '&#x' [0-9a-fA-F]+ ';'
3193 *
3194 * Returns the value parsed (as an int)
3195 */
3196int
3197htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3198 int val = 0;
3199
Daniel Veillarda03e3652004-11-02 18:45:30 +00003200 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3201 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3202 "htmlParseCharRef: context error\n",
3203 NULL, NULL);
3204 return(0);
3205 }
Owen Taylor3473f882001-02-23 17:55:21 +00003206 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003207 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003208 SKIP(3);
3209 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003210 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003211 val = val * 16 + (CUR - '0');
3212 else if ((CUR >= 'a') && (CUR <= 'f'))
3213 val = val * 16 + (CUR - 'a') + 10;
3214 else if ((CUR >= 'A') && (CUR <= 'F'))
3215 val = val * 16 + (CUR - 'A') + 10;
3216 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003217 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003218 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003219 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003220 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003221 }
3222 NEXT;
3223 }
3224 if (CUR == ';')
3225 NEXT;
3226 } else if ((CUR == '&') && (NXT(1) == '#')) {
3227 SKIP(2);
3228 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003229 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003230 val = val * 10 + (CUR - '0');
3231 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003232 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003233 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003234 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003235 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003236 }
3237 NEXT;
3238 }
3239 if (CUR == ';')
3240 NEXT;
3241 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003242 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3243 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003244 }
3245 /*
3246 * Check the value IS_CHAR ...
3247 */
3248 if (IS_CHAR(val)) {
3249 return(val);
3250 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003251 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3252 "htmlParseCharRef: invalid xmlChar value %d\n",
3253 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003254 }
3255 return(0);
3256}
3257
3258
3259/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003260 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003261 * @ctxt: an HTML parser context
3262 *
3263 * parse a DOCTYPE declaration
3264 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003265 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003266 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3267 */
3268
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003269static void
Owen Taylor3473f882001-02-23 17:55:21 +00003270htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003271 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003272 xmlChar *ExternalID = NULL;
3273 xmlChar *URI = NULL;
3274
3275 /*
3276 * We know that '<!DOCTYPE' has been detected.
3277 */
3278 SKIP(9);
3279
3280 SKIP_BLANKS;
3281
3282 /*
3283 * Parse the DOCTYPE name.
3284 */
3285 name = htmlParseName(ctxt);
3286 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003287 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3288 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3289 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003290 }
3291 /*
3292 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3293 */
3294
3295 SKIP_BLANKS;
3296
3297 /*
3298 * Check for SystemID and ExternalID
3299 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003300 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003301 SKIP_BLANKS;
3302
3303 /*
3304 * We should be at the end of the DOCTYPE declaration.
3305 */
3306 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003307 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3308 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003309 /* We shouldn't try to resynchronize ... */
3310 }
3311 NEXT;
3312
3313 /*
3314 * Create or update the document accordingly to the DOCTYPE
3315 */
3316 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3317 (!ctxt->disableSAX))
3318 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3319
3320 /*
3321 * Cleanup, since we don't use all those identifiers
3322 */
3323 if (URI != NULL) xmlFree(URI);
3324 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003325}
3326
3327/**
3328 * htmlParseAttribute:
3329 * @ctxt: an HTML parser context
3330 * @value: a xmlChar ** used to store the value of the attribute
3331 *
3332 * parse an attribute
3333 *
3334 * [41] Attribute ::= Name Eq AttValue
3335 *
3336 * [25] Eq ::= S? '=' S?
3337 *
3338 * With namespace:
3339 *
3340 * [NS 11] Attribute ::= QName Eq AttValue
3341 *
3342 * Also the case QName == xmlns:??? is handled independently as a namespace
3343 * definition.
3344 *
3345 * Returns the attribute name, and the value in *value.
3346 */
3347
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003348static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003349htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003350 const xmlChar *name;
3351 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003352
3353 *value = NULL;
3354 name = htmlParseHTMLName(ctxt);
3355 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003356 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3357 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003358 return(NULL);
3359 }
3360
3361 /*
3362 * read the value
3363 */
3364 SKIP_BLANKS;
3365 if (CUR == '=') {
3366 NEXT;
3367 SKIP_BLANKS;
3368 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003369 } else if (htmlIsBooleanAttr(name)) {
3370 /*
3371 * assume a minimized attribute
3372 */
3373 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003374 }
3375
3376 *value = val;
3377 return(name);
3378}
3379
3380/**
3381 * htmlCheckEncoding:
3382 * @ctxt: an HTML parser context
3383 * @attvalue: the attribute value
3384 *
3385 * Checks an http-equiv attribute from a Meta tag to detect
3386 * the encoding
3387 * If a new encoding is detected the parser is switched to decode
3388 * it and pass UTF8
3389 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003390static void
Owen Taylor3473f882001-02-23 17:55:21 +00003391htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3392 const xmlChar *encoding;
3393
3394 if ((ctxt == NULL) || (attvalue == NULL))
3395 return;
3396
Daniel Veillarde77db162009-08-22 11:32:38 +02003397 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003398 if (ctxt->input->encoding != NULL)
3399 return;
3400
3401 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3402 if (encoding != NULL) {
3403 encoding += 8;
3404 } else {
3405 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3406 if (encoding != NULL)
3407 encoding += 9;
3408 }
3409 if (encoding != NULL) {
3410 xmlCharEncoding enc;
3411 xmlCharEncodingHandlerPtr handler;
3412
3413 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3414
3415 if (ctxt->input->encoding != NULL)
3416 xmlFree((xmlChar *) ctxt->input->encoding);
3417 ctxt->input->encoding = xmlStrdup(encoding);
3418
3419 enc = xmlParseCharEncoding((const char *) encoding);
3420 /*
3421 * registered set of known encodings
3422 */
3423 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003424 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003425 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3426 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3427 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3428 (ctxt->input->buf != NULL) &&
3429 (ctxt->input->buf->encoder == NULL)) {
3430 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3431 "htmlCheckEncoding: wrong encoding meta\n",
3432 NULL, NULL);
3433 } else {
3434 xmlSwitchEncoding(ctxt, enc);
3435 }
Owen Taylor3473f882001-02-23 17:55:21 +00003436 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3437 } else {
3438 /*
3439 * fallback for unknown encodings
3440 */
3441 handler = xmlFindCharEncodingHandler((const char *) encoding);
3442 if (handler != NULL) {
3443 xmlSwitchToEncoding(ctxt, handler);
3444 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3445 } else {
3446 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3447 }
3448 }
3449
3450 if ((ctxt->input->buf != NULL) &&
3451 (ctxt->input->buf->encoder != NULL) &&
3452 (ctxt->input->buf->raw != NULL) &&
3453 (ctxt->input->buf->buffer != NULL)) {
3454 int nbchars;
3455 int processed;
3456
3457 /*
3458 * convert as much as possible to the parser reading buffer.
3459 */
3460 processed = ctxt->input->cur - ctxt->input->base;
3461 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3462 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3463 ctxt->input->buf->buffer,
3464 ctxt->input->buf->raw);
3465 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003466 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3467 "htmlCheckEncoding: encoder error\n",
3468 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003469 }
3470 ctxt->input->base =
3471 ctxt->input->cur = ctxt->input->buf->buffer->content;
3472 }
3473 }
3474}
3475
3476/**
3477 * htmlCheckMeta:
3478 * @ctxt: an HTML parser context
3479 * @atts: the attributes values
3480 *
3481 * Checks an attributes from a Meta tag
3482 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003483static void
Owen Taylor3473f882001-02-23 17:55:21 +00003484htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3485 int i;
3486 const xmlChar *att, *value;
3487 int http = 0;
3488 const xmlChar *content = NULL;
3489
3490 if ((ctxt == NULL) || (atts == NULL))
3491 return;
3492
3493 i = 0;
3494 att = atts[i++];
3495 while (att != NULL) {
3496 value = atts[i++];
3497 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3498 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3499 http = 1;
3500 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3501 content = value;
3502 att = atts[i++];
3503 }
3504 if ((http) && (content != NULL))
3505 htmlCheckEncoding(ctxt, content);
3506
3507}
3508
3509/**
3510 * htmlParseStartTag:
3511 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003512 *
Owen Taylor3473f882001-02-23 17:55:21 +00003513 * parse a start of tag either for rule element or
3514 * EmptyElement. In both case we don't parse the tag closing chars.
3515 *
3516 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3517 *
3518 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3519 *
3520 * With namespace:
3521 *
3522 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3523 *
3524 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3525 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003526 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003527 */
3528
Daniel Veillard597f1c12005-07-03 23:00:18 +00003529static int
Owen Taylor3473f882001-02-23 17:55:21 +00003530htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003531 const xmlChar *name;
3532 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003533 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003534 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003535 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003536 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003537 int meta = 0;
3538 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003539 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003540
Daniel Veillarde77db162009-08-22 11:32:38 +02003541 if (ctxt->instate == XML_PARSER_EOF)
3542 return(-1);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003543 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3544 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3545 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003546 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003547 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003548 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003549 NEXT;
3550
Daniel Veillard30e76072006-03-09 14:13:55 +00003551 atts = ctxt->atts;
3552 maxatts = ctxt->maxatts;
3553
Owen Taylor3473f882001-02-23 17:55:21 +00003554 GROW;
3555 name = htmlParseHTMLName(ctxt);
3556 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003557 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3558 "htmlParseStartTag: invalid element name\n",
3559 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003560 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003561 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3562 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003563 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003564 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003565 }
3566 if (xmlStrEqual(name, BAD_CAST"meta"))
3567 meta = 1;
3568
3569 /*
3570 * Check for auto-closure of HTML elements.
3571 */
3572 htmlAutoClose(ctxt, name);
3573
3574 /*
3575 * Check for implied HTML elements.
3576 */
3577 htmlCheckImplied(ctxt, name);
3578
3579 /*
3580 * Avoid html at any level > 0, head at any level != 1
3581 * or any attempt to recurse body
3582 */
3583 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003584 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3585 "htmlParseStartTag: misplaced <html> tag\n",
3586 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003587 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003588 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003589 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003590 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003591 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003592 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3593 "htmlParseStartTag: misplaced <head> tag\n",
3594 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003595 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003596 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003597 }
3598 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003599 int indx;
3600 for (indx = 0;indx < ctxt->nameNr;indx++) {
3601 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003602 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3603 "htmlParseStartTag: misplaced <body> tag\n",
3604 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003605 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003606 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003607 }
3608 }
3609 }
3610
3611 /*
3612 * Now parse the attributes, it ends up with the ending
3613 *
3614 * (S Attribute)* S?
3615 */
3616 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003617 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003618 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003619 ((CUR != '/') || (NXT(1) != '>'))) {
3620 long cons = ctxt->nbChars;
3621
3622 GROW;
3623 attname = htmlParseAttribute(ctxt, &attvalue);
3624 if (attname != NULL) {
3625
3626 /*
3627 * Well formedness requires at most one declaration of an attribute
3628 */
3629 for (i = 0; i < nbatts;i += 2) {
3630 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003631 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3632 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003633 if (attvalue != NULL)
3634 xmlFree(attvalue);
3635 goto failed;
3636 }
3637 }
3638
3639 /*
3640 * Add the pair to atts
3641 */
3642 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003643 maxatts = 22; /* allow for 10 attrs by default */
3644 atts = (const xmlChar **)
3645 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003646 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003647 htmlErrMemory(ctxt, NULL);
3648 if (attvalue != NULL)
3649 xmlFree(attvalue);
3650 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003651 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003652 ctxt->atts = atts;
3653 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003654 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003655 const xmlChar **n;
3656
Owen Taylor3473f882001-02-23 17:55:21 +00003657 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003658 n = (const xmlChar **) xmlRealloc((void *) atts,
3659 maxatts * sizeof(const xmlChar *));
3660 if (n == NULL) {
3661 htmlErrMemory(ctxt, NULL);
3662 if (attvalue != NULL)
3663 xmlFree(attvalue);
3664 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003665 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003666 atts = n;
3667 ctxt->atts = atts;
3668 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003669 }
3670 atts[nbatts++] = attname;
3671 atts[nbatts++] = attvalue;
3672 atts[nbatts] = NULL;
3673 atts[nbatts + 1] = NULL;
3674 }
3675 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003676 if (attvalue != NULL)
3677 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003678 /* Dump the bogus attribute string up to the next blank or
3679 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003680 while ((IS_CHAR_CH(CUR)) &&
3681 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003682 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003683 NEXT;
3684 }
3685
3686failed:
3687 SKIP_BLANKS;
3688 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003689 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3690 "htmlParseStartTag: problem parsing attributes\n",
3691 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003692 break;
3693 }
3694 }
3695
3696 /*
3697 * Handle specific association to the META tag
3698 */
William M. Bracke978ae22007-03-21 06:16:02 +00003699 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003700 htmlCheckMeta(ctxt, atts);
3701
3702 /*
3703 * SAX: Start of Element !
3704 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003705 if (!discardtag) {
3706 htmlnamePush(ctxt, name);
3707 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3708 if (nbatts != 0)
3709 ctxt->sax->startElement(ctxt->userData, name, atts);
3710 else
3711 ctxt->sax->startElement(ctxt->userData, name, NULL);
3712 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003713 }
Owen Taylor3473f882001-02-23 17:55:21 +00003714
3715 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003716 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003717 if (atts[i] != NULL)
3718 xmlFree((xmlChar *) atts[i]);
3719 }
Owen Taylor3473f882001-02-23 17:55:21 +00003720 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003721
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003722 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003723}
3724
3725/**
3726 * htmlParseEndTag:
3727 * @ctxt: an HTML parser context
3728 *
3729 * parse an end of tag
3730 *
3731 * [42] ETag ::= '</' Name S? '>'
3732 *
3733 * With namespace
3734 *
3735 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003736 *
3737 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003738 */
3739
Daniel Veillardf420ac52001-07-04 16:04:09 +00003740static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003741htmlParseEndTag(htmlParserCtxtPtr ctxt)
3742{
3743 const xmlChar *name;
3744 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003745 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003746
3747 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003748 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3749 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003750 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003751 }
3752 SKIP(2);
3753
3754 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003755 if (name == NULL)
3756 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003757 /*
3758 * We should definitely be at the ending "S? '>'" part
3759 */
3760 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003761 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003762 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3763 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003764 if (ctxt->recovery) {
3765 /*
3766 * We're not at the ending > !!
3767 * Error, unless in recover mode where we search forwards
3768 * until we find a >
3769 */
3770 while (CUR != '\0' && CUR != '>') NEXT;
3771 NEXT;
3772 }
Owen Taylor3473f882001-02-23 17:55:21 +00003773 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003774 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003775
3776 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003777 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3778 * out now.
3779 */
3780 if ((ctxt->depth > 0) &&
3781 (xmlStrEqual(name, BAD_CAST "html") ||
3782 xmlStrEqual(name, BAD_CAST "body") ||
3783 xmlStrEqual(name, BAD_CAST "head"))) {
3784 ctxt->depth--;
3785 return (0);
3786 }
3787
3788 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003789 * If the name read is not one of the element in the parsing stack
3790 * then return, it's just an error.
3791 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003792 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3793 if (xmlStrEqual(name, ctxt->nameTab[i]))
3794 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003795 }
3796 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003797 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3798 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003799 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003800 }
3801
3802
3803 /*
3804 * Check for auto-closure of HTML elements.
3805 */
3806
3807 htmlAutoCloseOnClose(ctxt, name);
3808
3809 /*
3810 * Well formedness constraints, opening and closing must match.
3811 * With the exception that the autoclose may have popped stuff out
3812 * of the stack.
3813 */
3814 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003815 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003816 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3817 "Opening and ending tag mismatch: %s and %s\n",
3818 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003819 }
3820 }
3821
3822 /*
3823 * SAX: End of Tag
3824 */
3825 oldname = ctxt->name;
3826 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003827 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3828 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003829 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003830 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003831 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003832 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003833 }
3834
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003835 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003836}
3837
3838
3839/**
3840 * htmlParseReference:
3841 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003842 *
Owen Taylor3473f882001-02-23 17:55:21 +00003843 * parse and handle entity references in content,
3844 * this will end-up in a call to character() since this is either a
3845 * CharRef, or a predefined entity.
3846 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003847static void
Owen Taylor3473f882001-02-23 17:55:21 +00003848htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003849 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003850 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003851 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003852 if (CUR != '&') return;
3853
3854 if (NXT(1) == '#') {
3855 unsigned int c;
3856 int bits, i = 0;
3857
3858 c = htmlParseCharRef(ctxt);
3859 if (c == 0)
3860 return;
3861
3862 if (c < 0x80) { out[i++]= c; bits= -6; }
3863 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3864 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3865 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003866
Owen Taylor3473f882001-02-23 17:55:21 +00003867 for ( ; bits >= 0; bits-= 6) {
3868 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3869 }
3870 out[i] = 0;
3871
3872 htmlCheckParagraph(ctxt);
3873 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3874 ctxt->sax->characters(ctxt->userData, out, i);
3875 } else {
3876 ent = htmlParseEntityRef(ctxt, &name);
3877 if (name == NULL) {
3878 htmlCheckParagraph(ctxt);
3879 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3880 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3881 return;
3882 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003883 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003884 htmlCheckParagraph(ctxt);
3885 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3886 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3887 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3888 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3889 }
3890 } else {
3891 unsigned int c;
3892 int bits, i = 0;
3893
3894 c = ent->value;
3895 if (c < 0x80)
3896 { out[i++]= c; bits= -6; }
3897 else if (c < 0x800)
3898 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3899 else if (c < 0x10000)
3900 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003901 else
Owen Taylor3473f882001-02-23 17:55:21 +00003902 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003903
Owen Taylor3473f882001-02-23 17:55:21 +00003904 for ( ; bits >= 0; bits-= 6) {
3905 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3906 }
3907 out[i] = 0;
3908
3909 htmlCheckParagraph(ctxt);
3910 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3911 ctxt->sax->characters(ctxt->userData, out, i);
3912 }
Owen Taylor3473f882001-02-23 17:55:21 +00003913 }
3914}
3915
3916/**
3917 * htmlParseContent:
3918 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003919 *
3920 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003921 */
3922
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003923static void
Owen Taylor3473f882001-02-23 17:55:21 +00003924htmlParseContent(htmlParserCtxtPtr ctxt) {
3925 xmlChar *currentNode;
3926 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003927 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003928
3929 currentNode = xmlStrdup(ctxt->name);
3930 depth = ctxt->nameNr;
3931 while (1) {
3932 long cons = ctxt->nbChars;
3933
3934 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02003935
3936 if (ctxt->instate == XML_PARSER_EOF)
3937 break;
3938
Owen Taylor3473f882001-02-23 17:55:21 +00003939 /*
3940 * Our tag or one of it's parent or children is ending.
3941 */
3942 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003943 if (htmlParseEndTag(ctxt) &&
3944 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3945 if (currentNode != NULL)
3946 xmlFree(currentNode);
3947 return;
3948 }
3949 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003950 }
3951
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003952 else if ((CUR == '<') &&
3953 ((IS_ASCII_LETTER(NXT(1))) ||
3954 (NXT(1) == '_') || (NXT(1) == ':'))) {
3955 name = htmlParseHTMLName_nonInvasive(ctxt);
3956 if (name == NULL) {
3957 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3958 "htmlParseStartTag: invalid element name\n",
3959 NULL, NULL);
3960 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003961 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003962 NEXT;
3963
3964 if (currentNode != NULL)
3965 xmlFree(currentNode);
3966 return;
3967 }
3968
3969 if (ctxt->name != NULL) {
3970 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3971 htmlAutoClose(ctxt, name);
3972 continue;
3973 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003974 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003975 }
3976
Owen Taylor3473f882001-02-23 17:55:21 +00003977 /*
3978 * Has this node been popped out during parsing of
3979 * the next element
3980 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003981 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3982 (!xmlStrEqual(currentNode, ctxt->name)))
3983 {
Owen Taylor3473f882001-02-23 17:55:21 +00003984 if (currentNode != NULL) xmlFree(currentNode);
3985 return;
3986 }
3987
Daniel Veillardf9533d12001-03-03 10:04:57 +00003988 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3989 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003990 /*
3991 * Handle SCRIPT/STYLE separately
3992 */
3993 htmlParseScript(ctxt);
3994 } else {
3995 /*
3996 * Sometimes DOCTYPE arrives in the middle of the document
3997 */
3998 if ((CUR == '<') && (NXT(1) == '!') &&
3999 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4000 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4001 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4002 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004003 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4004 "Misplaced DOCTYPE declaration\n",
4005 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004006 htmlParseDocTypeDecl(ctxt);
4007 }
4008
4009 /*
4010 * First case : a comment
4011 */
4012 if ((CUR == '<') && (NXT(1) == '!') &&
4013 (NXT(2) == '-') && (NXT(3) == '-')) {
4014 htmlParseComment(ctxt);
4015 }
4016
4017 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004018 * Second case : a Processing Instruction.
4019 */
4020 else if ((CUR == '<') && (NXT(1) == '?')) {
4021 htmlParsePI(ctxt);
4022 }
4023
4024 /*
4025 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004026 */
4027 else if (CUR == '<') {
4028 htmlParseElement(ctxt);
4029 }
4030
4031 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004032 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004033 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004034 */
4035 else if (CUR == '&') {
4036 htmlParseReference(ctxt);
4037 }
4038
4039 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004040 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004041 */
4042 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004043 htmlAutoCloseOnEnd(ctxt);
4044 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004045 }
4046
4047 /*
4048 * Last case, text. Note that References are handled directly.
4049 */
4050 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004051 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004052 }
4053
4054 if (cons == ctxt->nbChars) {
4055 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004056 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4057 "detected an error in element content\n",
4058 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004059 }
4060 break;
4061 }
4062 }
4063 GROW;
4064 }
4065 if (currentNode != NULL) xmlFree(currentNode);
4066}
4067
4068/**
Daniel Veillard499cc922006-01-18 17:22:35 +00004069 * htmlParseContent:
4070 * @ctxt: an HTML parser context
4071 *
4072 * Parse a content: comment, sub-element, reference or text.
4073 */
4074
4075void
4076__htmlParseContent(void *ctxt) {
4077 if (ctxt != NULL)
4078 htmlParseContent((htmlParserCtxtPtr) ctxt);
4079}
4080
4081/**
Owen Taylor3473f882001-02-23 17:55:21 +00004082 * htmlParseElement:
4083 * @ctxt: an HTML parser context
4084 *
4085 * parse an HTML element, this is highly recursive
4086 *
4087 * [39] element ::= EmptyElemTag | STag content ETag
4088 *
4089 * [41] Attribute ::= Name Eq AttValue
4090 */
4091
4092void
4093htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004094 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004095 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004096 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004097 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004098 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004099 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004100 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004101
Daniel Veillarda03e3652004-11-02 18:45:30 +00004102 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4103 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004104 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004105 return;
4106 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004107
4108 if (ctxt->instate == XML_PARSER_EOF)
4109 return;
4110
Owen Taylor3473f882001-02-23 17:55:21 +00004111 /* Capture start position */
4112 if (ctxt->record_info) {
4113 node_info.begin_pos = ctxt->input->consumed +
4114 (CUR_PTR - ctxt->input->base);
4115 node_info.begin_line = ctxt->input->line;
4116 }
4117
Daniel Veillard597f1c12005-07-03 23:00:18 +00004118 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004119 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004120 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004121 if (CUR == '>')
4122 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004123 return;
4124 }
Owen Taylor3473f882001-02-23 17:55:21 +00004125
4126 /*
4127 * Lookup the info for that element.
4128 */
4129 info = htmlTagLookup(name);
4130 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004131 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4132 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004133 }
4134
4135 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004136 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004137 */
4138 if ((CUR == '/') && (NXT(1) == '>')) {
4139 SKIP(2);
4140 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4141 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004142 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004143 return;
4144 }
4145
4146 if (CUR == '>') {
4147 NEXT;
4148 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004149 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4150 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004151
4152 /*
4153 * end of parsing of this node.
4154 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004155 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004156 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004157 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004158 }
Owen Taylor3473f882001-02-23 17:55:21 +00004159
4160 /*
4161 * Capture end position and add node
4162 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004163 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004164 node_info.end_pos = ctxt->input->consumed +
4165 (CUR_PTR - ctxt->input->base);
4166 node_info.end_line = ctxt->input->line;
4167 node_info.node = ctxt->node;
4168 xmlParserAddNodeInfo(ctxt, &node_info);
4169 }
4170 return;
4171 }
4172
4173 /*
4174 * Check for an Empty Element from DTD definition
4175 */
4176 if ((info != NULL) && (info->empty)) {
4177 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4178 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004179 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004180 return;
4181 }
4182
4183 /*
4184 * Parse the content of the element:
4185 */
4186 currentNode = xmlStrdup(ctxt->name);
4187 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004188 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004189 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004190 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004191 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004192 if (ctxt->nameNr < depth) break;
4193 }
Owen Taylor3473f882001-02-23 17:55:21 +00004194
Owen Taylor3473f882001-02-23 17:55:21 +00004195 /*
4196 * Capture end position and add node
4197 */
4198 if ( currentNode != NULL && ctxt->record_info ) {
4199 node_info.end_pos = ctxt->input->consumed +
4200 (CUR_PTR - ctxt->input->base);
4201 node_info.end_line = ctxt->input->line;
4202 node_info.node = ctxt->node;
4203 xmlParserAddNodeInfo(ctxt, &node_info);
4204 }
William M. Brack76e95df2003-10-18 16:20:14 +00004205 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004206 htmlAutoCloseOnEnd(ctxt);
4207 }
4208
Owen Taylor3473f882001-02-23 17:55:21 +00004209 if (currentNode != NULL)
4210 xmlFree(currentNode);
4211}
4212
4213/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004214 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004215 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004216 *
Owen Taylor3473f882001-02-23 17:55:21 +00004217 * parse an HTML document (and build a tree if using the standard SAX
4218 * interface).
4219 *
4220 * Returns 0, -1 in case of error. the parser context is augmented
4221 * as a result of the parsing.
4222 */
4223
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004224int
Owen Taylor3473f882001-02-23 17:55:21 +00004225htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004226 xmlChar start[4];
4227 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004228 xmlDtdPtr dtd;
4229
Daniel Veillardd0463562001-10-13 09:15:48 +00004230 xmlInitParser();
4231
Owen Taylor3473f882001-02-23 17:55:21 +00004232 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004233
Daniel Veillarda03e3652004-11-02 18:45:30 +00004234 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4235 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4236 "htmlParseDocument: context error\n", NULL, NULL);
4237 return(XML_ERR_INTERNAL_ERROR);
4238 }
4239 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004240 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004241 GROW;
4242 /*
4243 * SAX: beginning of the document processing.
4244 */
4245 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4246 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4247
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004248 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4249 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4250 /*
4251 * Get the 4 first bytes and decode the charset
4252 * if enc != XML_CHAR_ENCODING_NONE
4253 * plug some encoding conversion routines.
4254 */
4255 start[0] = RAW;
4256 start[1] = NXT(1);
4257 start[2] = NXT(2);
4258 start[3] = NXT(3);
4259 enc = xmlDetectCharEncoding(&start[0], 4);
4260 if (enc != XML_CHAR_ENCODING_NONE) {
4261 xmlSwitchEncoding(ctxt, enc);
4262 }
4263 }
4264
Owen Taylor3473f882001-02-23 17:55:21 +00004265 /*
4266 * Wipe out everything which is before the first '<'
4267 */
4268 SKIP_BLANKS;
4269 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004270 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004271 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004272 }
4273
4274 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4275 ctxt->sax->startDocument(ctxt->userData);
4276
4277
4278 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004279 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004280 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004281 while (((CUR == '<') && (NXT(1) == '!') &&
4282 (NXT(2) == '-') && (NXT(3) == '-')) ||
4283 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004284 htmlParseComment(ctxt);
4285 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004286 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004287 }
Owen Taylor3473f882001-02-23 17:55:21 +00004288
4289
4290 /*
4291 * Then possibly doc type declaration(s) and more Misc
4292 * (doctypedecl Misc*)?
4293 */
4294 if ((CUR == '<') && (NXT(1) == '!') &&
4295 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4296 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4297 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4298 (UPP(8) == 'E')) {
4299 htmlParseDocTypeDecl(ctxt);
4300 }
4301 SKIP_BLANKS;
4302
4303 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004304 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004305 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004306 while (((CUR == '<') && (NXT(1) == '!') &&
4307 (NXT(2) == '-') && (NXT(3) == '-')) ||
4308 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004309 htmlParseComment(ctxt);
4310 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004311 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004312 }
Owen Taylor3473f882001-02-23 17:55:21 +00004313
4314 /*
4315 * Time to start parsing the tree itself
4316 */
4317 htmlParseContent(ctxt);
4318
4319 /*
4320 * autoclose
4321 */
4322 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004323 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004324
4325
4326 /*
4327 * SAX: end of the document processing.
4328 */
4329 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4330 ctxt->sax->endDocument(ctxt->userData);
4331
4332 if (ctxt->myDoc != NULL) {
4333 dtd = xmlGetIntSubset(ctxt->myDoc);
4334 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004335 ctxt->myDoc->intSubset =
4336 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004337 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4338 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4339 }
4340 if (! ctxt->wellFormed) return(-1);
4341 return(0);
4342}
4343
4344
4345/************************************************************************
4346 * *
4347 * Parser contexts handling *
4348 * *
4349 ************************************************************************/
4350
4351/**
William M. Brackedb65a72004-02-06 07:36:04 +00004352 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004353 * @ctxt: an HTML parser context
4354 *
4355 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004356 *
4357 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004358 */
4359
Daniel Veillardf403d292003-10-05 13:51:35 +00004360static int
Owen Taylor3473f882001-02-23 17:55:21 +00004361htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4362{
4363 htmlSAXHandler *sax;
4364
Daniel Veillardf403d292003-10-05 13:51:35 +00004365 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004366 memset(ctxt, 0, sizeof(htmlParserCtxt));
4367
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004368 ctxt->dict = xmlDictCreate();
4369 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004370 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4371 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004372 }
Owen Taylor3473f882001-02-23 17:55:21 +00004373 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4374 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004375 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4376 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004377 }
4378 else
4379 memset(sax, 0, sizeof(htmlSAXHandler));
4380
4381 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004382 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004383 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4384 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004385 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004386 ctxt->inputNr = 0;
4387 ctxt->inputMax = 0;
4388 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004389 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004390 }
4391 ctxt->inputNr = 0;
4392 ctxt->inputMax = 5;
4393 ctxt->input = NULL;
4394 ctxt->version = NULL;
4395 ctxt->encoding = NULL;
4396 ctxt->standalone = -1;
4397 ctxt->instate = XML_PARSER_START;
4398
4399 /* Allocate the Node stack */
4400 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4401 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004402 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004403 ctxt->nodeNr = 0;
4404 ctxt->nodeMax = 0;
4405 ctxt->node = NULL;
4406 ctxt->inputNr = 0;
4407 ctxt->inputMax = 0;
4408 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004409 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004410 }
4411 ctxt->nodeNr = 0;
4412 ctxt->nodeMax = 10;
4413 ctxt->node = NULL;
4414
4415 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004416 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004417 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004418 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004419 ctxt->nameNr = 0;
4420 ctxt->nameMax = 10;
4421 ctxt->name = NULL;
4422 ctxt->nodeNr = 0;
4423 ctxt->nodeMax = 0;
4424 ctxt->node = NULL;
4425 ctxt->inputNr = 0;
4426 ctxt->inputMax = 0;
4427 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004428 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004429 }
4430 ctxt->nameNr = 0;
4431 ctxt->nameMax = 10;
4432 ctxt->name = NULL;
4433
Daniel Veillard092643b2003-09-25 14:29:29 +00004434 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004435 else {
4436 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004437 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004438 }
4439 ctxt->userData = ctxt;
4440 ctxt->myDoc = NULL;
4441 ctxt->wellFormed = 1;
4442 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004443 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004444 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004445 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004446 ctxt->vctxt.userData = ctxt;
4447 ctxt->vctxt.error = xmlParserValidityError;
4448 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004449 ctxt->record_info = 0;
4450 ctxt->validate = 0;
4451 ctxt->nbChars = 0;
4452 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004453 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004454 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004455 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004456}
4457
4458/**
4459 * htmlFreeParserCtxt:
4460 * @ctxt: an HTML parser context
4461 *
4462 * Free all the memory used by a parser context. However the parsed
4463 * document in ctxt->myDoc is not freed.
4464 */
4465
4466void
4467htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4468{
4469 xmlFreeParserCtxt(ctxt);
4470}
4471
4472/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004473 * htmlNewParserCtxt:
4474 *
4475 * Allocate and initialize a new parser context.
4476 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004477 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004478 */
4479
Daniel Veillard34c647c2006-09-21 06:53:59 +00004480htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004481htmlNewParserCtxt(void)
4482{
4483 xmlParserCtxtPtr ctxt;
4484
4485 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4486 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004487 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004488 return(NULL);
4489 }
4490 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004491 if (htmlInitParserCtxt(ctxt) < 0) {
4492 htmlFreeParserCtxt(ctxt);
4493 return(NULL);
4494 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004495 return(ctxt);
4496}
4497
4498/**
4499 * htmlCreateMemoryParserCtxt:
4500 * @buffer: a pointer to a char array
4501 * @size: the size of the array
4502 *
4503 * Create a parser context for an HTML in-memory document.
4504 *
4505 * Returns the new parser context or NULL
4506 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004507htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004508htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4509 xmlParserCtxtPtr ctxt;
4510 xmlParserInputPtr input;
4511 xmlParserInputBufferPtr buf;
4512
4513 if (buffer == NULL)
4514 return(NULL);
4515 if (size <= 0)
4516 return(NULL);
4517
4518 ctxt = htmlNewParserCtxt();
4519 if (ctxt == NULL)
4520 return(NULL);
4521
4522 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4523 if (buf == NULL) return(NULL);
4524
4525 input = xmlNewInputStream(ctxt);
4526 if (input == NULL) {
4527 xmlFreeParserCtxt(ctxt);
4528 return(NULL);
4529 }
4530
4531 input->filename = NULL;
4532 input->buf = buf;
4533 input->base = input->buf->buffer->content;
4534 input->cur = input->buf->buffer->content;
4535 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4536
4537 inputPush(ctxt, input);
4538 return(ctxt);
4539}
4540
4541/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004542 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004543 * @cur: a pointer to an array of xmlChar
4544 * @encoding: a free form C string describing the HTML document encoding, or NULL
4545 *
4546 * Create a parser context for an HTML document.
4547 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004548 * TODO: check the need to add encoding handling there
4549 *
Owen Taylor3473f882001-02-23 17:55:21 +00004550 * Returns the new parser context or NULL
4551 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004552static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004553htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004554 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004555 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004556
Daniel Veillard1d995272002-07-22 16:43:32 +00004557 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004558 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004559 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004560 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004561 if (ctxt == NULL)
4562 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004563
4564 if (encoding != NULL) {
4565 xmlCharEncoding enc;
4566 xmlCharEncodingHandlerPtr handler;
4567
4568 if (ctxt->input->encoding != NULL)
4569 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004570 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004571
4572 enc = xmlParseCharEncoding(encoding);
4573 /*
4574 * registered set of known encodings
4575 */
4576 if (enc != XML_CHAR_ENCODING_ERROR) {
4577 xmlSwitchEncoding(ctxt, enc);
4578 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004579 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02004580 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00004581 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004582 }
4583 } else {
4584 /*
4585 * fallback for unknown encodings
4586 */
4587 handler = xmlFindCharEncodingHandler((const char *) encoding);
4588 if (handler != NULL) {
4589 xmlSwitchToEncoding(ctxt, handler);
4590 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004591 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4592 "Unsupported encoding %s\n",
4593 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004594 }
4595 }
4596 }
4597 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004598}
4599
Daniel Veillard73b013f2003-09-30 12:36:01 +00004600#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004601/************************************************************************
4602 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02004603 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00004604 * *
4605 ************************************************************************/
4606
4607/**
4608 * htmlParseLookupSequence:
4609 * @ctxt: an HTML parser context
4610 * @first: the first char to lookup
4611 * @next: the next char to lookup or zero
4612 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004613 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004614 *
4615 * Try to find if a sequence (first, next, third) or just (first next) or
4616 * (first) is available in the input stream.
4617 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4618 * to avoid rescanning sequences of bytes, it DOES change the state of the
4619 * parser, do not use liberally.
4620 * This is basically similar to xmlParseLookupSequence()
4621 *
4622 * Returns the index to the current parsing point if the full sequence
4623 * is available, -1 otherwise.
4624 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004625static int
Owen Taylor3473f882001-02-23 17:55:21 +00004626htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02004627 xmlChar next, xmlChar third, int iscomment,
4628 int ignoreattrval) {
Owen Taylor3473f882001-02-23 17:55:21 +00004629 int base, len;
4630 htmlParserInputPtr in;
4631 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004632 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02004633 int invalue = 0;
4634 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00004635
4636 in = ctxt->input;
4637 if (in == NULL) return(-1);
4638 base = in->cur - in->base;
4639 if (base < 0) return(-1);
4640 if (ctxt->checkIndex > base)
4641 base = ctxt->checkIndex;
4642 if (in->buf == NULL) {
4643 buf = in->base;
4644 len = in->length;
4645 } else {
4646 buf = in->buf->buffer->content;
4647 len = in->buf->buffer->use;
4648 }
4649 /* take into account the sequence length */
4650 if (third) len -= 2;
4651 else if (next) len --;
4652 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004653 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004654 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4655 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4656 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004657 /* do not increment past <! - some people use <!--> */
4658 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004659 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004660 }
Jiri Netolicky446e1262009-08-07 17:05:36 +02004661 if (ignoreattrval) {
4662 if (buf[base] == '"' || buf[base] == '\'') {
4663 if (invalue) {
4664 if (buf[base] == valdellim) {
4665 invalue = 0;
4666 continue;
4667 }
4668 } else {
4669 valdellim = buf[base];
4670 invalue = 1;
4671 continue;
4672 }
4673 } else if (invalue) {
4674 continue;
4675 }
4676 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004677 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004678 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004679 return(-1);
4680 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4681 (buf[base + 2] == '>')) {
4682 incomment = 0;
4683 base += 2;
4684 }
4685 continue;
4686 }
Owen Taylor3473f882001-02-23 17:55:21 +00004687 if (buf[base] == first) {
4688 if (third != 0) {
4689 if ((buf[base + 1] != next) ||
4690 (buf[base + 2] != third)) continue;
4691 } else if (next != 0) {
4692 if (buf[base + 1] != next) continue;
4693 }
4694 ctxt->checkIndex = 0;
4695#ifdef DEBUG_PUSH
4696 if (next == 0)
4697 xmlGenericError(xmlGenericErrorContext,
4698 "HPP: lookup '%c' found at %d\n",
4699 first, base);
4700 else if (third == 0)
4701 xmlGenericError(xmlGenericErrorContext,
4702 "HPP: lookup '%c%c' found at %d\n",
4703 first, next, base);
Daniel Veillarde77db162009-08-22 11:32:38 +02004704 else
Owen Taylor3473f882001-02-23 17:55:21 +00004705 xmlGenericError(xmlGenericErrorContext,
4706 "HPP: lookup '%c%c%c' found at %d\n",
4707 first, next, third, base);
4708#endif
4709 return(base - (in->cur - in->base));
4710 }
4711 }
4712 ctxt->checkIndex = base;
4713#ifdef DEBUG_PUSH
4714 if (next == 0)
4715 xmlGenericError(xmlGenericErrorContext,
4716 "HPP: lookup '%c' failed\n", first);
4717 else if (third == 0)
4718 xmlGenericError(xmlGenericErrorContext,
4719 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02004720 else
Owen Taylor3473f882001-02-23 17:55:21 +00004721 xmlGenericError(xmlGenericErrorContext,
4722 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4723#endif
4724 return(-1);
4725}
4726
4727/**
4728 * htmlParseTryOrFinish:
4729 * @ctxt: an HTML parser context
4730 * @terminate: last chunk indicator
4731 *
4732 * Try to progress on parsing
4733 *
4734 * Returns zero if no parsing was possible
4735 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004736static int
Owen Taylor3473f882001-02-23 17:55:21 +00004737htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4738 int ret = 0;
4739 htmlParserInputPtr in;
4740 int avail = 0;
4741 xmlChar cur, next;
4742
4743#ifdef DEBUG_PUSH
4744 switch (ctxt->instate) {
4745 case XML_PARSER_EOF:
4746 xmlGenericError(xmlGenericErrorContext,
4747 "HPP: try EOF\n"); break;
4748 case XML_PARSER_START:
4749 xmlGenericError(xmlGenericErrorContext,
4750 "HPP: try START\n"); break;
4751 case XML_PARSER_MISC:
4752 xmlGenericError(xmlGenericErrorContext,
4753 "HPP: try MISC\n");break;
4754 case XML_PARSER_COMMENT:
4755 xmlGenericError(xmlGenericErrorContext,
4756 "HPP: try COMMENT\n");break;
4757 case XML_PARSER_PROLOG:
4758 xmlGenericError(xmlGenericErrorContext,
4759 "HPP: try PROLOG\n");break;
4760 case XML_PARSER_START_TAG:
4761 xmlGenericError(xmlGenericErrorContext,
4762 "HPP: try START_TAG\n");break;
4763 case XML_PARSER_CONTENT:
4764 xmlGenericError(xmlGenericErrorContext,
4765 "HPP: try CONTENT\n");break;
4766 case XML_PARSER_CDATA_SECTION:
4767 xmlGenericError(xmlGenericErrorContext,
4768 "HPP: try CDATA_SECTION\n");break;
4769 case XML_PARSER_END_TAG:
4770 xmlGenericError(xmlGenericErrorContext,
4771 "HPP: try END_TAG\n");break;
4772 case XML_PARSER_ENTITY_DECL:
4773 xmlGenericError(xmlGenericErrorContext,
4774 "HPP: try ENTITY_DECL\n");break;
4775 case XML_PARSER_ENTITY_VALUE:
4776 xmlGenericError(xmlGenericErrorContext,
4777 "HPP: try ENTITY_VALUE\n");break;
4778 case XML_PARSER_ATTRIBUTE_VALUE:
4779 xmlGenericError(xmlGenericErrorContext,
4780 "HPP: try ATTRIBUTE_VALUE\n");break;
4781 case XML_PARSER_DTD:
4782 xmlGenericError(xmlGenericErrorContext,
4783 "HPP: try DTD\n");break;
4784 case XML_PARSER_EPILOG:
4785 xmlGenericError(xmlGenericErrorContext,
4786 "HPP: try EPILOG\n");break;
4787 case XML_PARSER_PI:
4788 xmlGenericError(xmlGenericErrorContext,
4789 "HPP: try PI\n");break;
4790 case XML_PARSER_SYSTEM_LITERAL:
4791 xmlGenericError(xmlGenericErrorContext,
4792 "HPP: try SYSTEM_LITERAL\n");break;
4793 }
4794#endif
4795
4796 while (1) {
4797
4798 in = ctxt->input;
4799 if (in == NULL) break;
4800 if (in->buf == NULL)
4801 avail = in->length - (in->cur - in->base);
4802 else
4803 avail = in->buf->buffer->use - (in->cur - in->base);
4804 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004805 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004806 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004807 /*
4808 * SAX: end of the document processing.
4809 */
4810 ctxt->instate = XML_PARSER_EOF;
4811 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4812 ctxt->sax->endDocument(ctxt->userData);
4813 }
4814 }
4815 if (avail < 1)
4816 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004817 cur = in->cur[0];
4818 if (cur == 0) {
4819 SKIP(1);
4820 continue;
4821 }
4822
Owen Taylor3473f882001-02-23 17:55:21 +00004823 switch (ctxt->instate) {
4824 case XML_PARSER_EOF:
4825 /*
4826 * Document parsing is done !
4827 */
4828 goto done;
4829 case XML_PARSER_START:
4830 /*
4831 * Very first chars read from the document flow.
4832 */
4833 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004834 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004835 SKIP_BLANKS;
4836 if (in->buf == NULL)
4837 avail = in->length - (in->cur - in->base);
4838 else
4839 avail = in->buf->buffer->use - (in->cur - in->base);
4840 }
4841 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4842 ctxt->sax->setDocumentLocator(ctxt->userData,
4843 &xmlDefaultSAXLocator);
4844 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4845 (!ctxt->disableSAX))
4846 ctxt->sax->startDocument(ctxt->userData);
4847
4848 cur = in->cur[0];
4849 next = in->cur[1];
4850 if ((cur == '<') && (next == '!') &&
4851 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4852 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4853 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4854 (UPP(8) == 'E')) {
4855 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004856 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004857 goto done;
4858#ifdef DEBUG_PUSH
4859 xmlGenericError(xmlGenericErrorContext,
4860 "HPP: Parsing internal subset\n");
4861#endif
4862 htmlParseDocTypeDecl(ctxt);
4863 ctxt->instate = XML_PARSER_PROLOG;
4864#ifdef DEBUG_PUSH
4865 xmlGenericError(xmlGenericErrorContext,
4866 "HPP: entering PROLOG\n");
4867#endif
4868 } else {
4869 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004870#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004871 xmlGenericError(xmlGenericErrorContext,
4872 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004873#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004874 }
Owen Taylor3473f882001-02-23 17:55:21 +00004875 break;
4876 case XML_PARSER_MISC:
4877 SKIP_BLANKS;
4878 if (in->buf == NULL)
4879 avail = in->length - (in->cur - in->base);
4880 else
4881 avail = in->buf->buffer->use - (in->cur - in->base);
4882 if (avail < 2)
4883 goto done;
4884 cur = in->cur[0];
4885 next = in->cur[1];
4886 if ((cur == '<') && (next == '!') &&
4887 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4888 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004889 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004890 goto done;
4891#ifdef DEBUG_PUSH
4892 xmlGenericError(xmlGenericErrorContext,
4893 "HPP: Parsing Comment\n");
4894#endif
4895 htmlParseComment(ctxt);
4896 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004897 } else if ((cur == '<') && (next == '?')) {
4898 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004899 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004900 goto done;
4901#ifdef DEBUG_PUSH
4902 xmlGenericError(xmlGenericErrorContext,
4903 "HPP: Parsing PI\n");
4904#endif
4905 htmlParsePI(ctxt);
4906 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004907 } else if ((cur == '<') && (next == '!') &&
4908 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4909 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4910 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4911 (UPP(8) == 'E')) {
4912 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004913 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004914 goto done;
4915#ifdef DEBUG_PUSH
4916 xmlGenericError(xmlGenericErrorContext,
4917 "HPP: Parsing internal subset\n");
4918#endif
4919 htmlParseDocTypeDecl(ctxt);
4920 ctxt->instate = XML_PARSER_PROLOG;
4921#ifdef DEBUG_PUSH
4922 xmlGenericError(xmlGenericErrorContext,
4923 "HPP: entering PROLOG\n");
4924#endif
4925 } else if ((cur == '<') && (next == '!') &&
4926 (avail < 9)) {
4927 goto done;
4928 } else {
4929 ctxt->instate = XML_PARSER_START_TAG;
4930#ifdef DEBUG_PUSH
4931 xmlGenericError(xmlGenericErrorContext,
4932 "HPP: entering START_TAG\n");
4933#endif
4934 }
4935 break;
4936 case XML_PARSER_PROLOG:
4937 SKIP_BLANKS;
4938 if (in->buf == NULL)
4939 avail = in->length - (in->cur - in->base);
4940 else
4941 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02004942 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00004943 goto done;
4944 cur = in->cur[0];
4945 next = in->cur[1];
4946 if ((cur == '<') && (next == '!') &&
4947 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4948 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004949 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004950 goto done;
4951#ifdef DEBUG_PUSH
4952 xmlGenericError(xmlGenericErrorContext,
4953 "HPP: Parsing Comment\n");
4954#endif
4955 htmlParseComment(ctxt);
4956 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004957 } else if ((cur == '<') && (next == '?')) {
4958 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004959 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004960 goto done;
4961#ifdef DEBUG_PUSH
4962 xmlGenericError(xmlGenericErrorContext,
4963 "HPP: Parsing PI\n");
4964#endif
4965 htmlParsePI(ctxt);
4966 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004967 } else if ((cur == '<') && (next == '!') &&
4968 (avail < 4)) {
4969 goto done;
4970 } else {
4971 ctxt->instate = XML_PARSER_START_TAG;
4972#ifdef DEBUG_PUSH
4973 xmlGenericError(xmlGenericErrorContext,
4974 "HPP: entering START_TAG\n");
4975#endif
4976 }
4977 break;
4978 case XML_PARSER_EPILOG:
4979 if (in->buf == NULL)
4980 avail = in->length - (in->cur - in->base);
4981 else
4982 avail = in->buf->buffer->use - (in->cur - in->base);
4983 if (avail < 1)
4984 goto done;
4985 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004986 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004987 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004988 goto done;
4989 }
4990 if (avail < 2)
4991 goto done;
4992 next = in->cur[1];
4993 if ((cur == '<') && (next == '!') &&
4994 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4995 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004996 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004997 goto done;
4998#ifdef DEBUG_PUSH
4999 xmlGenericError(xmlGenericErrorContext,
5000 "HPP: Parsing Comment\n");
5001#endif
5002 htmlParseComment(ctxt);
5003 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005004 } else if ((cur == '<') && (next == '?')) {
5005 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005006 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005007 goto done;
5008#ifdef DEBUG_PUSH
5009 xmlGenericError(xmlGenericErrorContext,
5010 "HPP: Parsing PI\n");
5011#endif
5012 htmlParsePI(ctxt);
5013 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005014 } else if ((cur == '<') && (next == '!') &&
5015 (avail < 4)) {
5016 goto done;
5017 } else {
5018 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005019 ctxt->wellFormed = 0;
5020 ctxt->instate = XML_PARSER_EOF;
5021#ifdef DEBUG_PUSH
5022 xmlGenericError(xmlGenericErrorContext,
5023 "HPP: entering EOF\n");
5024#endif
5025 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5026 ctxt->sax->endDocument(ctxt->userData);
5027 goto done;
5028 }
5029 break;
5030 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005031 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005032 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005033 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005034
5035 if (avail < 2)
5036 goto done;
5037 cur = in->cur[0];
5038 if (cur != '<') {
5039 ctxt->instate = XML_PARSER_CONTENT;
5040#ifdef DEBUG_PUSH
5041 xmlGenericError(xmlGenericErrorContext,
5042 "HPP: entering CONTENT\n");
5043#endif
5044 break;
5045 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005046 if (in->cur[1] == '/') {
5047 ctxt->instate = XML_PARSER_END_TAG;
5048 ctxt->checkIndex = 0;
5049#ifdef DEBUG_PUSH
5050 xmlGenericError(xmlGenericErrorContext,
5051 "HPP: entering END_TAG\n");
5052#endif
5053 break;
5054 }
Owen Taylor3473f882001-02-23 17:55:21 +00005055 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005056 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005057 goto done;
5058
Daniel Veillard597f1c12005-07-03 23:00:18 +00005059 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005060 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005061 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005062 (name == NULL)) {
5063 if (CUR == '>')
5064 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005065 break;
5066 }
Owen Taylor3473f882001-02-23 17:55:21 +00005067
5068 /*
5069 * Lookup the info for that element.
5070 */
5071 info = htmlTagLookup(name);
5072 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005073 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5074 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005075 }
5076
5077 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005078 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005079 */
5080 if ((CUR == '/') && (NXT(1) == '>')) {
5081 SKIP(2);
5082 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5083 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005084 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005085 ctxt->instate = XML_PARSER_CONTENT;
5086#ifdef DEBUG_PUSH
5087 xmlGenericError(xmlGenericErrorContext,
5088 "HPP: entering CONTENT\n");
5089#endif
5090 break;
5091 }
5092
5093 if (CUR == '>') {
5094 NEXT;
5095 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005096 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5097 "Couldn't find end of Start Tag %s\n",
5098 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005099
5100 /*
5101 * end of parsing of this node.
5102 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005103 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005104 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005105 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005106 }
Owen Taylor3473f882001-02-23 17:55:21 +00005107
5108 ctxt->instate = XML_PARSER_CONTENT;
5109#ifdef DEBUG_PUSH
5110 xmlGenericError(xmlGenericErrorContext,
5111 "HPP: entering CONTENT\n");
5112#endif
5113 break;
5114 }
5115
5116 /*
5117 * Check for an Empty Element from DTD definition
5118 */
5119 if ((info != NULL) && (info->empty)) {
5120 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5121 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005122 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005123 }
5124 ctxt->instate = XML_PARSER_CONTENT;
5125#ifdef DEBUG_PUSH
5126 xmlGenericError(xmlGenericErrorContext,
5127 "HPP: entering CONTENT\n");
5128#endif
5129 break;
5130 }
5131 case XML_PARSER_CONTENT: {
5132 long cons;
5133 /*
5134 * Handle preparsed entities and charRef
5135 */
5136 if (ctxt->token != 0) {
5137 xmlChar chr[2] = { 0 , 0 } ;
5138
5139 chr[0] = (xmlChar) ctxt->token;
5140 htmlCheckParagraph(ctxt);
5141 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5142 ctxt->sax->characters(ctxt->userData, chr, 1);
5143 ctxt->token = 0;
5144 ctxt->checkIndex = 0;
5145 }
5146 if ((avail == 1) && (terminate)) {
5147 cur = in->cur[0];
5148 if ((cur != '<') && (cur != '&')) {
5149 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005150 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005151 if (ctxt->sax->ignorableWhitespace != NULL)
5152 ctxt->sax->ignorableWhitespace(
5153 ctxt->userData, &cur, 1);
5154 } else {
5155 htmlCheckParagraph(ctxt);
5156 if (ctxt->sax->characters != NULL)
5157 ctxt->sax->characters(
5158 ctxt->userData, &cur, 1);
5159 }
5160 }
5161 ctxt->token = 0;
5162 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005163 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005164 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005165 }
Owen Taylor3473f882001-02-23 17:55:21 +00005166 }
5167 if (avail < 2)
5168 goto done;
5169 cur = in->cur[0];
5170 next = in->cur[1];
5171 cons = ctxt->nbChars;
5172 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5173 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5174 /*
5175 * Handle SCRIPT/STYLE separately
5176 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005177 if (!terminate) {
5178 int idx;
5179 xmlChar val;
5180
Jiri Netolicky446e1262009-08-07 17:05:36 +02005181 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
Daniel Veillard68716a72006-10-16 09:32:17 +00005182 if (idx < 0)
5183 goto done;
5184 val = in->cur[idx + 2];
5185 if (val == 0) /* bad cut of input */
5186 goto done;
5187 }
Owen Taylor3473f882001-02-23 17:55:21 +00005188 htmlParseScript(ctxt);
5189 if ((cur == '<') && (next == '/')) {
5190 ctxt->instate = XML_PARSER_END_TAG;
5191 ctxt->checkIndex = 0;
5192#ifdef DEBUG_PUSH
5193 xmlGenericError(xmlGenericErrorContext,
5194 "HPP: entering END_TAG\n");
5195#endif
5196 break;
5197 }
5198 } else {
5199 /*
5200 * Sometimes DOCTYPE arrives in the middle of the document
5201 */
5202 if ((cur == '<') && (next == '!') &&
5203 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5204 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5205 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5206 (UPP(8) == 'E')) {
5207 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005208 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005209 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005210 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5211 "Misplaced DOCTYPE declaration\n",
5212 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005213 htmlParseDocTypeDecl(ctxt);
5214 } else if ((cur == '<') && (next == '!') &&
5215 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5216 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005217 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005218 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005219 goto done;
5220#ifdef DEBUG_PUSH
5221 xmlGenericError(xmlGenericErrorContext,
5222 "HPP: Parsing Comment\n");
5223#endif
5224 htmlParseComment(ctxt);
5225 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005226 } else if ((cur == '<') && (next == '?')) {
5227 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005228 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005229 goto done;
5230#ifdef DEBUG_PUSH
5231 xmlGenericError(xmlGenericErrorContext,
5232 "HPP: Parsing PI\n");
5233#endif
5234 htmlParsePI(ctxt);
5235 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005236 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5237 goto done;
5238 } else if ((cur == '<') && (next == '/')) {
5239 ctxt->instate = XML_PARSER_END_TAG;
5240 ctxt->checkIndex = 0;
5241#ifdef DEBUG_PUSH
5242 xmlGenericError(xmlGenericErrorContext,
5243 "HPP: entering END_TAG\n");
5244#endif
5245 break;
5246 } else if (cur == '<') {
5247 ctxt->instate = XML_PARSER_START_TAG;
5248 ctxt->checkIndex = 0;
5249#ifdef DEBUG_PUSH
5250 xmlGenericError(xmlGenericErrorContext,
5251 "HPP: entering START_TAG\n");
5252#endif
5253 break;
5254 } else if (cur == '&') {
5255 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005256 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005257 goto done;
5258#ifdef DEBUG_PUSH
5259 xmlGenericError(xmlGenericErrorContext,
5260 "HPP: Parsing Reference\n");
5261#endif
5262 /* TODO: check generation of subtrees if noent !!! */
5263 htmlParseReference(ctxt);
5264 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005265 /*
5266 * check that the text sequence is complete
5267 * before handing out the data to the parser
5268 * to avoid problems with erroneous end of
5269 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005270 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005271 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005272 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0, 1) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005273 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005274 ctxt->checkIndex = 0;
5275#ifdef DEBUG_PUSH
5276 xmlGenericError(xmlGenericErrorContext,
5277 "HPP: Parsing char data\n");
5278#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005279 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005280 }
5281 }
5282 if (cons == ctxt->nbChars) {
5283 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005284 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5285 "detected an error in element content\n",
5286 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005287 }
5288 NEXT;
5289 break;
5290 }
5291
5292 break;
5293 }
5294 case XML_PARSER_END_TAG:
5295 if (avail < 2)
5296 goto done;
5297 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005298 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005299 goto done;
5300 htmlParseEndTag(ctxt);
5301 if (ctxt->nameNr == 0) {
5302 ctxt->instate = XML_PARSER_EPILOG;
5303 } else {
5304 ctxt->instate = XML_PARSER_CONTENT;
5305 }
5306 ctxt->checkIndex = 0;
5307#ifdef DEBUG_PUSH
5308 xmlGenericError(xmlGenericErrorContext,
5309 "HPP: entering CONTENT\n");
5310#endif
5311 break;
5312 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005313 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5314 "HPP: internal error, state == CDATA\n",
5315 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005316 ctxt->instate = XML_PARSER_CONTENT;
5317 ctxt->checkIndex = 0;
5318#ifdef DEBUG_PUSH
5319 xmlGenericError(xmlGenericErrorContext,
5320 "HPP: entering CONTENT\n");
5321#endif
5322 break;
5323 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005324 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5325 "HPP: internal error, state == DTD\n",
5326 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005327 ctxt->instate = XML_PARSER_CONTENT;
5328 ctxt->checkIndex = 0;
5329#ifdef DEBUG_PUSH
5330 xmlGenericError(xmlGenericErrorContext,
5331 "HPP: entering CONTENT\n");
5332#endif
5333 break;
5334 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005335 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5336 "HPP: internal error, state == COMMENT\n",
5337 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005338 ctxt->instate = XML_PARSER_CONTENT;
5339 ctxt->checkIndex = 0;
5340#ifdef DEBUG_PUSH
5341 xmlGenericError(xmlGenericErrorContext,
5342 "HPP: entering CONTENT\n");
5343#endif
5344 break;
5345 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005346 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5347 "HPP: internal error, state == PI\n",
5348 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005349 ctxt->instate = XML_PARSER_CONTENT;
5350 ctxt->checkIndex = 0;
5351#ifdef DEBUG_PUSH
5352 xmlGenericError(xmlGenericErrorContext,
5353 "HPP: entering CONTENT\n");
5354#endif
5355 break;
5356 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005357 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5358 "HPP: internal error, state == ENTITY_DECL\n",
5359 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005360 ctxt->instate = XML_PARSER_CONTENT;
5361 ctxt->checkIndex = 0;
5362#ifdef DEBUG_PUSH
5363 xmlGenericError(xmlGenericErrorContext,
5364 "HPP: entering CONTENT\n");
5365#endif
5366 break;
5367 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005368 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5369 "HPP: internal error, state == ENTITY_VALUE\n",
5370 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005371 ctxt->instate = XML_PARSER_CONTENT;
5372 ctxt->checkIndex = 0;
5373#ifdef DEBUG_PUSH
5374 xmlGenericError(xmlGenericErrorContext,
5375 "HPP: entering DTD\n");
5376#endif
5377 break;
5378 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005379 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5380 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5381 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005382 ctxt->instate = XML_PARSER_START_TAG;
5383 ctxt->checkIndex = 0;
5384#ifdef DEBUG_PUSH
5385 xmlGenericError(xmlGenericErrorContext,
5386 "HPP: entering START_TAG\n");
5387#endif
5388 break;
5389 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005390 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5391 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5392 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005393 ctxt->instate = XML_PARSER_CONTENT;
5394 ctxt->checkIndex = 0;
5395#ifdef DEBUG_PUSH
5396 xmlGenericError(xmlGenericErrorContext,
5397 "HPP: entering CONTENT\n");
5398#endif
5399 break;
5400 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005401 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5402 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5403 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005404 ctxt->instate = XML_PARSER_CONTENT;
5405 ctxt->checkIndex = 0;
5406#ifdef DEBUG_PUSH
5407 xmlGenericError(xmlGenericErrorContext,
5408 "HPP: entering CONTENT\n");
5409#endif
5410 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005411 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005412 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5413 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5414 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005415 ctxt->instate = XML_PARSER_CONTENT;
5416 ctxt->checkIndex = 0;
5417#ifdef DEBUG_PUSH
5418 xmlGenericError(xmlGenericErrorContext,
5419 "HPP: entering CONTENT\n");
5420#endif
5421 break;
5422
Owen Taylor3473f882001-02-23 17:55:21 +00005423 }
5424 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005425done:
Owen Taylor3473f882001-02-23 17:55:21 +00005426 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005427 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005428 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005429 /*
5430 * SAX: end of the document processing.
5431 */
5432 ctxt->instate = XML_PARSER_EOF;
5433 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5434 ctxt->sax->endDocument(ctxt->userData);
5435 }
5436 }
5437 if ((ctxt->myDoc != NULL) &&
5438 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5439 (ctxt->instate == XML_PARSER_EPILOG))) {
5440 xmlDtdPtr dtd;
5441 dtd = xmlGetIntSubset(ctxt->myDoc);
5442 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02005443 ctxt->myDoc->intSubset =
5444 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005445 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5446 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5447 }
5448#ifdef DEBUG_PUSH
5449 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5450#endif
5451 return(ret);
5452}
5453
5454/**
Owen Taylor3473f882001-02-23 17:55:21 +00005455 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005456 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005457 * @chunk: an char array
5458 * @size: the size in byte of the chunk
5459 * @terminate: last chunk indicator
5460 *
5461 * Parse a Chunk of memory
5462 *
5463 * Returns zero if no error, the xmlParserErrors otherwise.
5464 */
5465int
5466htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5467 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005468 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5469 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5470 "htmlParseChunk: context error\n", NULL, NULL);
5471 return(XML_ERR_INTERNAL_ERROR);
5472 }
Owen Taylor3473f882001-02-23 17:55:21 +00005473 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5474 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5475 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5476 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005477 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02005478
5479 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00005480 if (res < 0) {
5481 ctxt->errNo = XML_PARSER_EOF;
5482 ctxt->disableSAX = 1;
5483 return (XML_PARSER_EOF);
5484 }
Owen Taylor3473f882001-02-23 17:55:21 +00005485 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5486 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005487 ctxt->input->end =
5488 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005489#ifdef DEBUG_PUSH
5490 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5491#endif
5492
Daniel Veillard14f752c2003-08-09 11:44:50 +00005493#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005494 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5495 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005496#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005497 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005498 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5499 xmlParserInputBufferPtr in = ctxt->input->buf;
5500 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5501 (in->raw != NULL)) {
5502 int nbchars;
Daniel Veillarde77db162009-08-22 11:32:38 +02005503
Daniel Veillard14f752c2003-08-09 11:44:50 +00005504 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5505 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005506 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5507 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005508 return(XML_ERR_INVALID_ENCODING);
5509 }
5510 }
5511 }
Owen Taylor3473f882001-02-23 17:55:21 +00005512 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005513 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005514 if (terminate) {
5515 if ((ctxt->instate != XML_PARSER_EOF) &&
5516 (ctxt->instate != XML_PARSER_EPILOG) &&
5517 (ctxt->instate != XML_PARSER_MISC)) {
5518 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005519 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02005520 }
Owen Taylor3473f882001-02-23 17:55:21 +00005521 if (ctxt->instate != XML_PARSER_EOF) {
5522 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5523 ctxt->sax->endDocument(ctxt->userData);
5524 }
5525 ctxt->instate = XML_PARSER_EOF;
5526 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005527 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00005528}
5529
5530/************************************************************************
5531 * *
5532 * User entry points *
5533 * *
5534 ************************************************************************/
5535
5536/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005537 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005538 * @sax: a SAX handler
5539 * @user_data: The user data returned on SAX callbacks
5540 * @chunk: a pointer to an array of chars
5541 * @size: number of chars in the array
5542 * @filename: an optional file name or URI
5543 * @enc: an optional encoding
5544 *
5545 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005546 * The value of @filename is used for fetching external entities
5547 * and error/warning reports.
5548 *
5549 * Returns the new parser context or NULL
5550 */
5551htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005552htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00005553 const char *chunk, int size, const char *filename,
5554 xmlCharEncoding enc) {
5555 htmlParserCtxtPtr ctxt;
5556 htmlParserInputPtr inputStream;
5557 xmlParserInputBufferPtr buf;
5558
Daniel Veillardd0463562001-10-13 09:15:48 +00005559 xmlInitParser();
5560
Owen Taylor3473f882001-02-23 17:55:21 +00005561 buf = xmlAllocParserInputBuffer(enc);
5562 if (buf == NULL) return(NULL);
5563
Daniel Veillardf403d292003-10-05 13:51:35 +00005564 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005565 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005566 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005567 return(NULL);
5568 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005569 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5570 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005571 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005572 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005573 xmlFree(ctxt->sax);
5574 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5575 if (ctxt->sax == NULL) {
5576 xmlFree(buf);
5577 xmlFree(ctxt);
5578 return(NULL);
5579 }
5580 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5581 if (user_data != NULL)
5582 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02005583 }
Owen Taylor3473f882001-02-23 17:55:21 +00005584 if (filename == NULL) {
5585 ctxt->directory = NULL;
5586 } else {
5587 ctxt->directory = xmlParserGetDirectory(filename);
5588 }
5589
5590 inputStream = htmlNewInputStream(ctxt);
5591 if (inputStream == NULL) {
5592 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005593 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005594 return(NULL);
5595 }
5596
5597 if (filename == NULL)
5598 inputStream->filename = NULL;
5599 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005600 inputStream->filename = (char *)
5601 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005602 inputStream->buf = buf;
5603 inputStream->base = inputStream->buf->buffer->content;
5604 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillarde77db162009-08-22 11:32:38 +02005605 inputStream->end =
Daniel Veillard5f704af2003-03-05 10:01:43 +00005606 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005607
5608 inputPush(ctxt, inputStream);
5609
5610 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02005611 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005612 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5613 int cur = ctxt->input->cur - ctxt->input->base;
5614
Daniel Veillarde77db162009-08-22 11:32:38 +02005615 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005616
5617 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5618 ctxt->input->cur = ctxt->input->base + cur;
5619 ctxt->input->end =
5620 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005621#ifdef DEBUG_PUSH
5622 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5623#endif
5624 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005625 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005626
5627 return(ctxt);
5628}
William M. Brack21e4ef22005-01-02 09:53:13 +00005629#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005630
5631/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005632 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005633 * @cur: a pointer to an array of xmlChar
5634 * @encoding: a free form C string describing the HTML document encoding, or NULL
5635 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005636 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005637 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005638 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5639 * to handle parse events. If sax is NULL, fallback to the default DOM
5640 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005641 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005642 * Returns the resulting document tree unless SAX is NULL or the document is
5643 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005644 */
5645
5646htmlDocPtr
5647htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5648 htmlDocPtr ret;
5649 htmlParserCtxtPtr ctxt;
5650
Daniel Veillardd0463562001-10-13 09:15:48 +00005651 xmlInitParser();
5652
Owen Taylor3473f882001-02-23 17:55:21 +00005653 if (cur == NULL) return(NULL);
5654
5655
5656 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5657 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02005658 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005659 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005660 ctxt->sax = sax;
5661 ctxt->userData = userData;
5662 }
5663
5664 htmlParseDocument(ctxt);
5665 ret = ctxt->myDoc;
5666 if (sax != NULL) {
5667 ctxt->sax = NULL;
5668 ctxt->userData = NULL;
5669 }
5670 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005671
Owen Taylor3473f882001-02-23 17:55:21 +00005672 return(ret);
5673}
5674
5675/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005676 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005677 * @cur: a pointer to an array of xmlChar
5678 * @encoding: a free form C string describing the HTML document encoding, or NULL
5679 *
5680 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005681 *
Owen Taylor3473f882001-02-23 17:55:21 +00005682 * Returns the resulting document tree
5683 */
5684
5685htmlDocPtr
5686htmlParseDoc(xmlChar *cur, const char *encoding) {
5687 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5688}
5689
5690
5691/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005692 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005693 * @filename: the filename
5694 * @encoding: a free form C string describing the HTML document encoding, or NULL
5695 *
Daniel Veillarde77db162009-08-22 11:32:38 +02005696 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00005697 * Automatic support for ZLIB/Compress compressed document is provided
5698 * by default if found at compile-time.
5699 *
5700 * Returns the new parser context or NULL
5701 */
5702htmlParserCtxtPtr
5703htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5704{
5705 htmlParserCtxtPtr ctxt;
5706 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005707 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005708 /* htmlCharEncoding enc; */
5709 xmlChar *content, *content_line = (xmlChar *) "charset=";
5710
Daniel Veillarda03e3652004-11-02 18:45:30 +00005711 if (filename == NULL)
5712 return(NULL);
5713
Daniel Veillardf403d292003-10-05 13:51:35 +00005714 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005715 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005716 return(NULL);
5717 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005718 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5719 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005720#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005721 if (xmlDefaultSAXHandler.error != NULL) {
5722 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5723 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005724#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005725 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005726 return(NULL);
5727 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005728
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005729 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5730 xmlFree(canonicFilename);
5731 if (inputStream == NULL) {
5732 xmlFreeParserCtxt(ctxt);
5733 return(NULL);
5734 }
Owen Taylor3473f882001-02-23 17:55:21 +00005735
5736 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005737
Owen Taylor3473f882001-02-23 17:55:21 +00005738 /* set encoding */
5739 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005740 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Daniel Veillarde77db162009-08-22 11:32:38 +02005741 if (content) {
Owen Taylor3473f882001-02-23 17:55:21 +00005742 strcpy ((char *)content, (char *)content_line);
5743 strcat ((char *)content, (char *)encoding);
5744 htmlCheckEncoding (ctxt, content);
5745 xmlFree (content);
5746 }
5747 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005748
Owen Taylor3473f882001-02-23 17:55:21 +00005749 return(ctxt);
5750}
5751
5752/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005753 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005754 * @filename: the filename
5755 * @encoding: a free form C string describing the HTML document encoding, or NULL
5756 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005757 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005758 *
5759 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5760 * compressed document is provided by default if found at compile-time.
5761 * It use the given SAX function block to handle the parsing callback.
5762 * If sax is NULL, fallback to the default DOM tree building routines.
5763 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005764 * Returns the resulting document tree unless SAX is NULL or the document is
5765 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005766 */
5767
5768htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005769htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00005770 void *userData) {
5771 htmlDocPtr ret;
5772 htmlParserCtxtPtr ctxt;
5773 htmlSAXHandlerPtr oldsax = NULL;
5774
Daniel Veillardd0463562001-10-13 09:15:48 +00005775 xmlInitParser();
5776
Owen Taylor3473f882001-02-23 17:55:21 +00005777 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5778 if (ctxt == NULL) return(NULL);
5779 if (sax != NULL) {
5780 oldsax = ctxt->sax;
5781 ctxt->sax = sax;
5782 ctxt->userData = userData;
5783 }
5784
5785 htmlParseDocument(ctxt);
5786
5787 ret = ctxt->myDoc;
5788 if (sax != NULL) {
5789 ctxt->sax = oldsax;
5790 ctxt->userData = NULL;
5791 }
5792 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005793
Owen Taylor3473f882001-02-23 17:55:21 +00005794 return(ret);
5795}
5796
5797/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005798 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005799 * @filename: the filename
5800 * @encoding: a free form C string describing the HTML document encoding, or NULL
5801 *
5802 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5803 * compressed document is provided by default if found at compile-time.
5804 *
5805 * Returns the resulting document tree
5806 */
5807
5808htmlDocPtr
5809htmlParseFile(const char *filename, const char *encoding) {
5810 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5811}
5812
5813/**
5814 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02005815 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00005816 *
5817 * Set and return the previous value for handling HTML omitted tags.
5818 *
5819 * Returns the last value for 0 for no handling, 1 for auto insertion.
5820 */
5821
5822int
5823htmlHandleOmittedElem(int val) {
5824 int old = htmlOmittedDefaultValue;
5825
5826 htmlOmittedDefaultValue = val;
5827 return(old);
5828}
5829
Daniel Veillard930dfb62003-02-05 10:17:38 +00005830/**
5831 * htmlElementAllowedHere:
5832 * @parent: HTML parent element
5833 * @elt: HTML element
5834 *
5835 * Checks whether an HTML element may be a direct child of a parent element.
5836 * Note - doesn't check for deprecated elements
5837 *
5838 * Returns 1 if allowed; 0 otherwise.
5839 */
5840int
5841htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5842 const char** p ;
5843
5844 if ( ! elt || ! parent || ! parent->subelts )
5845 return 0 ;
5846
5847 for ( p = parent->subelts; *p; ++p )
5848 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5849 return 1 ;
5850
5851 return 0 ;
5852}
5853/**
5854 * htmlElementStatusHere:
5855 * @parent: HTML parent element
5856 * @elt: HTML element
5857 *
5858 * Checks whether an HTML element may be a direct child of a parent element.
5859 * and if so whether it is valid or deprecated.
5860 *
5861 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5862 */
5863htmlStatus
5864htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5865 if ( ! parent || ! elt )
5866 return HTML_INVALID ;
5867 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5868 return HTML_INVALID ;
5869
5870 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5871}
5872/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005873 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005874 * @elt: HTML element
5875 * @attr: HTML attribute
5876 * @legacy: whether to allow deprecated attributes
5877 *
5878 * Checks whether an attribute is valid for an element
5879 * Has full knowledge of Required and Deprecated attributes
5880 *
5881 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5882 */
5883htmlStatus
5884htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5885 const char** p ;
5886
5887 if ( !elt || ! attr )
5888 return HTML_INVALID ;
5889
5890 if ( elt->attrs_req )
5891 for ( p = elt->attrs_req; *p; ++p)
5892 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5893 return HTML_REQUIRED ;
5894
5895 if ( elt->attrs_opt )
5896 for ( p = elt->attrs_opt; *p; ++p)
5897 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5898 return HTML_VALID ;
5899
5900 if ( legacy && elt->attrs_depr )
5901 for ( p = elt->attrs_depr; *p; ++p)
5902 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5903 return HTML_DEPRECATED ;
5904
5905 return HTML_INVALID ;
5906}
5907/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005908 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005909 * @node: an htmlNodePtr in a tree
5910 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005911 * for Element nodes)
5912 *
5913 * Checks whether the tree node is valid. Experimental (the author
5914 * only uses the HTML enhancements in a SAX parser)
5915 *
5916 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5917 * legacy allowed) or htmlElementStatusHere (otherwise).
5918 * for Attribute nodes, a return from htmlAttrAllowed
5919 * for other nodes, HTML_NA (no checks performed)
5920 */
5921htmlStatus
5922htmlNodeStatus(const htmlNodePtr node, int legacy) {
5923 if ( ! node )
5924 return HTML_INVALID ;
5925
5926 switch ( node->type ) {
5927 case XML_ELEMENT_NODE:
5928 return legacy
5929 ? ( htmlElementAllowedHere (
5930 htmlTagLookup(node->parent->name) , node->name
5931 ) ? HTML_VALID : HTML_INVALID )
5932 : htmlElementStatusHere(
5933 htmlTagLookup(node->parent->name) ,
5934 htmlTagLookup(node->name) )
5935 ;
5936 case XML_ATTRIBUTE_NODE:
5937 return htmlAttrAllowed(
5938 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5939 default: return HTML_NA ;
5940 }
5941}
Daniel Veillard9475a352003-09-26 12:47:50 +00005942/************************************************************************
5943 * *
5944 * New set (2.6.0) of simpler and more flexible APIs *
5945 * *
5946 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A variable named dict must be visible where the macro
 * is expanded; a NULL dict means the string is always freed.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (no dangling-else surprise); invoke it with a trailing
 * semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5958
5959/**
5960 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005961 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005962 *
5963 * Reset a parser context
5964 */
5965void
5966htmlCtxtReset(htmlParserCtxtPtr ctxt)
5967{
5968 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005969 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02005970
Daniel Veillarda03e3652004-11-02 18:45:30 +00005971 if (ctxt == NULL)
5972 return;
5973
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005974 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00005975 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005976
5977 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5978 xmlFreeInputStream(input);
5979 }
5980 ctxt->inputNr = 0;
5981 ctxt->input = NULL;
5982
5983 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005984 if (ctxt->spaceTab != NULL) {
5985 ctxt->spaceTab[0] = -1;
5986 ctxt->space = &ctxt->spaceTab[0];
5987 } else {
5988 ctxt->space = NULL;
5989 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005990
5991
5992 ctxt->nodeNr = 0;
5993 ctxt->node = NULL;
5994
5995 ctxt->nameNr = 0;
5996 ctxt->name = NULL;
5997
5998 DICT_FREE(ctxt->version);
5999 ctxt->version = NULL;
6000 DICT_FREE(ctxt->encoding);
6001 ctxt->encoding = NULL;
6002 DICT_FREE(ctxt->directory);
6003 ctxt->directory = NULL;
6004 DICT_FREE(ctxt->extSubURI);
6005 ctxt->extSubURI = NULL;
6006 DICT_FREE(ctxt->extSubSystem);
6007 ctxt->extSubSystem = NULL;
6008 if (ctxt->myDoc != NULL)
6009 xmlFreeDoc(ctxt->myDoc);
6010 ctxt->myDoc = NULL;
6011
6012 ctxt->standalone = -1;
6013 ctxt->hasExternalSubset = 0;
6014 ctxt->hasPErefs = 0;
6015 ctxt->html = 1;
6016 ctxt->external = 0;
6017 ctxt->instate = XML_PARSER_START;
6018 ctxt->token = 0;
6019
6020 ctxt->wellFormed = 1;
6021 ctxt->nsWellFormed = 1;
6022 ctxt->valid = 1;
6023 ctxt->vctxt.userData = ctxt;
6024 ctxt->vctxt.error = xmlParserValidityError;
6025 ctxt->vctxt.warning = xmlParserValidityWarning;
6026 ctxt->record_info = 0;
6027 ctxt->nbChars = 0;
6028 ctxt->checkIndex = 0;
6029 ctxt->inSubset = 0;
6030 ctxt->errNo = XML_ERR_OK;
6031 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006032 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006033 ctxt->catalogs = NULL;
6034 xmlInitNodeInfoSeq(&ctxt->node_seq);
6035
6036 if (ctxt->attsDefault != NULL) {
6037 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6038 ctxt->attsDefault = NULL;
6039 }
6040 if (ctxt->attsSpecial != NULL) {
6041 xmlHashFree(ctxt->attsSpecial, NULL);
6042 ctxt->attsSpecial = NULL;
6043 }
6044}
6045
6046/**
6047 * htmlCtxtUseOptions:
6048 * @ctxt: an HTML parser context
6049 * @options: a combination of htmlParserOption(s)
6050 *
6051 * Applies the options to the parser context
6052 *
6053 * Returns 0 in case of success, the set of unknown or unimplemented options
6054 * in case of error.
6055 */
6056int
6057htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6058{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006059 if (ctxt == NULL)
6060 return(-1);
6061
Daniel Veillard9475a352003-09-26 12:47:50 +00006062 if (options & HTML_PARSE_NOWARNING) {
6063 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006064 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006065 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006066 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006067 }
6068 if (options & HTML_PARSE_NOERROR) {
6069 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006070 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006071 ctxt->sax->fatalError = NULL;
6072 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006073 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006074 }
6075 if (options & HTML_PARSE_PEDANTIC) {
6076 ctxt->pedantic = 1;
6077 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006078 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006079 } else
6080 ctxt->pedantic = 0;
6081 if (options & XML_PARSE_NOBLANKS) {
6082 ctxt->keepBlanks = 0;
6083 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6084 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006085 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006086 } else
6087 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006088 if (options & HTML_PARSE_RECOVER) {
6089 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006090 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006091 } else
6092 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006093 if (options & HTML_PARSE_COMPACT) {
6094 ctxt->options |= HTML_PARSE_COMPACT;
6095 options -= HTML_PARSE_COMPACT;
6096 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006097 if (options & XML_PARSE_HUGE) {
6098 ctxt->options |= XML_PARSE_HUGE;
6099 options -= XML_PARSE_HUGE;
6100 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006101 ctxt->dictNames = 0;
6102 return (options);
6103}
6104
6105/**
6106 * htmlDoRead:
6107 * @ctxt: an HTML parser context
6108 * @URL: the base URL to use for the document
6109 * @encoding: the document encoding, or NULL
6110 * @options: a combination of htmlParserOption(s)
6111 * @reuse: keep the context for reuse
6112 *
6113 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006114 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006115 * Returns the resulting document tree or NULL
6116 */
6117static htmlDocPtr
6118htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6119 int options, int reuse)
6120{
6121 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006122
Daniel Veillard9475a352003-09-26 12:47:50 +00006123 htmlCtxtUseOptions(ctxt, options);
6124 ctxt->html = 1;
6125 if (encoding != NULL) {
6126 xmlCharEncodingHandlerPtr hdlr;
6127
6128 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006129 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006130 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006131 if (ctxt->input->encoding != NULL)
6132 xmlFree((xmlChar *) ctxt->input->encoding);
6133 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6134 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006135 }
6136 if ((URL != NULL) && (ctxt->input != NULL) &&
6137 (ctxt->input->filename == NULL))
6138 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6139 htmlParseDocument(ctxt);
6140 ret = ctxt->myDoc;
6141 ctxt->myDoc = NULL;
6142 if (!reuse) {
6143 if ((ctxt->dictNames) &&
6144 (ret != NULL) &&
6145 (ret->dict == ctxt->dict))
6146 ctxt->dict = NULL;
6147 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006148 }
6149 return (ret);
6150}
6151
6152/**
6153 * htmlReadDoc:
6154 * @cur: a pointer to a zero terminated string
6155 * @URL: the base URL to use for the document
6156 * @encoding: the document encoding, or NULL
6157 * @options: a combination of htmlParserOption(s)
6158 *
6159 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006160 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006161 * Returns the resulting document tree
6162 */
6163htmlDocPtr
6164htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6165{
6166 htmlParserCtxtPtr ctxt;
6167
6168 if (cur == NULL)
6169 return (NULL);
6170
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006171 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006172 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006173 if (ctxt == NULL)
6174 return (NULL);
6175 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6176}
6177
6178/**
6179 * htmlReadFile:
6180 * @filename: a file or URL
6181 * @encoding: the document encoding, or NULL
6182 * @options: a combination of htmlParserOption(s)
6183 *
6184 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006185 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006186 * Returns the resulting document tree
6187 */
6188htmlDocPtr
6189htmlReadFile(const char *filename, const char *encoding, int options)
6190{
6191 htmlParserCtxtPtr ctxt;
6192
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006193 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006194 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6195 if (ctxt == NULL)
6196 return (NULL);
6197 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6198}
6199
6200/**
6201 * htmlReadMemory:
6202 * @buffer: a pointer to a char array
6203 * @size: the size of the array
6204 * @URL: the base URL to use for the document
6205 * @encoding: the document encoding, or NULL
6206 * @options: a combination of htmlParserOption(s)
6207 *
6208 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006209 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006210 * Returns the resulting document tree
6211 */
6212htmlDocPtr
6213htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6214{
6215 htmlParserCtxtPtr ctxt;
6216
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006217 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006218 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6219 if (ctxt == NULL)
6220 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006221 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006222 if (ctxt->sax != NULL)
6223 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006224 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6225}
6226
6227/**
6228 * htmlReadFd:
6229 * @fd: an open file descriptor
6230 * @URL: the base URL to use for the document
6231 * @encoding: the document encoding, or NULL
6232 * @options: a combination of htmlParserOption(s)
6233 *
6234 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006235 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006236 * Returns the resulting document tree
6237 */
6238htmlDocPtr
6239htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6240{
6241 htmlParserCtxtPtr ctxt;
6242 xmlParserInputBufferPtr input;
6243 xmlParserInputPtr stream;
6244
6245 if (fd < 0)
6246 return (NULL);
6247
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006248 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006249 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6250 if (input == NULL)
6251 return (NULL);
6252 ctxt = xmlNewParserCtxt();
6253 if (ctxt == NULL) {
6254 xmlFreeParserInputBuffer(input);
6255 return (NULL);
6256 }
6257 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6258 if (stream == NULL) {
6259 xmlFreeParserInputBuffer(input);
6260 xmlFreeParserCtxt(ctxt);
6261 return (NULL);
6262 }
6263 inputPush(ctxt, stream);
6264 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6265}
6266
6267/**
6268 * htmlReadIO:
6269 * @ioread: an I/O read function
6270 * @ioclose: an I/O close function
6271 * @ioctx: an I/O handler
6272 * @URL: the base URL to use for the document
6273 * @encoding: the document encoding, or NULL
6274 * @options: a combination of htmlParserOption(s)
6275 *
6276 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006277 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006278 * Returns the resulting document tree
6279 */
6280htmlDocPtr
6281htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6282 void *ioctx, const char *URL, const char *encoding, int options)
6283{
6284 htmlParserCtxtPtr ctxt;
6285 xmlParserInputBufferPtr input;
6286 xmlParserInputPtr stream;
6287
6288 if (ioread == NULL)
6289 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006290 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006291
6292 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6293 XML_CHAR_ENCODING_NONE);
6294 if (input == NULL)
6295 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006296 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006297 if (ctxt == NULL) {
6298 xmlFreeParserInputBuffer(input);
6299 return (NULL);
6300 }
6301 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6302 if (stream == NULL) {
6303 xmlFreeParserInputBuffer(input);
6304 xmlFreeParserCtxt(ctxt);
6305 return (NULL);
6306 }
6307 inputPush(ctxt, stream);
6308 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6309}
6310
6311/**
6312 * htmlCtxtReadDoc:
6313 * @ctxt: an HTML parser context
6314 * @cur: a pointer to a zero terminated string
6315 * @URL: the base URL to use for the document
6316 * @encoding: the document encoding, or NULL
6317 * @options: a combination of htmlParserOption(s)
6318 *
6319 * parse an XML in-memory document and build a tree.
6320 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006321 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006322 * Returns the resulting document tree
6323 */
6324htmlDocPtr
6325htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6326 const char *URL, const char *encoding, int options)
6327{
6328 xmlParserInputPtr stream;
6329
6330 if (cur == NULL)
6331 return (NULL);
6332 if (ctxt == NULL)
6333 return (NULL);
6334
6335 htmlCtxtReset(ctxt);
6336
6337 stream = xmlNewStringInputStream(ctxt, cur);
6338 if (stream == NULL) {
6339 return (NULL);
6340 }
6341 inputPush(ctxt, stream);
6342 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6343}
6344
6345/**
6346 * htmlCtxtReadFile:
6347 * @ctxt: an HTML parser context
6348 * @filename: a file or URL
6349 * @encoding: the document encoding, or NULL
6350 * @options: a combination of htmlParserOption(s)
6351 *
6352 * parse an XML file from the filesystem or the network.
6353 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006354 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006355 * Returns the resulting document tree
6356 */
6357htmlDocPtr
6358htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6359 const char *encoding, int options)
6360{
6361 xmlParserInputPtr stream;
6362
6363 if (filename == NULL)
6364 return (NULL);
6365 if (ctxt == NULL)
6366 return (NULL);
6367
6368 htmlCtxtReset(ctxt);
6369
Daniel Veillard29614c72004-11-26 10:47:26 +00006370 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006371 if (stream == NULL) {
6372 return (NULL);
6373 }
6374 inputPush(ctxt, stream);
6375 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6376}
6377
6378/**
6379 * htmlCtxtReadMemory:
6380 * @ctxt: an HTML parser context
6381 * @buffer: a pointer to a char array
6382 * @size: the size of the array
6383 * @URL: the base URL to use for the document
6384 * @encoding: the document encoding, or NULL
6385 * @options: a combination of htmlParserOption(s)
6386 *
6387 * parse an XML in-memory document and build a tree.
6388 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006389 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006390 * Returns the resulting document tree
6391 */
6392htmlDocPtr
6393htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6394 const char *URL, const char *encoding, int options)
6395{
6396 xmlParserInputBufferPtr input;
6397 xmlParserInputPtr stream;
6398
6399 if (ctxt == NULL)
6400 return (NULL);
6401 if (buffer == NULL)
6402 return (NULL);
6403
6404 htmlCtxtReset(ctxt);
6405
6406 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6407 if (input == NULL) {
6408 return(NULL);
6409 }
6410
6411 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6412 if (stream == NULL) {
6413 xmlFreeParserInputBuffer(input);
6414 return(NULL);
6415 }
6416
6417 inputPush(ctxt, stream);
6418 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6419}
6420
6421/**
6422 * htmlCtxtReadFd:
6423 * @ctxt: an HTML parser context
6424 * @fd: an open file descriptor
6425 * @URL: the base URL to use for the document
6426 * @encoding: the document encoding, or NULL
6427 * @options: a combination of htmlParserOption(s)
6428 *
6429 * parse an XML from a file descriptor and build a tree.
6430 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006431 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006432 * Returns the resulting document tree
6433 */
6434htmlDocPtr
6435htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6436 const char *URL, const char *encoding, int options)
6437{
6438 xmlParserInputBufferPtr input;
6439 xmlParserInputPtr stream;
6440
6441 if (fd < 0)
6442 return (NULL);
6443 if (ctxt == NULL)
6444 return (NULL);
6445
6446 htmlCtxtReset(ctxt);
6447
6448
6449 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6450 if (input == NULL)
6451 return (NULL);
6452 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6453 if (stream == NULL) {
6454 xmlFreeParserInputBuffer(input);
6455 return (NULL);
6456 }
6457 inputPush(ctxt, stream);
6458 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6459}
6460
6461/**
6462 * htmlCtxtReadIO:
6463 * @ctxt: an HTML parser context
6464 * @ioread: an I/O read function
6465 * @ioclose: an I/O close function
6466 * @ioctx: an I/O handler
6467 * @URL: the base URL to use for the document
6468 * @encoding: the document encoding, or NULL
6469 * @options: a combination of htmlParserOption(s)
6470 *
6471 * parse an HTML document from I/O functions and source and build a tree.
6472 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006473 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006474 * Returns the resulting document tree
6475 */
6476htmlDocPtr
6477htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6478 xmlInputCloseCallback ioclose, void *ioctx,
6479 const char *URL,
6480 const char *encoding, int options)
6481{
6482 xmlParserInputBufferPtr input;
6483 xmlParserInputPtr stream;
6484
6485 if (ioread == NULL)
6486 return (NULL);
6487 if (ctxt == NULL)
6488 return (NULL);
6489
6490 htmlCtxtReset(ctxt);
6491
6492 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6493 XML_CHAR_ENCODING_NONE);
6494 if (input == NULL)
6495 return (NULL);
6496 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6497 if (stream == NULL) {
6498 xmlFreeParserInputBuffer(input);
6499 return (NULL);
6500 }
6501 inputPush(ctxt, stream);
6502 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6503}
6504
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006505#define bottom_HTMLparser
6506#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006507#endif /* LIBXML_HTML_ENABLED */