blob: 2b569ca9041579b8e1a6882ccf9e6138241f0fd6 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020062 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000063 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200150 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167 ctxt->html = 3;
168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 if (ctxt->nameNr >= ctxt->nameMax) {
171 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000172 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000173 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 ctxt->nameMax *
175 sizeof(ctxt->nameTab[0]));
176 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000177 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000178 return (0);
179 }
180 }
181 ctxt->nameTab[ctxt->nameNr] = value;
182 ctxt->name = value;
183 return (ctxt->nameNr++);
184}
185/**
186 * htmlnamePop:
187 * @ctxt: an HTML parser context
188 *
189 * Pops the top element name from the name stack
190 *
191 * Returns the name just removed
192 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000193static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194htmlnamePop(htmlParserCtxtPtr ctxt)
195{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000197
Daniel Veillard1c732d22002-11-30 11:22:59 +0000198 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000199 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000200 ctxt->nameNr--;
201 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 if (ctxt->nameNr > 0)
204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205 else
206 ctxt->name = NULL;
207 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000208 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000209 return (ret);
210}
Owen Taylor3473f882001-02-23 17:55:21 +0000211
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *
 * Statement-like macros are wrapped in do { } while (0) and macro arguments
 * are parenthesized so that they expand safely inside if/else and with
 * arbitrary argument expressions.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), \
     ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

/* Discard consumed input once enough of it has been parsed. */
#define SHRINK do {                                                     \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&     \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))        \
        xmlParserInputShrink(ctxt->input);                              \
  } while (0)

/* Pull in more input when running low; skipped when ctxt->progressive is set. */
#define GROW do {                                                       \
    if ((ctxt->progressive == 0) &&                                     \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))            \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                   \
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/* Advance by one character of (l) bytes, tracking line and column. */
#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/* Append char v (encoded on l bytes) to buffer b at index i, advancing i. */
#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
291
292/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200293 * htmlFindEncoding:
294 * @the HTML parser context
295 *
296 * Ty to find and encoding in the current data available in the input
297 * buffer this is needed to try to switch to the proper encoding when
298 * one face a character error.
299 * That's an heuristic, since it's operating outside of parsing it could
300 * try to use a meta which had been commented out, that's the reason it
301 * should only be used in case of error, not as a default.
302 *
303 * Returns an encoding string or NULL if not found, the string need to
304 * be freed
305 */
306static xmlChar *
307htmlFindEncoding(xmlParserCtxtPtr ctxt) {
308 const xmlChar *start, *cur, *end;
309
310 if ((ctxt == NULL) || (ctxt->input == NULL) ||
311 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
312 (ctxt->input->buf->encoder != NULL))
313 return(NULL);
314 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
315 return(NULL);
316
317 start = ctxt->input->cur;
318 end = ctxt->input->end;
319 /* we also expect the input buffer to be zero terminated */
320 if (*end != 0)
321 return(NULL);
322
323 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
324 if (cur == NULL)
325 return(NULL);
326 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
327 if (cur == NULL)
328 return(NULL);
329 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
330 if (cur == NULL)
331 return(NULL);
332 cur += 8;
333 start = cur;
334 while (((*cur >= 'A') && (*cur <= 'Z')) ||
335 ((*cur >= 'a') && (*cur <= 'z')) ||
336 ((*cur >= '0') && (*cur <= '9')) ||
337 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
338 cur++;
339 if (cur == start)
340 return(NULL);
341 return(xmlStrndup(start, cur - start));
342}
343
344/**
Owen Taylor3473f882001-02-23 17:55:21 +0000345 * htmlCurrentChar:
346 * @ctxt: the HTML parser context
347 * @len: pointer to the length of the char read
348 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000349 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000350 * bytes in the input buffer. Implement the end of line normalization:
351 * 2.11 End-of-Line Handling
352 * If the encoding is unspecified, in the case we find an ISO-Latin-1
353 * char, then the encoding converter is plugged in automatically.
354 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000355 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000356 */
357
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000358static int
Owen Taylor3473f882001-02-23 17:55:21 +0000359htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
360 if (ctxt->instate == XML_PARSER_EOF)
361 return(0);
362
363 if (ctxt->token != 0) {
364 *len = 0;
365 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200366 }
Owen Taylor3473f882001-02-23 17:55:21 +0000367 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
368 /*
369 * We are supposed to handle UTF8, check it's valid
370 * From rfc2044: encoding of the Unicode values on UTF-8:
371 *
372 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
373 * 0000 0000-0000 007F 0xxxxxxx
374 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200375 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000376 *
377 * Check for the 0x110000 limit too
378 */
379 const unsigned char *cur = ctxt->input->cur;
380 unsigned char c;
381 unsigned int val;
382
383 c = *cur;
384 if (c & 0x80) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200385 if (cur[1] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000386 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200387 cur = ctxt->input->cur;
388 }
Owen Taylor3473f882001-02-23 17:55:21 +0000389 if ((cur[1] & 0xc0) != 0x80)
390 goto encoding_error;
391 if ((c & 0xe0) == 0xe0) {
392
Adiel Mittmann8a103792009-08-25 11:27:13 +0200393 if (cur[2] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000394 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200395 cur = ctxt->input->cur;
396 }
Owen Taylor3473f882001-02-23 17:55:21 +0000397 if ((cur[2] & 0xc0) != 0x80)
398 goto encoding_error;
399 if ((c & 0xf0) == 0xf0) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200400 if (cur[3] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000401 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200402 cur = ctxt->input->cur;
403 }
Owen Taylor3473f882001-02-23 17:55:21 +0000404 if (((c & 0xf8) != 0xf0) ||
405 ((cur[3] & 0xc0) != 0x80))
406 goto encoding_error;
407 /* 4-byte code */
408 *len = 4;
409 val = (cur[0] & 0x7) << 18;
410 val |= (cur[1] & 0x3f) << 12;
411 val |= (cur[2] & 0x3f) << 6;
412 val |= cur[3] & 0x3f;
413 } else {
414 /* 3-byte code */
415 *len = 3;
416 val = (cur[0] & 0xf) << 12;
417 val |= (cur[1] & 0x3f) << 6;
418 val |= cur[2] & 0x3f;
419 }
420 } else {
421 /* 2-byte code */
422 *len = 2;
423 val = (cur[0] & 0x1f) << 6;
424 val |= cur[1] & 0x3f;
425 }
426 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000427 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
428 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200429 }
Owen Taylor3473f882001-02-23 17:55:21 +0000430 return(val);
431 } else {
Daniel Veillard856c6682009-08-24 18:16:56 +0200432 if ((*ctxt->input->cur == 0) &&
433 (ctxt->input->cur < ctxt->input->end)) {
434 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
435 "Char 0x%X out of allowed range\n", 0);
436 *len = 1;
437 return(' ');
438 }
Owen Taylor3473f882001-02-23 17:55:21 +0000439 /* 1-byte code */
440 *len = 1;
441 return((int) *ctxt->input->cur);
442 }
443 }
444 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000445 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000446 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000447 * XML constructs only use < 128 chars
448 */
449 *len = 1;
450 if ((int) *ctxt->input->cur < 0x80)
451 return((int) *ctxt->input->cur);
452
453 /*
454 * Humm this is bad, do an automatic flow conversion
455 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200456 {
457 xmlChar * guess;
458 xmlCharEncodingHandlerPtr handler;
459
460 guess = htmlFindEncoding(ctxt);
461 if (guess == NULL) {
462 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
463 } else {
464 if (ctxt->input->encoding != NULL)
465 xmlFree((xmlChar *) ctxt->input->encoding);
466 ctxt->input->encoding = guess;
467 handler = xmlFindCharEncodingHandler((const char *) guess);
468 if (handler != NULL) {
469 xmlSwitchToEncoding(ctxt, handler);
470 } else {
471 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
472 "Unsupported encoding %s", guess, NULL);
473 }
474 }
475 ctxt->charset = XML_CHAR_ENCODING_UTF8;
476 }
477
Owen Taylor3473f882001-02-23 17:55:21 +0000478 return(xmlCurrentChar(ctxt, len));
479
480encoding_error:
481 /*
482 * If we detect an UTF8 error that probably mean that the
483 * input encoding didn't get properly advertized in the
484 * declaration header. Report the error and switch the encoding
485 * to ISO-Latin-1 (if you don't like this policy, just declare the
486 * encoding !)
487 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000488 {
489 char buffer[150];
490
Daniel Veillard861101d2007-06-12 08:38:57 +0000491 if (ctxt->input->end - ctxt->input->cur >= 4) {
492 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
493 ctxt->input->cur[0], ctxt->input->cur[1],
494 ctxt->input->cur[2], ctxt->input->cur[3]);
495 } else {
496 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
497 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000498 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
499 "Input is not proper UTF-8, indicate encoding !\n",
500 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000501 }
502
Daniel Veillarde77db162009-08-22 11:32:38 +0200503 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000504 *len = 1;
505 return((int) *ctxt->input->cur);
506}
507
508/**
Owen Taylor3473f882001-02-23 17:55:21 +0000509 * htmlSkipBlankChars:
510 * @ctxt: the HTML parser context
511 *
512 * skip all blanks character found at that point in the input streams.
513 *
514 * Returns the number of space chars skipped
515 */
516
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000517static int
Owen Taylor3473f882001-02-23 17:55:21 +0000518htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
519 int res = 0;
520
William M. Brack76e95df2003-10-18 16:20:14 +0000521 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000522 if ((*ctxt->input->cur == 0) &&
523 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
524 xmlPopInput(ctxt);
525 } else {
526 if (*(ctxt->input->cur) == '\n') {
527 ctxt->input->line++; ctxt->input->col = 1;
528 } else ctxt->input->col++;
529 ctxt->input->cur++;
530 ctxt->nbChars++;
531 if (*ctxt->input->cur == 0)
532 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
533 }
534 res++;
535 }
536 return(res);
537}
538
539
540
541/************************************************************************
542 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200543 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000544 * *
545 ************************************************************************/
546
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
	, subElements , impliedsubelt , Attributes, userdata
 */

/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma separated list of element names and
 * is paired with an NB_* macro giving the number of names in the list.
 * The NB_* sums are parenthesized so they can be used in any expression.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* All name tables below are NULL terminated. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* const a_attrs[] = {
    ATTRS, "charset", "type", "name", "href", "hreflang", "rel", "rev",
    "accesskey", "shape", "coords", "tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = {
    COREATTRS , "codebase", "archive", "alt", "name", "height", "width",
    "align", "hspace", "vspace", NULL } ;
static const char* const area_attrs[] = {
    "shape", "coords", "href", "nohref", "tabindex", "accesskey",
    "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
    { "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = {
    "background", "bgcolor", "text", "link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = {
    ATTRS, "name", "value", "type", "disabled", "tabindex", "accesskey",
    "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
/* The NULL terminator was missing here, unlike every other table. */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = {
    HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript",
    "noframes", "blockquote", "isindex", "hr", "table", "fieldset",
    "address", NULL } ;
static const char* const form_attrs[] = {
    ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset",
    "accept-charset", NULL } ;
static const char* const frame_attrs[] = {
    COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth",
    "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = {
    "title", "isindex", "base", "script", "style", "meta", "link",
    "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = {
    COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth",
    "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = {
    ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = {
    COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder",
    "height", "hidden", "hspace", "name", "palette", "pluginspace",
    "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = {
    ATTRS, "type", "name", "value", "checked", "disabled", "readonly",
    "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex",
    "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept",
    NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = {
    ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media",
    NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = {
    ATTRS, "declare", "classid", "codebase", "data", "type", "codetype",
    "archive", "standby", "height", "width", "usemap", "name", "tabindex",
    NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = {
    PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map",
    "q", "span", "bdo", "iframe", NULL } ;
691static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
692static const char* const language_attr[] = { "language", NULL } ;
693static const char* const select_content[] = { "optgroup", "option", NULL } ;
694static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
695static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200696static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000697static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
698static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
699static const char* const tr_elt[] = { "tr", NULL } ;
700static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
701static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
702static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
703static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
704static const char* const tr_contents[] = { "th", "td", NULL } ;
705static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
706static const char* const li_elt[] = { "li", NULL } ;
707static const char* const ul_depr[] = { "type", "compact", NULL} ;
708static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000709
710#define DECL (const char**)
711
Daniel Veillard22090732001-07-16 00:06:07 +0000712static const htmlElemDesc
713html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000714{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
715 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
716},
717{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
718 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
719},
720{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
721 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
722},
723{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
724 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
725},
726{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
727 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
728},
729{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
730 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
731},
732{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
733 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
734},
735{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
736 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
737},
738{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
739 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
740},
741{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
742 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
743},
744{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
745 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
746},
747{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
748 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
749},
750{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
751 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
752},
753{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
754 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
755},
756{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
757 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
758},
759{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
760 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
761},
762{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
763 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
764},
765{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
766 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
767},
768{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
769 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
770},
771{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
772 EMPTY , NULL , DECL col_attrs , NULL, NULL
773},
774{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
775 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
776},
777{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
778 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
779},
780{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
781 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
782},
783{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
784 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
785},
786{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
787 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
788},
789{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
790 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
791},
792{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000793 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000794},
795{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
796 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
797},
798{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
799 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
800},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000801{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000802 EMPTY, NULL, DECL embed_attrs, NULL, NULL
803},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000804{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
805 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
806},
807{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
808 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
809},
810{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
811 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
812},
813{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
814 EMPTY, NULL, NULL, DECL frame_attrs, NULL
815},
816{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
817 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
818},
819{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
820 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
821},
822{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
823 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
824},
825{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
826 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
827},
828{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
829 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
830},
831{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
832 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
833},
834{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
835 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
836},
837{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
838 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
839},
840{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
841 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
842},
843{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
844 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
845},
846{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
847 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
848},
849{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
850 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
851},
852{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000853 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000854},
855{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
856 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
857},
858{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
859 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
860},
861{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
862 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
863},
864{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
865 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
866},
867{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
868 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
869},
870{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
871 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
872},
873{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
874 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
875},
876{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
877 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
878},
879{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000880 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000881},
882{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
883 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
884},
885{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
886 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
887},
888{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
889 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
890},
891{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
892 DECL html_flow, "div", DECL html_attrs, NULL, NULL
893},
894{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
895 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
896},
897{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
898 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
899},
900{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000901 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000902},
903{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
904 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
905},
906{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
907 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
908},
909{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000910 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000911},
912{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
913 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
914},
915{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
916 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
917},
918{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
919 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
920},
921{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
922 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
923},
924{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
925 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
926},
927{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
928 DECL select_content, NULL, DECL select_attrs, NULL, NULL
929},
930{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
931 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
932},
933{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
934 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
935},
936{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
937 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
938},
939{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
940 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
941},
942{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
943 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
944},
945{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
946 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
947},
948{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
949 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
950},
951{ "table", 0, 0, 0, 0, 0, 0, 0, "",
952 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
953},
954{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
955 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
956},
957{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
958 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
959},
960{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
961 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
962},
963{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
964 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
965},
966{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
967 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
968},
969{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
970 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
971},
972{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
973 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
974},
975{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
976 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
977},
978{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
979 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
980},
981{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
982 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
983},
984{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
985 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
986},
987{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
988 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
989}
Owen Taylor3473f882001-02-23 17:55:21 +0000990};
991
992/*
Owen Taylor3473f882001-02-23 17:55:21 +0000993 * start tags that imply the end of current element
994 */
Daniel Veillard065abe82006-07-03 08:55:04 +0000995static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000996"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
997 "dl", "ul", "ol", "menu", "dir", "address", "pre",
998 "listing", "xmp", "head", NULL,
999"head", "p", NULL,
1000"title", "p", NULL,
1001"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +00001002"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001003"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1004 "pre", "listing", "xmp", "head", "li", NULL,
1005"hr", "p", "head", NULL,
1006"h1", "p", "head", NULL,
1007"h2", "p", "head", NULL,
1008"h3", "p", "head", NULL,
1009"h4", "p", "head", NULL,
1010"h5", "p", "head", NULL,
1011"h6", "p", "head", NULL,
1012"dir", "p", "head", NULL,
1013"address", "p", "head", "ul", NULL,
1014"pre", "p", "head", "ul", NULL,
1015"listing", "p", "head", NULL,
1016"xmp", "p", "head", NULL,
1017"blockquote", "p", "head", NULL,
1018"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1019 "xmp", "head", NULL,
1020"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1021 "head", "dd", NULL,
1022"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1023 "head", "dt", NULL,
1024"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1025 "listing", "xmp", NULL,
1026"ol", "p", "head", "ul", NULL,
1027"menu", "p", "head", "ul", NULL,
Daniel Veillard6339c1a2009-08-24 11:59:51 +02001028"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001029"div", "p", "head", NULL,
1030"noscript", "p", "head", NULL,
1031"center", "font", "b", "i", "p", "head", NULL,
1032"a", "a", NULL,
1033"caption", "p", NULL,
1034"colgroup", "caption", "colgroup", "col", "p", NULL,
1035"col", "caption", "col", "p", NULL,
1036"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1037 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001038"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Daniel Veillarde77db162009-08-22 11:32:38 +02001039"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001040"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1041"thead", "caption", "col", "colgroup", NULL,
1042"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1043 "tbody", "p", NULL,
1044"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1045 "tfoot", "tbody", "p", NULL,
1046"optgroup", "option", NULL,
1047"option", "option", NULL,
1048"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1049 "pre", "listing", "xmp", "a", NULL,
1050NULL
1051};
1052
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL	/* terminator */
};
1065
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no NULL terminator; its length must be derived from
 * sizeof.
 */
static const char *const htmlScriptAttributes[] = {
	"onclick",
	"ondblclick",
	"onmousedown",
	"onmouseup",
	"onmouseover",
	"onmousemove",
	"onmouseout",
	"onkeypress",
	"onkeydown",
	"onkeyup",
	"onload",
	"onunload",
	"onfocus",
	"onblur",
	"onsubmit",
	"onreset",	/* was misspelled "onrest", so onreset never matched */
	"onchange",
	"onselect"
};
1091
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags.
 * End tags are only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end-tag priority) association. */
typedef struct {
    const char *name;	/* element name, NULL in the terminating entry */
    int priority;	/* priority used when resolving stray end tags */
} elementPriority;

/*
 * The last entry has a NULL name; its priority is the one returned by
 * htmlGetEndPriority() for every element not listed here.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001119
/*
 * Index over htmlStartClose: each used slot points at the first string of
 * one group, unused slots are NULL.  Filled in by htmlInitAutoClose().
 */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlInitAutoClose() has built the index above. */
static int htmlStartCloseIndexinitialized = 0;
1122
1123/************************************************************************
1124 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001125 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001126 * *
1127 ************************************************************************/
1128
1129/**
1130 * htmlInitAutoClose:
1131 *
1132 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1133 * This is not reentrant. Call xmlInitParser() once before processing in
1134 * case of use in multithreaded programs.
1135 */
1136void
1137htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001138 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001139
1140 if (htmlStartCloseIndexinitialized) return;
1141
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001142 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1143 indx = 0;
1144 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001145 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001146 while (htmlStartClose[i] != NULL) i++;
1147 i++;
1148 }
1149 htmlStartCloseIndexinitialized = 1;
1150}
1151
1152/**
1153 * htmlTagLookup:
1154 * @tag: The tag name in lowercase
1155 *
1156 * Lookup the HTML tag in the ElementTable
1157 *
1158 * Returns the related htmlElemDescPtr or NULL if not found.
1159 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001160const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001161htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001162 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001163
1164 for (i = 0; i < (sizeof(html40ElementTable) /
1165 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001166 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001167 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001168 }
1169 return(NULL);
1170}
1171
1172/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001173 * htmlGetEndPriority:
1174 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001175 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001176 * Return value: The "endtag" priority.
1177 **/
1178static int
1179htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001180 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001181
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001182 while ((htmlEndPriority[i].name != NULL) &&
1183 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1184 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001185
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001186 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001187}
1188
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001189
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001190/**
Owen Taylor3473f882001-02-23 17:55:21 +00001191 * htmlCheckAutoClose:
1192 * @newtag: The new tag name
1193 * @oldtag: The old tag name
1194 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001195 * Checks whether the new tag is one of the registered valid tags for
1196 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001197 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1198 *
1199 * Returns 0 if no, 1 if yes.
1200 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001201static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001202htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1203{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001204 int i, indx;
1205 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001206
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001207 if (htmlStartCloseIndexinitialized == 0)
1208 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001209
1210 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001211 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001212 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001213 if (closed == NULL)
1214 return (0);
1215 if (xmlStrEqual(BAD_CAST * closed, newtag))
1216 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001217 }
1218
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001219 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001220 i++;
1221 while (htmlStartClose[i] != NULL) {
1222 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001223 return (1);
1224 }
1225 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001226 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001227 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001228}
1229
1230/**
1231 * htmlAutoCloseOnClose:
1232 * @ctxt: an HTML parser context
1233 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001234 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001235 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001236 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001237 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001238static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001239htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1240{
1241 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001242 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001243
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001244 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001245
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001246 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001247
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001248 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1249 break;
1250 /*
1251 * A missplaced endtag can only close elements with lower
1252 * or equal priority, so if we find an element with higher
1253 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001254 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001255 */
1256 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1257 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001258 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001259 if (i < 0)
1260 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001261
1262 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001263 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001264 if ((info != NULL) && (info->endTag == 3)) {
1265 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1266 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001267 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001268 }
1269 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1270 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001271 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001272 }
1273}
1274
1275/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001276 * htmlAutoCloseOnEnd:
1277 * @ctxt: an HTML parser context
1278 *
1279 * Close all remaining tags at the end of the stream
1280 */
1281static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001282htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1283{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001284 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001285
William M. Brack899e64a2003-09-26 18:03:42 +00001286 if (ctxt->nameNr == 0)
1287 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001288 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001289 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1290 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001291 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001292 }
1293}
1294
1295/**
Owen Taylor3473f882001-02-23 17:55:21 +00001296 * htmlAutoClose:
1297 * @ctxt: an HTML parser context
1298 * @newtag: The new tag name or NULL
1299 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001300 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001301 * The list is kept in htmlStartClose array. This function is
1302 * called when a new tag has been detected and generates the
1303 * appropriates closes if possible/needed.
1304 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001305 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001306 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001307static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001308htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1309{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001310 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001311 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001312 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1313 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001314 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001315 }
1316 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001317 htmlAutoCloseOnEnd(ctxt);
1318 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001319 }
1320 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001321 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1322 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1323 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001324 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1325 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001326 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001327 }
Owen Taylor3473f882001-02-23 17:55:21 +00001328}
1329
1330/**
1331 * htmlAutoCloseTag:
1332 * @doc: the HTML document
1333 * @name: The tag name
1334 * @elem: the HTML element
1335 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001336 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001337 * The list is kept in htmlStartClose array. This function checks
1338 * if the element or one of it's children would autoclose the
1339 * given tag.
1340 *
1341 * Returns 1 if autoclose, 0 otherwise
1342 */
1343int
1344htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1345 htmlNodePtr child;
1346
1347 if (elem == NULL) return(1);
1348 if (xmlStrEqual(name, elem->name)) return(0);
1349 if (htmlCheckAutoClose(elem->name, name)) return(1);
1350 child = elem->children;
1351 while (child != NULL) {
1352 if (htmlAutoCloseTag(doc, name, child)) return(1);
1353 child = child->next;
1354 }
1355 return(0);
1356}
1357
1358/**
1359 * htmlIsAutoClosed:
1360 * @doc: the HTML document
1361 * @elem: the HTML element
1362 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001363 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001364 * The list is kept in htmlStartClose array. This function checks
1365 * if a tag is autoclosed by one of it's child
1366 *
1367 * Returns 1 if autoclosed, 0 otherwise
1368 */
1369int
1370htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1371 htmlNodePtr child;
1372
1373 if (elem == NULL) return(1);
1374 child = elem->children;
1375 while (child != NULL) {
1376 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1377 child = child->next;
1378 }
1379 return(0);
1380}
1381
1382/**
1383 * htmlCheckImplied:
1384 * @ctxt: an HTML parser context
1385 * @newtag: The new tag name
1386 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001387 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001388 * called when a new tag has been detected and generates the
1389 * appropriates implicit tags if missing
1390 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001391static void
Owen Taylor3473f882001-02-23 17:55:21 +00001392htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardb468f742009-08-24 18:45:33 +02001393 int i;
1394
Daniel Veillarde20fb5a2010-01-29 20:47:08 +01001395 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1396 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001397 if (!htmlOmittedDefaultValue)
1398 return;
1399 if (xmlStrEqual(newtag, BAD_CAST"html"))
1400 return;
1401 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001402 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001403 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1404 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1405 }
1406 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1407 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001408 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001409 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1410 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1411 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1412 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1413 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1414 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001415 if (ctxt->html >= 3) {
1416 /* we already saw or generated an <head> before */
1417 return;
1418 }
1419 /*
1420 * dropped OBJECT ... i you put it first BODY will be
1421 * assumed !
1422 */
1423 htmlnamePush(ctxt, BAD_CAST"head");
1424 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1425 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001426 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1427 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1428 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001429 if (ctxt->html >= 10) {
1430 /* we already saw or generated a <body> before */
1431 return;
1432 }
Owen Taylor3473f882001-02-23 17:55:21 +00001433 for (i = 0;i < ctxt->nameNr;i++) {
1434 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1435 return;
1436 }
1437 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1438 return;
1439 }
1440 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001441
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001442 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001443 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1444 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1445 }
1446}
1447
1448/**
1449 * htmlCheckParagraph
1450 * @ctxt: an HTML parser context
1451 *
1452 * Check whether a p element need to be implied before inserting
1453 * characters in the current element.
1454 *
1455 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1456 * in case of error.
1457 */
1458
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001459static int
Owen Taylor3473f882001-02-23 17:55:21 +00001460htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1461 const xmlChar *tag;
1462 int i;
1463
1464 if (ctxt == NULL)
1465 return(-1);
1466 tag = ctxt->name;
1467 if (tag == NULL) {
1468 htmlAutoClose(ctxt, BAD_CAST"p");
1469 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001470 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001471 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1472 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1473 return(1);
1474 }
1475 if (!htmlOmittedDefaultValue)
1476 return(0);
1477 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1478 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001479 htmlAutoClose(ctxt, BAD_CAST"p");
1480 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001481 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001482 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1483 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1484 return(1);
1485 }
1486 }
1487 return(0);
1488}
1489
1490/**
1491 * htmlIsScriptAttribute:
1492 * @name: an attribute name
1493 *
1494 * Check if an attribute is of content type Script
1495 *
1496 * Returns 1 is the attribute is a script 0 otherwise
1497 */
1498int
1499htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001500 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001501
1502 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001503 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001504 /*
1505 * all script attributes start with 'on'
1506 */
1507 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001508 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001509 for (i = 0;
1510 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1511 i++) {
1512 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1513 return(1);
1514 }
1515 return(0);
1516}
1517
1518/************************************************************************
1519 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001520 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001521 * *
1522 ************************************************************************/
1523
1524
/*
 * html40EntitiesTable:
 *
 * The predefined HTML 4.0 character entities. Each entry holds the
 * Unicode value of the character, the entity name and a descriptive
 * text.
 *
 * The entries must be kept sorted by increasing value:
 * htmlEntityValueLookup() gives up at the first entry whose value is
 * greater than the one searched.
 */
Daniel Veillard22090732001-07-16 00:06:07 +00001525static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001526/*
1527 * the 4 absolute ones, plus apostrophe.
1528 */
1529{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1530{ 38, "amp", "ampersand, U+0026 ISOnum" },
1531{ 39, "apos", "single quote" },
1532{ 60, "lt", "less-than sign, U+003C ISOnum" },
1533{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1534
1535/*
1536 * A bunch still in the 128-255 range
1537 * Replacing them depend really on the charset used.
1538 */
1539{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1540{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1541{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1542{ 163, "pound","pound sign, U+00A3 ISOnum" },
1543{ 164, "curren","currency sign, U+00A4 ISOnum" },
1544{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1545{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1546{ 167, "sect", "section sign, U+00A7 ISOnum" },
1547{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1548{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1549{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1550{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1551{ 172, "not", "not sign, U+00AC ISOnum" },
1552{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1553{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1554{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1555{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1556{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1557{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1558{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1559{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1560{ 181, "micro","micro sign, U+00B5 ISOnum" },
1561{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1562{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1563{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1564{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1565{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1566{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1567{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1568{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1569{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1570{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1571{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1572{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1573{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1574{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1575{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1576{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1577{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1578{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1579{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1580{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1581{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1582{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1583{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1584{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1585{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1586{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1587{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1588{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1589{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1590{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1591{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1592{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1593{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1594{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1595{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1596{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1597{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1598{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1599{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1600{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1601{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1602{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1603{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1604{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1605{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1606{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1607{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1608{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1609{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1610{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1611{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1612{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1613{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1614{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1615{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1616{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1617{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1618{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1619{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1620{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1621{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1622{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1623{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1624{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1625{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1626{ 247, "divide","division sign, U+00F7 ISOnum" },
1627{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1628{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1629{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1630{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1631{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1632{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1633{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1634{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1635
1636{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1637{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1638{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1639{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1640{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1641
1642/*
1643 * Anything below should really be kept as entities references
1644 */
1645{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1646
1647{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1648{ 732, "tilde","small tilde, U+02DC ISOdia" },
1649
1650{ 913, "Alpha","greek capital letter alpha, U+0391" },
1651{ 914, "Beta", "greek capital letter beta, U+0392" },
1652{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1653{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1654{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1655{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1656{ 919, "Eta", "greek capital letter eta, U+0397" },
1657{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1658{ 921, "Iota", "greek capital letter iota, U+0399" },
1659{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001660{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001661{ 924, "Mu", "greek capital letter mu, U+039C" },
1662{ 925, "Nu", "greek capital letter nu, U+039D" },
1663{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1664{ 927, "Omicron","greek capital letter omicron, U+039F" },
1665{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1666{ 929, "Rho", "greek capital letter rho, U+03A1" },
1667{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1668{ 932, "Tau", "greek capital letter tau, U+03A4" },
1669{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1670{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1671{ 935, "Chi", "greek capital letter chi, U+03A7" },
1672{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1673{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1674
1675{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1676{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1677{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1678{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1679{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1680{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1681{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1682{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1683{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1684{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1685{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1686{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1687{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1688{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1689{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1690{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1691{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1692{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1693{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1694{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1695{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1696{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1697{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1698{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1699{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1700{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1701{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1702{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1703
1704{ 8194, "ensp", "en space, U+2002 ISOpub" },
1705{ 8195, "emsp", "em space, U+2003 ISOpub" },
1706{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1707{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1708{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1709{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1710{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1711{ 8211, "ndash","en dash, U+2013 ISOpub" },
1712{ 8212, "mdash","em dash, U+2014 ISOpub" },
1713{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1714{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1715{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1716{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1717{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1718{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1719{ 8224, "dagger","dagger, U+2020 ISOpub" },
1720{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1721
1722{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1723{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1724
1725{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1726
1727{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1728{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1729
1730{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1731{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1732
1733{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1734{ 8260, "frasl","fraction slash, U+2044 NEW" },
1735
1736{ 8364, "euro", "euro sign, U+20AC NEW" },
1737
1738{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1739{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1740{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1741{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1742{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1743{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1744{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1745{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1746{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1747{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1748{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1749{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1750{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1751{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1752{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1753{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1754
1755{ 8704, "forall","for all, U+2200 ISOtech" },
1756{ 8706, "part", "partial differential, U+2202 ISOtech" },
1757{ 8707, "exist","there exists, U+2203 ISOtech" },
1758{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1759{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1760{ 8712, "isin", "element of, U+2208 ISOtech" },
1761{ 8713, "notin","not an element of, U+2209 ISOtech" },
1762{ 8715, "ni", "contains as member, U+220B ISOtech" },
1763{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001764{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001765{ 8722, "minus","minus sign, U+2212 ISOtech" },
1766{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1767{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1768{ 8733, "prop", "proportional to, U+221D ISOtech" },
1769{ 8734, "infin","infinity, U+221E ISOtech" },
1770{ 8736, "ang", "angle, U+2220 ISOamso" },
1771{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1772{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1773{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1774{ 8746, "cup", "union = cup, U+222A ISOtech" },
1775{ 8747, "int", "integral, U+222B ISOtech" },
1776{ 8756, "there4","therefore, U+2234 ISOtech" },
1777{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1778{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1779{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1780{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1781{ 8801, "equiv","identical to, U+2261 ISOtech" },
1782{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1783{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1784{ 8834, "sub", "subset of, U+2282 ISOtech" },
1785{ 8835, "sup", "superset of, U+2283 ISOtech" },
1786{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1787{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1788{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1789{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1790{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1791{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1792{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1793{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1794{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1795{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1796{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1797{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1798{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1799{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1800
1801{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1802{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1803{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1804{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1805
1806};
1807
1808/************************************************************************
1809 * *
1810 * Commodity functions to handle entities *
1811 * *
1812 ************************************************************************/
1813
/*
 * growBuffer:
 *
 * Macro doubling the size of the current buffer. It expects a variable
 * named <buffer>_size and a parser context named ctxt to be in scope
 * where it is expanded.
 * If the reallocation fails, the error is reported, the old buffer is
 * freed and the calling function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size = buffer##_size * 2;					\
    tmp = (xmlChar *) xmlRealloc(buffer,				\
                                 buffer##_size * sizeof(xmlChar));	\
    if (tmp != NULL) {							\
        buffer = tmp;							\
    } else {								\
        htmlErrMemory(ctxt, "growing buffer\n");			\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
}
1828
1829/**
1830 * htmlEntityLookup:
1831 * @name: the entity name
1832 *
1833 * Lookup the given entity in EntitiesTable
1834 *
1835 * TODO: the linear scan is really ugly, an hash table is really needed.
1836 *
1837 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1838 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001839const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001840htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001841 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001842
1843 for (i = 0;i < (sizeof(html40EntitiesTable)/
1844 sizeof(html40EntitiesTable[0]));i++) {
1845 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001846 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001847 }
1848 }
1849 return(NULL);
1850}
1851
1852/**
1853 * htmlEntityValueLookup:
1854 * @value: the entity's unicode value
1855 *
1856 * Lookup the given entity in EntitiesTable
1857 *
1858 * TODO: the linear scan is really ugly, an hash table is really needed.
1859 *
1860 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1861 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001862const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001863htmlEntityValueLookup(unsigned int value) {
1864 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001865
1866 for (i = 0;i < (sizeof(html40EntitiesTable)/
1867 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001868 if (html40EntitiesTable[i].value >= value) {
1869 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001870 break;
William M. Brack78637da2003-07-31 14:47:38 +00001871 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001872 }
Owen Taylor3473f882001-02-23 17:55:21 +00001873 }
1874 return(NULL);
1875}
1876
1877/**
1878 * UTF8ToHtml:
1879 * @out: a pointer to an array of bytes to store the result
1880 * @outlen: the length of @out
1881 * @in: a pointer to an array of UTF-8 chars
1882 * @inlen: the length of @in
1883 *
1884 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1885 * plus HTML entities block of chars out.
1886 *
1887 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1888 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001889 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001890 * The value of @outlen after return is the number of octets consumed.
1891 */
1892int
1893UTF8ToHtml(unsigned char* out, int *outlen,
1894 const unsigned char* in, int *inlen) {
1895 const unsigned char* processed = in;
1896 const unsigned char* outend;
1897 const unsigned char* outstart = out;
1898 const unsigned char* instart = in;
1899 const unsigned char* inend;
1900 unsigned int c, d;
1901 int trailing;
1902
Daniel Veillardce682bc2004-11-05 17:22:25 +00001903 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001904 if (in == NULL) {
1905 /*
1906 * initialization nothing to do
1907 */
1908 *outlen = 0;
1909 *inlen = 0;
1910 return(0);
1911 }
1912 inend = in + (*inlen);
1913 outend = out + (*outlen);
1914 while (in < inend) {
1915 d = *in++;
1916 if (d < 0x80) { c= d; trailing= 0; }
1917 else if (d < 0xC0) {
1918 /* trailing byte in leading position */
1919 *outlen = out - outstart;
1920 *inlen = processed - instart;
1921 return(-2);
1922 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1923 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1924 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1925 else {
1926 /* no chance for this in Ascii */
1927 *outlen = out - outstart;
1928 *inlen = processed - instart;
1929 return(-2);
1930 }
1931
1932 if (inend - in < trailing) {
1933 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02001934 }
Owen Taylor3473f882001-02-23 17:55:21 +00001935
1936 for ( ; trailing; trailing--) {
1937 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1938 break;
1939 c <<= 6;
1940 c |= d & 0x3F;
1941 }
1942
1943 /* assertion: c is a single UTF-4 value */
1944 if (c < 0x80) {
1945 if (out + 1 >= outend)
1946 break;
1947 *out++ = c;
1948 } else {
1949 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001950 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001951 const char *cp;
1952 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001953
1954 /*
1955 * Try to lookup a predefined HTML entity for it
1956 */
1957
1958 ent = htmlEntityValueLookup(c);
1959 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001960 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1961 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001962 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001963 else
1964 cp = ent->name;
1965 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001966 if (out + 2 + len >= outend)
1967 break;
1968 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001969 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001970 out += len;
1971 *out++ = ';';
1972 }
1973 processed = in;
1974 }
1975 *outlen = out - outstart;
1976 *inlen = processed - instart;
1977 return(0);
1978}
1979
1980/**
1981 * htmlEncodeEntities:
1982 * @out: a pointer to an array of bytes to store the result
1983 * @outlen: the length of @out
1984 * @in: a pointer to an array of UTF-8 chars
1985 * @inlen: the length of @in
1986 * @quoteChar: the quote character to escape (' or ") or zero.
1987 *
1988 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1989 * plus HTML entities block of chars out.
1990 *
1991 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1992 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001993 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001994 * The value of @outlen after return is the number of octets consumed.
1995 */
1996int
1997htmlEncodeEntities(unsigned char* out, int *outlen,
1998 const unsigned char* in, int *inlen, int quoteChar) {
1999 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002000 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00002001 const unsigned char* outstart = out;
2002 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002003 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00002004 unsigned int c, d;
2005 int trailing;
2006
Daniel Veillardce682bc2004-11-05 17:22:25 +00002007 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2008 return(-1);
2009 outend = out + (*outlen);
2010 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00002011 while (in < inend) {
2012 d = *in++;
2013 if (d < 0x80) { c= d; trailing= 0; }
2014 else if (d < 0xC0) {
2015 /* trailing byte in leading position */
2016 *outlen = out - outstart;
2017 *inlen = processed - instart;
2018 return(-2);
2019 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2020 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2021 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2022 else {
2023 /* no chance for this in Ascii */
2024 *outlen = out - outstart;
2025 *inlen = processed - instart;
2026 return(-2);
2027 }
2028
2029 if (inend - in < trailing)
2030 break;
2031
2032 while (trailing--) {
2033 if (((d= *in++) & 0xC0) != 0x80) {
2034 *outlen = out - outstart;
2035 *inlen = processed - instart;
2036 return(-2);
2037 }
2038 c <<= 6;
2039 c |= d & 0x3F;
2040 }
2041
2042 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002043 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2044 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002045 if (out >= outend)
2046 break;
2047 *out++ = c;
2048 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002049 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002050 const char *cp;
2051 char nbuf[16];
2052 int len;
2053
2054 /*
2055 * Try to lookup a predefined HTML entity for it
2056 */
2057 ent = htmlEntityValueLookup(c);
2058 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002059 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002060 cp = nbuf;
2061 }
2062 else
2063 cp = ent->name;
2064 len = strlen(cp);
2065 if (out + 2 + len > outend)
2066 break;
2067 *out++ = '&';
2068 memcpy(out, cp, len);
2069 out += len;
2070 *out++ = ';';
2071 }
2072 processed = in;
2073 }
2074 *outlen = out - outstart;
2075 *inlen = processed - instart;
2076 return(0);
2077}
2078
Owen Taylor3473f882001-02-23 17:55:21 +00002079/************************************************************************
2080 * *
2081 * Commodity functions to handle streams *
2082 * *
2083 ************************************************************************/
2084
2085/**
Owen Taylor3473f882001-02-23 17:55:21 +00002086 * htmlNewInputStream:
2087 * @ctxt: an HTML parser context
2088 *
2089 * Create a new input stream structure
2090 * Returns the new input stream or NULL
2091 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002092static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002093htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2094 htmlParserInputPtr input;
2095
2096 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2097 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002098 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002099 return(NULL);
2100 }
2101 memset(input, 0, sizeof(htmlParserInput));
2102 input->filename = NULL;
2103 input->directory = NULL;
2104 input->base = NULL;
2105 input->cur = NULL;
2106 input->buf = NULL;
2107 input->line = 1;
2108 input->col = 1;
2109 input->buf = NULL;
2110 input->free = NULL;
2111 input->version = NULL;
2112 input->consumed = 0;
2113 input->length = 0;
2114 return(input);
2115}
2116
2117
2118/************************************************************************
2119 * *
2120 * Commodity functions, cleanup needed ? *
2121 * *
2122 ************************************************************************/
/*
 * All tags allowing PCDATA according to the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 * Entries are kept in alphabetical order, grouped by first letter.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002137
2138/**
2139 * areBlanks:
2140 * @ctxt: an HTML parser context
2141 * @str: a xmlChar *
2142 * @len: the size of @str
2143 *
2144 * Is this a sequence of blank chars that one can ignore ?
2145 *
2146 * Returns 1 if ignorable 0 otherwise.
2147 */
2148
2149static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002150 unsigned int i;
2151 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002152 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002153 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002154
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002155 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002156 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002157
2158 if (CUR == 0) return(1);
2159 if (CUR != '<') return(0);
2160 if (ctxt->name == NULL)
2161 return(1);
2162 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2163 return(1);
2164 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2165 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002166
2167 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2168 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2169 dtd = xmlGetIntSubset(ctxt->myDoc);
2170 if (dtd != NULL && dtd->ExternalID != NULL) {
2171 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2172 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2173 return(1);
2174 }
2175 }
2176
Owen Taylor3473f882001-02-23 17:55:21 +00002177 if (ctxt->node == NULL) return(0);
2178 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002179 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2180 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002181 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002182 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2183 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002184 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002185 for all tags "b" allowing PCDATA */
2186 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2187 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2188 return(0);
2189 }
2190 }
Owen Taylor3473f882001-02-23 17:55:21 +00002191 } else if (xmlNodeIsText(lastChild)) {
2192 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002193 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002194 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002195 for all tags "p" allowing PCDATA */
2196 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2197 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2198 return(0);
2199 }
2200 }
Owen Taylor3473f882001-02-23 17:55:21 +00002201 }
2202 return(1);
2203}
2204
2205/**
Owen Taylor3473f882001-02-23 17:55:21 +00002206 * htmlNewDocNoDtD:
2207 * @URI: URI for the dtd, or NULL
2208 * @ExternalID: the external ID of the DTD, or NULL
2209 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002210 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2211 * are NULL
2212 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002213 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002214 */
2215htmlDocPtr
2216htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2217 xmlDocPtr cur;
2218
2219 /*
2220 * Allocate a new document and fill the fields.
2221 */
2222 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2223 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002224 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002225 return(NULL);
2226 }
2227 memset(cur, 0, sizeof(xmlDoc));
2228
2229 cur->type = XML_HTML_DOCUMENT_NODE;
2230 cur->version = NULL;
2231 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002232 cur->doc = cur;
2233 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002234 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002235 cur->extSubset = NULL;
2236 cur->oldNs = NULL;
2237 cur->encoding = NULL;
2238 cur->standalone = 1;
2239 cur->compression = 0;
2240 cur->ids = NULL;
2241 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002242 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002243 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002244 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002245 if ((ExternalID != NULL) ||
2246 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002247 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002248 return(cur);
2249}
2250
2251/**
2252 * htmlNewDoc:
2253 * @URI: URI for the dtd, or NULL
2254 * @ExternalID: the external ID of the DTD, or NULL
2255 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002256 * Creates a new HTML document
2257 *
Owen Taylor3473f882001-02-23 17:55:21 +00002258 * Returns a new document
2259 */
2260htmlDocPtr
2261htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2262 if ((URI == NULL) && (ExternalID == NULL))
2263 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002264 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2265 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002266
2267 return(htmlNewDocNoDtD(URI, ExternalID));
2268}
2269
2270
2271/************************************************************************
2272 * *
2273 * The parser itself *
2274 * Relates to http://www.w3.org/TR/html40 *
2275 * *
2276 ************************************************************************/
2277
2278/************************************************************************
2279 * *
2280 * The parser itself *
2281 * *
2282 ************************************************************************/
2283
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002284static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002285
Owen Taylor3473f882001-02-23 17:55:21 +00002286/**
2287 * htmlParseHTMLName:
2288 * @ctxt: an HTML parser context
2289 *
2290 * parse an HTML tag or attribute name, note that we convert it to lowercase
2291 * since HTML names are not case-sensitive.
2292 *
2293 * Returns the Tag Name parsed or NULL
2294 */
2295
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002296static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002297htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002298 int i = 0;
2299 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2300
William M. Brackd1757ab2004-10-02 22:07:48 +00002301 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002302 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002303
2304 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002305 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002306 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2307 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002308 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2309 else loc[i] = CUR;
2310 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002311
Owen Taylor3473f882001-02-23 17:55:21 +00002312 NEXT;
2313 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002314
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002315 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002316}
2317
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002318
2319/**
2320 * htmlParseHTMLName_nonInvasive:
2321 * @ctxt: an HTML parser context
2322 *
2323 * parse an HTML tag or attribute name, note that we convert it to lowercase
2324 * since HTML names are not case-sensitive, this doesn't consume the data
2325 * from the stream, it's a look-ahead
2326 *
2327 * Returns the Tag Name parsed or NULL
2328 */
2329
2330static const xmlChar *
2331htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2332 int i = 0;
2333 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2334
2335 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2336 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002337
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002338 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2339 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2340 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2341 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2342 else loc[i] = NXT(1+i);
2343 i++;
2344 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002345
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002346 return(xmlDictLookup(ctxt->dict, loc, i));
2347}
2348
2349
Owen Taylor3473f882001-02-23 17:55:21 +00002350/**
2351 * htmlParseName:
2352 * @ctxt: an HTML parser context
2353 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002354 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002355 *
2356 * Returns the Name parsed or NULL
2357 */
2358
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002359static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002360htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002361 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002362 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002363 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002364
2365 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002366
2367 /*
2368 * Accelerator for simple ASCII names
2369 */
2370 in = ctxt->input->cur;
2371 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2372 ((*in >= 0x41) && (*in <= 0x5A)) ||
2373 (*in == '_') || (*in == ':')) {
2374 in++;
2375 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2376 ((*in >= 0x41) && (*in <= 0x5A)) ||
2377 ((*in >= 0x30) && (*in <= 0x39)) ||
2378 (*in == '_') || (*in == '-') ||
2379 (*in == ':') || (*in == '.'))
2380 in++;
2381 if ((*in > 0) && (*in < 0x80)) {
2382 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002383 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002384 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002385 ctxt->nbChars += count;
2386 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002387 return(ret);
2388 }
2389 }
2390 return(htmlParseNameComplex(ctxt));
2391}
2392
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002393static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002394htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002395 int len = 0, l;
2396 int c;
2397 int count = 0;
2398
2399 /*
2400 * Handler for more complex cases
2401 */
2402 GROW;
2403 c = CUR_CHAR(l);
2404 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2405 (!IS_LETTER(c) && (c != '_') &&
2406 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002407 return(NULL);
2408 }
2409
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002410 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2411 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2412 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002413 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002414 (IS_COMBINING(c)) ||
2415 (IS_EXTENDER(c)))) {
2416 if (count++ > 100) {
2417 count = 0;
2418 GROW;
2419 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002420 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002421 NEXTL(l);
2422 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002423 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002424 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002425}
2426
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002427
Owen Taylor3473f882001-02-23 17:55:21 +00002428/**
2429 * htmlParseHTMLAttribute:
2430 * @ctxt: an HTML parser context
2431 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002432 *
Owen Taylor3473f882001-02-23 17:55:21 +00002433 * parse an HTML attribute value till the stop (quote), if
2434 * stop is 0 then it stops at the first space
2435 *
2436 * Returns the attribute parsed or NULL
2437 */
2438
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002439static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002440htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2441 xmlChar *buffer = NULL;
2442 int buffer_size = 0;
2443 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002444 const xmlChar *name = NULL;
2445 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002446 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002447
2448 /*
2449 * allocate a translation buffer.
2450 */
2451 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002452 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002453 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002454 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002455 return(NULL);
2456 }
2457 out = buffer;
2458
2459 /*
2460 * Ok loop until we reach one of the ending chars
2461 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002462 while ((CUR != 0) && (CUR != stop)) {
2463 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002464 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002465 if (CUR == '&') {
2466 if (NXT(1) == '#') {
2467 unsigned int c;
2468 int bits;
2469
2470 c = htmlParseCharRef(ctxt);
2471 if (c < 0x80)
2472 { *out++ = c; bits= -6; }
2473 else if (c < 0x800)
2474 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2475 else if (c < 0x10000)
2476 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002477 else
Owen Taylor3473f882001-02-23 17:55:21 +00002478 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002479
Owen Taylor3473f882001-02-23 17:55:21 +00002480 for ( ; bits >= 0; bits-= 6) {
2481 *out++ = ((c >> bits) & 0x3F) | 0x80;
2482 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002483
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002484 if (out - buffer > buffer_size - 100) {
2485 int indx = out - buffer;
2486
2487 growBuffer(buffer);
2488 out = &buffer[indx];
2489 }
Owen Taylor3473f882001-02-23 17:55:21 +00002490 } else {
2491 ent = htmlParseEntityRef(ctxt, &name);
2492 if (name == NULL) {
2493 *out++ = '&';
2494 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002495 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002496
2497 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002498 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002499 }
2500 } else if (ent == NULL) {
2501 *out++ = '&';
2502 cur = name;
2503 while (*cur != 0) {
2504 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002505 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002506
2507 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002508 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002509 }
2510 *out++ = *cur++;
2511 }
Owen Taylor3473f882001-02-23 17:55:21 +00002512 } else {
2513 unsigned int c;
2514 int bits;
2515
2516 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002517 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002518
2519 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002520 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002521 }
Daniel Veillard48519092006-10-17 15:56:35 +00002522 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002523 if (c < 0x80)
2524 { *out++ = c; bits= -6; }
2525 else if (c < 0x800)
2526 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2527 else if (c < 0x10000)
2528 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002529 else
Owen Taylor3473f882001-02-23 17:55:21 +00002530 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002531
Owen Taylor3473f882001-02-23 17:55:21 +00002532 for ( ; bits >= 0; bits-= 6) {
2533 *out++ = ((c >> bits) & 0x3F) | 0x80;
2534 }
Owen Taylor3473f882001-02-23 17:55:21 +00002535 }
2536 }
2537 } else {
2538 unsigned int c;
2539 int bits, l;
2540
2541 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002542 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002543
2544 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002545 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002546 }
2547 c = CUR_CHAR(l);
2548 if (c < 0x80)
2549 { *out++ = c; bits= -6; }
2550 else if (c < 0x800)
2551 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2552 else if (c < 0x10000)
2553 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002554 else
Owen Taylor3473f882001-02-23 17:55:21 +00002555 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002556
Owen Taylor3473f882001-02-23 17:55:21 +00002557 for ( ; bits >= 0; bits-= 6) {
2558 *out++ = ((c >> bits) & 0x3F) | 0x80;
2559 }
2560 NEXT;
2561 }
2562 }
Daniel Veillard13cee4e2009-09-05 14:52:55 +02002563 *out = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002564 return(buffer);
2565}
2566
2567/**
Owen Taylor3473f882001-02-23 17:55:21 +00002568 * htmlParseEntityRef:
2569 * @ctxt: an HTML parser context
2570 * @str: location to store the entity name
2571 *
2572 * parse an HTML ENTITY references
2573 *
2574 * [68] EntityRef ::= '&' Name ';'
2575 *
2576 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2577 * if non-NULL *str will have to be freed by the caller.
2578 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002579const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002580htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2581 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002582 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002583
2584 if (str != NULL) *str = NULL;
2585 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002586
2587 if (CUR == '&') {
2588 NEXT;
2589 name = htmlParseName(ctxt);
2590 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002591 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2592 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002593 } else {
2594 GROW;
2595 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002596 if (str != NULL)
2597 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002598
2599 /*
2600 * Lookup the entity in the table.
2601 */
2602 ent = htmlEntityLookup(name);
2603 if (ent != NULL) /* OK that's ugly !!! */
2604 NEXT;
2605 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002606 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2607 "htmlParseEntityRef: expecting ';'\n",
2608 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002609 if (str != NULL)
2610 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002611 }
2612 }
2613 }
2614 return(ent);
2615}
2616
2617/**
2618 * htmlParseAttValue:
2619 * @ctxt: an HTML parser context
2620 *
2621 * parse a value for an attribute
2622 * Note: the parser won't do substitution of entities here, this
2623 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002624 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002625 *
2626 * Returns the AttValue parsed or NULL.
2627 */
2628
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002629static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002630htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2631 xmlChar *ret = NULL;
2632
2633 if (CUR == '"') {
2634 NEXT;
2635 ret = htmlParseHTMLAttribute(ctxt, '"');
2636 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002637 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2638 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002639 } else
2640 NEXT;
2641 } else if (CUR == '\'') {
2642 NEXT;
2643 ret = htmlParseHTMLAttribute(ctxt, '\'');
2644 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002645 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2646 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002647 } else
2648 NEXT;
2649 } else {
2650 /*
2651 * That's an HTMLism, the attribute value may not be quoted
2652 */
2653 ret = htmlParseHTMLAttribute(ctxt, 0);
2654 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002655 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2656 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002657 }
2658 }
2659 return(ret);
2660}
2661
2662/**
2663 * htmlParseSystemLiteral:
2664 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002665 *
Owen Taylor3473f882001-02-23 17:55:21 +00002666 * parse an HTML Literal
2667 *
2668 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2669 *
2670 * Returns the SystemLiteral parsed or NULL
2671 */
2672
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002673static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002674htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2675 const xmlChar *q;
2676 xmlChar *ret = NULL;
2677
2678 if (CUR == '"') {
2679 NEXT;
2680 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002681 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002682 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002683 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002684 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2685 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002686 } else {
2687 ret = xmlStrndup(q, CUR_PTR - q);
2688 NEXT;
2689 }
2690 } else if (CUR == '\'') {
2691 NEXT;
2692 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002693 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002694 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002695 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002696 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2697 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002698 } else {
2699 ret = xmlStrndup(q, CUR_PTR - q);
2700 NEXT;
2701 }
2702 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002703 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2704 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002705 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002706
Owen Taylor3473f882001-02-23 17:55:21 +00002707 return(ret);
2708}
2709
2710/**
2711 * htmlParsePubidLiteral:
2712 * @ctxt: an HTML parser context
2713 *
2714 * parse an HTML public literal
2715 *
2716 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2717 *
2718 * Returns the PubidLiteral parsed or NULL.
2719 */
2720
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002721static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002722htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2723 const xmlChar *q;
2724 xmlChar *ret = NULL;
2725 /*
2726 * Name ::= (Letter | '_') (NameChar)*
2727 */
2728 if (CUR == '"') {
2729 NEXT;
2730 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002731 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002732 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002733 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2734 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002735 } else {
2736 ret = xmlStrndup(q, CUR_PTR - q);
2737 NEXT;
2738 }
2739 } else if (CUR == '\'') {
2740 NEXT;
2741 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002742 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002743 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002744 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002745 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2746 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002747 } else {
2748 ret = xmlStrndup(q, CUR_PTR - q);
2749 NEXT;
2750 }
2751 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002752 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2753 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002754 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002755
Owen Taylor3473f882001-02-23 17:55:21 +00002756 return(ret);
2757}
2758
2759/**
2760 * htmlParseScript:
2761 * @ctxt: an HTML parser context
2762 *
2763 * parse the content of an HTML SCRIPT or STYLE element
2764 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2765 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2766 * http://www.w3.org/TR/html4/types.html#type-script
2767 * http://www.w3.org/TR/html4/types.html#h-6.15
2768 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2769 *
2770 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2771 * element and the value of intrinsic event attributes. User agents must
2772 * not evaluate script data as HTML markup but instead must pass it on as
2773 * data to a script engine.
2774 * NOTES:
2775 * - The content is passed like CDATA
2776 * - the attributes for style and scripting "onXXX" are also described
2777 * as CDATA but SGML allows entities references in attributes so their
2778 * processing is identical as other attributes
2779 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002780static void
Owen Taylor3473f882001-02-23 17:55:21 +00002781htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002782 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002783 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002784 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002785
2786 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002787 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002788 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002789 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002790 /*
2791 * One should break here, the specification is clear:
2792 * Authors should therefore escape "</" within the content.
2793 * Escape mechanisms are specific to each scripting or
2794 * style sheet language.
2795 *
2796 * In recovery mode, only break if end tag match the
2797 * current tag, effectively ignoring all tags inside the
2798 * script/style block and treating the entire block as
2799 * CDATA.
2800 */
2801 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002802 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2803 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002804 {
2805 break; /* while */
2806 } else {
2807 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002808 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002809 ctxt->name, NULL);
2810 }
2811 } else {
2812 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002813 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002814 {
2815 break; /* while */
2816 }
2817 }
Owen Taylor3473f882001-02-23 17:55:21 +00002818 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002819 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002820 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2821 if (ctxt->sax->cdataBlock!= NULL) {
2822 /*
2823 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2824 */
2825 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002826 } else if (ctxt->sax->characters != NULL) {
2827 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002828 }
2829 nbchar = 0;
2830 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002831 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002832 NEXTL(l);
2833 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002834 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002835
Daniel Veillard68716a72006-10-16 09:32:17 +00002836 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002837 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2838 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002839 NEXT;
2840 }
2841
2842 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2843 if (ctxt->sax->cdataBlock!= NULL) {
2844 /*
2845 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2846 */
2847 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002848 } else if (ctxt->sax->characters != NULL) {
2849 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002850 }
2851 }
2852}
2853
2854
2855/**
2856 * htmlParseCharData:
2857 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002858 *
2859 * parse a CharData section.
2860 * if we are within a CDATA section ']]>' marks an end of section.
2861 *
2862 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2863 */
2864
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002865static void
2866htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002867 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2868 int nbchar = 0;
2869 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002870 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002871
2872 SHRINK;
2873 cur = CUR_CHAR(l);
2874 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002875 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002876 (cur != 0)) {
2877 if (!(IS_CHAR(cur))) {
2878 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2879 "Invalid char in CDATA 0x%X\n", cur);
2880 } else {
2881 COPY_BUF(l,buf,nbchar,cur);
2882 }
Owen Taylor3473f882001-02-23 17:55:21 +00002883 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2884 /*
2885 * Ok the segment is to be consumed as chars.
2886 */
2887 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2888 if (areBlanks(ctxt, buf, nbchar)) {
2889 if (ctxt->sax->ignorableWhitespace != NULL)
2890 ctxt->sax->ignorableWhitespace(ctxt->userData,
2891 buf, nbchar);
2892 } else {
2893 htmlCheckParagraph(ctxt);
2894 if (ctxt->sax->characters != NULL)
2895 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2896 }
2897 }
2898 nbchar = 0;
2899 }
2900 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002901 chunk++;
2902 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2903 chunk = 0;
2904 SHRINK;
2905 GROW;
2906 }
Owen Taylor3473f882001-02-23 17:55:21 +00002907 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002908 if (cur == 0) {
2909 SHRINK;
2910 GROW;
2911 cur = CUR_CHAR(l);
2912 }
Owen Taylor3473f882001-02-23 17:55:21 +00002913 }
2914 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002915 buf[nbchar] = 0;
2916
Owen Taylor3473f882001-02-23 17:55:21 +00002917 /*
2918 * Ok the segment is to be consumed as chars.
2919 */
2920 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2921 if (areBlanks(ctxt, buf, nbchar)) {
2922 if (ctxt->sax->ignorableWhitespace != NULL)
2923 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2924 } else {
2925 htmlCheckParagraph(ctxt);
2926 if (ctxt->sax->characters != NULL)
2927 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2928 }
2929 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002930 } else {
2931 /*
2932 * Loop detection
2933 */
2934 if (cur == 0)
2935 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002936 }
2937}
2938
2939/**
2940 * htmlParseExternalID:
2941 * @ctxt: an HTML parser context
2942 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002943 *
2944 * Parse an External ID or a Public ID
2945 *
Owen Taylor3473f882001-02-23 17:55:21 +00002946 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2947 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2948 *
2949 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2950 *
2951 * Returns the function returns SystemLiteral and in the second
2952 * case publicID receives PubidLiteral, is strict is off
2953 * it is possible to return NULL and have publicID set.
2954 */
2955
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002956static xmlChar *
2957htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002958 xmlChar *URI = NULL;
2959
2960 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2961 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2962 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2963 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002964 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002965 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2966 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002967 }
2968 SKIP_BLANKS;
2969 URI = htmlParseSystemLiteral(ctxt);
2970 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002971 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2972 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002973 }
2974 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2975 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2976 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2977 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002978 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002979 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2980 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002981 }
2982 SKIP_BLANKS;
2983 *publicID = htmlParsePubidLiteral(ctxt);
2984 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002985 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2986 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2987 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002988 }
2989 SKIP_BLANKS;
2990 if ((CUR == '"') || (CUR == '\'')) {
2991 URI = htmlParseSystemLiteral(ctxt);
2992 }
2993 }
2994 return(URI);
2995}
2996
2997/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002998 * xmlParsePI:
2999 * @ctxt: an XML parser context
3000 *
3001 * parse an XML Processing Instruction.
3002 *
3003 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3004 */
3005static void
3006htmlParsePI(htmlParserCtxtPtr ctxt) {
3007 xmlChar *buf = NULL;
3008 int len = 0;
3009 int size = HTML_PARSER_BUFFER_SIZE;
3010 int cur, l;
3011 const xmlChar *target;
3012 xmlParserInputState state;
3013 int count = 0;
3014
3015 if ((RAW == '<') && (NXT(1) == '?')) {
3016 state = ctxt->instate;
3017 ctxt->instate = XML_PARSER_PI;
3018 /*
3019 * this is a Processing Instruction.
3020 */
3021 SKIP(2);
3022 SHRINK;
3023
3024 /*
3025 * Parse the target name and check for special support like
3026 * namespace.
3027 */
3028 target = htmlParseName(ctxt);
3029 if (target != NULL) {
3030 if (RAW == '>') {
3031 SKIP(1);
3032
3033 /*
3034 * SAX: PI detected.
3035 */
3036 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3037 (ctxt->sax->processingInstruction != NULL))
3038 ctxt->sax->processingInstruction(ctxt->userData,
3039 target, NULL);
3040 ctxt->instate = state;
3041 return;
3042 }
3043 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3044 if (buf == NULL) {
3045 htmlErrMemory(ctxt, NULL);
3046 ctxt->instate = state;
3047 return;
3048 }
3049 cur = CUR;
3050 if (!IS_BLANK(cur)) {
3051 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3052 "ParsePI: PI %s space expected\n", target, NULL);
3053 }
3054 SKIP_BLANKS;
3055 cur = CUR_CHAR(l);
3056 while (IS_CHAR(cur) && (cur != '>')) {
3057 if (len + 5 >= size) {
3058 xmlChar *tmp;
3059
3060 size *= 2;
3061 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3062 if (tmp == NULL) {
3063 htmlErrMemory(ctxt, NULL);
3064 xmlFree(buf);
3065 ctxt->instate = state;
3066 return;
3067 }
3068 buf = tmp;
3069 }
3070 count++;
3071 if (count > 50) {
3072 GROW;
3073 count = 0;
3074 }
3075 COPY_BUF(l,buf,len,cur);
3076 NEXTL(l);
3077 cur = CUR_CHAR(l);
3078 if (cur == 0) {
3079 SHRINK;
3080 GROW;
3081 cur = CUR_CHAR(l);
3082 }
3083 }
3084 buf[len] = 0;
3085 if (cur != '>') {
3086 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3087 "ParsePI: PI %s never end ...\n", target, NULL);
3088 } else {
3089 SKIP(1);
3090
3091 /*
3092 * SAX: PI detected.
3093 */
3094 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3095 (ctxt->sax->processingInstruction != NULL))
3096 ctxt->sax->processingInstruction(ctxt->userData,
3097 target, buf);
3098 }
3099 xmlFree(buf);
3100 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003101 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003102 "PI is not started correctly", NULL, NULL);
3103 }
3104 ctxt->instate = state;
3105 }
3106}
3107
3108/**
Owen Taylor3473f882001-02-23 17:55:21 +00003109 * htmlParseComment:
3110 * @ctxt: an HTML parser context
3111 *
3112 * Parse an XML (SGML) comment <!-- .... -->
3113 *
3114 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3115 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003116static void
Owen Taylor3473f882001-02-23 17:55:21 +00003117htmlParseComment(htmlParserCtxtPtr ctxt) {
3118 xmlChar *buf = NULL;
3119 int len;
3120 int size = HTML_PARSER_BUFFER_SIZE;
3121 int q, ql;
3122 int r, rl;
3123 int cur, l;
3124 xmlParserInputState state;
3125
3126 /*
3127 * Check that there is a comment right here.
3128 */
3129 if ((RAW != '<') || (NXT(1) != '!') ||
3130 (NXT(2) != '-') || (NXT(3) != '-')) return;
3131
3132 state = ctxt->instate;
3133 ctxt->instate = XML_PARSER_COMMENT;
3134 SHRINK;
3135 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003136 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003137 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003138 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003139 ctxt->instate = state;
3140 return;
3141 }
3142 q = CUR_CHAR(ql);
3143 NEXTL(ql);
3144 r = CUR_CHAR(rl);
3145 NEXTL(rl);
3146 cur = CUR_CHAR(l);
3147 len = 0;
3148 while (IS_CHAR(cur) &&
3149 ((cur != '>') ||
3150 (r != '-') || (q != '-'))) {
3151 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003152 xmlChar *tmp;
3153
Owen Taylor3473f882001-02-23 17:55:21 +00003154 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003155 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3156 if (tmp == NULL) {
3157 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003158 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003159 ctxt->instate = state;
3160 return;
3161 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003162 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003163 }
3164 COPY_BUF(ql,buf,len,q);
3165 q = r;
3166 ql = rl;
3167 r = cur;
3168 rl = l;
3169 NEXTL(l);
3170 cur = CUR_CHAR(l);
3171 if (cur == 0) {
3172 SHRINK;
3173 GROW;
3174 cur = CUR_CHAR(l);
3175 }
3176 }
3177 buf[len] = 0;
3178 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003179 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3180 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003181 xmlFree(buf);
3182 } else {
3183 NEXT;
3184 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3185 (!ctxt->disableSAX))
3186 ctxt->sax->comment(ctxt->userData, buf);
3187 xmlFree(buf);
3188 }
3189 ctxt->instate = state;
3190}
3191
3192/**
3193 * htmlParseCharRef:
3194 * @ctxt: an HTML parser context
3195 *
3196 * parse Reference declarations
3197 *
3198 * [66] CharRef ::= '&#' [0-9]+ ';' |
3199 * '&#x' [0-9a-fA-F]+ ';'
3200 *
3201 * Returns the value parsed (as an int)
3202 */
3203int
3204htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3205 int val = 0;
3206
Daniel Veillarda03e3652004-11-02 18:45:30 +00003207 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3208 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3209 "htmlParseCharRef: context error\n",
3210 NULL, NULL);
3211 return(0);
3212 }
Owen Taylor3473f882001-02-23 17:55:21 +00003213 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003214 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003215 SKIP(3);
3216 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003217 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003218 val = val * 16 + (CUR - '0');
3219 else if ((CUR >= 'a') && (CUR <= 'f'))
3220 val = val * 16 + (CUR - 'a') + 10;
3221 else if ((CUR >= 'A') && (CUR <= 'F'))
3222 val = val * 16 + (CUR - 'A') + 10;
3223 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003224 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003225 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003226 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003227 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003228 }
3229 NEXT;
3230 }
3231 if (CUR == ';')
3232 NEXT;
3233 } else if ((CUR == '&') && (NXT(1) == '#')) {
3234 SKIP(2);
3235 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003236 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003237 val = val * 10 + (CUR - '0');
3238 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003239 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003240 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003241 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003242 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003243 }
3244 NEXT;
3245 }
3246 if (CUR == ';')
3247 NEXT;
3248 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003249 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3250 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003251 }
3252 /*
3253 * Check the value IS_CHAR ...
3254 */
3255 if (IS_CHAR(val)) {
3256 return(val);
3257 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003258 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3259 "htmlParseCharRef: invalid xmlChar value %d\n",
3260 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003261 }
3262 return(0);
3263}
3264
3265
3266/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003267 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003268 * @ctxt: an HTML parser context
3269 *
3270 * parse a DOCTYPE declaration
3271 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003272 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003273 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3274 */
3275
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003276static void
Owen Taylor3473f882001-02-23 17:55:21 +00003277htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003278 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003279 xmlChar *ExternalID = NULL;
3280 xmlChar *URI = NULL;
3281
3282 /*
3283 * We know that '<!DOCTYPE' has been detected.
3284 */
3285 SKIP(9);
3286
3287 SKIP_BLANKS;
3288
3289 /*
3290 * Parse the DOCTYPE name.
3291 */
3292 name = htmlParseName(ctxt);
3293 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003294 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3295 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3296 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003297 }
3298 /*
3299 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3300 */
3301
3302 SKIP_BLANKS;
3303
3304 /*
3305 * Check for SystemID and ExternalID
3306 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003307 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003308 SKIP_BLANKS;
3309
3310 /*
3311 * We should be at the end of the DOCTYPE declaration.
3312 */
3313 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003314 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3315 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003316 /* We shouldn't try to resynchronize ... */
3317 }
3318 NEXT;
3319
3320 /*
3321 * Create or update the document accordingly to the DOCTYPE
3322 */
3323 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3324 (!ctxt->disableSAX))
3325 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3326
3327 /*
3328 * Cleanup, since we don't use all those identifiers
3329 */
3330 if (URI != NULL) xmlFree(URI);
3331 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003332}
3333
3334/**
3335 * htmlParseAttribute:
3336 * @ctxt: an HTML parser context
3337 * @value: a xmlChar ** used to store the value of the attribute
3338 *
3339 * parse an attribute
3340 *
3341 * [41] Attribute ::= Name Eq AttValue
3342 *
3343 * [25] Eq ::= S? '=' S?
3344 *
3345 * With namespace:
3346 *
3347 * [NS 11] Attribute ::= QName Eq AttValue
3348 *
3349 * Also the case QName == xmlns:??? is handled independently as a namespace
3350 * definition.
3351 *
3352 * Returns the attribute name, and the value in *value.
3353 */
3354
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003355static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003356htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003357 const xmlChar *name;
3358 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003359
3360 *value = NULL;
3361 name = htmlParseHTMLName(ctxt);
3362 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003363 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3364 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003365 return(NULL);
3366 }
3367
3368 /*
3369 * read the value
3370 */
3371 SKIP_BLANKS;
3372 if (CUR == '=') {
3373 NEXT;
3374 SKIP_BLANKS;
3375 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003376 } else if (htmlIsBooleanAttr(name)) {
3377 /*
3378 * assume a minimized attribute
3379 */
3380 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003381 }
3382
3383 *value = val;
3384 return(name);
3385}
3386
3387/**
3388 * htmlCheckEncoding:
3389 * @ctxt: an HTML parser context
3390 * @attvalue: the attribute value
3391 *
3392 * Checks an http-equiv attribute from a Meta tag to detect
3393 * the encoding
3394 * If a new encoding is detected the parser is switched to decode
3395 * it and pass UTF8
3396 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003397static void
Owen Taylor3473f882001-02-23 17:55:21 +00003398htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3399 const xmlChar *encoding;
3400
3401 if ((ctxt == NULL) || (attvalue == NULL))
3402 return;
3403
Daniel Veillarde77db162009-08-22 11:32:38 +02003404 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003405 if (ctxt->input->encoding != NULL)
3406 return;
3407
3408 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3409 if (encoding != NULL) {
3410 encoding += 8;
3411 } else {
3412 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3413 if (encoding != NULL)
3414 encoding += 9;
3415 }
3416 if (encoding != NULL) {
3417 xmlCharEncoding enc;
3418 xmlCharEncodingHandlerPtr handler;
3419
3420 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3421
3422 if (ctxt->input->encoding != NULL)
3423 xmlFree((xmlChar *) ctxt->input->encoding);
3424 ctxt->input->encoding = xmlStrdup(encoding);
3425
3426 enc = xmlParseCharEncoding((const char *) encoding);
3427 /*
3428 * registered set of known encodings
3429 */
3430 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003431 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003432 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3433 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3434 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3435 (ctxt->input->buf != NULL) &&
3436 (ctxt->input->buf->encoder == NULL)) {
3437 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3438 "htmlCheckEncoding: wrong encoding meta\n",
3439 NULL, NULL);
3440 } else {
3441 xmlSwitchEncoding(ctxt, enc);
3442 }
Owen Taylor3473f882001-02-23 17:55:21 +00003443 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3444 } else {
3445 /*
3446 * fallback for unknown encodings
3447 */
3448 handler = xmlFindCharEncodingHandler((const char *) encoding);
3449 if (handler != NULL) {
3450 xmlSwitchToEncoding(ctxt, handler);
3451 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3452 } else {
3453 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3454 }
3455 }
3456
3457 if ((ctxt->input->buf != NULL) &&
3458 (ctxt->input->buf->encoder != NULL) &&
3459 (ctxt->input->buf->raw != NULL) &&
3460 (ctxt->input->buf->buffer != NULL)) {
3461 int nbchars;
3462 int processed;
3463
3464 /*
3465 * convert as much as possible to the parser reading buffer.
3466 */
3467 processed = ctxt->input->cur - ctxt->input->base;
3468 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3469 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3470 ctxt->input->buf->buffer,
3471 ctxt->input->buf->raw);
3472 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003473 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3474 "htmlCheckEncoding: encoder error\n",
3475 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003476 }
3477 ctxt->input->base =
3478 ctxt->input->cur = ctxt->input->buf->buffer->content;
Eugene Pimenov1e60fbc2010-03-10 18:10:49 +01003479 ctxt->input->end =
3480 &ctxt->input->base[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00003481 }
3482 }
3483}
3484
3485/**
3486 * htmlCheckMeta:
3487 * @ctxt: an HTML parser context
3488 * @atts: the attributes values
3489 *
3490 * Checks an attributes from a Meta tag
3491 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003492static void
Owen Taylor3473f882001-02-23 17:55:21 +00003493htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3494 int i;
3495 const xmlChar *att, *value;
3496 int http = 0;
3497 const xmlChar *content = NULL;
3498
3499 if ((ctxt == NULL) || (atts == NULL))
3500 return;
3501
3502 i = 0;
3503 att = atts[i++];
3504 while (att != NULL) {
3505 value = atts[i++];
3506 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3507 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3508 http = 1;
3509 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3510 content = value;
3511 att = atts[i++];
3512 }
3513 if ((http) && (content != NULL))
3514 htmlCheckEncoding(ctxt, content);
3515
3516}
3517
3518/**
3519 * htmlParseStartTag:
3520 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003521 *
Owen Taylor3473f882001-02-23 17:55:21 +00003522 * parse a start of tag either for rule element or
3523 * EmptyElement. In both case we don't parse the tag closing chars.
3524 *
3525 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3526 *
3527 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3528 *
3529 * With namespace:
3530 *
3531 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3532 *
3533 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3534 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003535 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003536 */
3537
Daniel Veillard597f1c12005-07-03 23:00:18 +00003538static int
Owen Taylor3473f882001-02-23 17:55:21 +00003539htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003540 const xmlChar *name;
3541 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003542 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003543 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003544 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003545 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003546 int meta = 0;
3547 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003548 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003549
Daniel Veillarde77db162009-08-22 11:32:38 +02003550 if (ctxt->instate == XML_PARSER_EOF)
3551 return(-1);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003552 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3553 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3554 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003555 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003556 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003557 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003558 NEXT;
3559
Daniel Veillard30e76072006-03-09 14:13:55 +00003560 atts = ctxt->atts;
3561 maxatts = ctxt->maxatts;
3562
Owen Taylor3473f882001-02-23 17:55:21 +00003563 GROW;
3564 name = htmlParseHTMLName(ctxt);
3565 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003566 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3567 "htmlParseStartTag: invalid element name\n",
3568 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003569 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003570 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3571 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003572 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003573 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003574 }
3575 if (xmlStrEqual(name, BAD_CAST"meta"))
3576 meta = 1;
3577
3578 /*
3579 * Check for auto-closure of HTML elements.
3580 */
3581 htmlAutoClose(ctxt, name);
3582
3583 /*
3584 * Check for implied HTML elements.
3585 */
3586 htmlCheckImplied(ctxt, name);
3587
3588 /*
3589 * Avoid html at any level > 0, head at any level != 1
3590 * or any attempt to recurse body
3591 */
3592 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003593 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3594 "htmlParseStartTag: misplaced <html> tag\n",
3595 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003596 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003597 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003598 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003599 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003600 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003601 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3602 "htmlParseStartTag: misplaced <head> tag\n",
3603 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003604 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003605 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003606 }
3607 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003608 int indx;
3609 for (indx = 0;indx < ctxt->nameNr;indx++) {
3610 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003611 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3612 "htmlParseStartTag: misplaced <body> tag\n",
3613 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003614 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003615 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003616 }
3617 }
3618 }
3619
3620 /*
3621 * Now parse the attributes, it ends up with the ending
3622 *
3623 * (S Attribute)* S?
3624 */
3625 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003626 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003627 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003628 ((CUR != '/') || (NXT(1) != '>'))) {
3629 long cons = ctxt->nbChars;
3630
3631 GROW;
3632 attname = htmlParseAttribute(ctxt, &attvalue);
3633 if (attname != NULL) {
3634
3635 /*
3636 * Well formedness requires at most one declaration of an attribute
3637 */
3638 for (i = 0; i < nbatts;i += 2) {
3639 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003640 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3641 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003642 if (attvalue != NULL)
3643 xmlFree(attvalue);
3644 goto failed;
3645 }
3646 }
3647
3648 /*
3649 * Add the pair to atts
3650 */
3651 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003652 maxatts = 22; /* allow for 10 attrs by default */
3653 atts = (const xmlChar **)
3654 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003655 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003656 htmlErrMemory(ctxt, NULL);
3657 if (attvalue != NULL)
3658 xmlFree(attvalue);
3659 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003660 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003661 ctxt->atts = atts;
3662 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003663 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003664 const xmlChar **n;
3665
Owen Taylor3473f882001-02-23 17:55:21 +00003666 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003667 n = (const xmlChar **) xmlRealloc((void *) atts,
3668 maxatts * sizeof(const xmlChar *));
3669 if (n == NULL) {
3670 htmlErrMemory(ctxt, NULL);
3671 if (attvalue != NULL)
3672 xmlFree(attvalue);
3673 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003674 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003675 atts = n;
3676 ctxt->atts = atts;
3677 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003678 }
3679 atts[nbatts++] = attname;
3680 atts[nbatts++] = attvalue;
3681 atts[nbatts] = NULL;
3682 atts[nbatts + 1] = NULL;
3683 }
3684 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003685 if (attvalue != NULL)
3686 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003687 /* Dump the bogus attribute string up to the next blank or
3688 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003689 while ((IS_CHAR_CH(CUR)) &&
3690 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003691 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003692 NEXT;
3693 }
3694
3695failed:
3696 SKIP_BLANKS;
3697 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003698 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3699 "htmlParseStartTag: problem parsing attributes\n",
3700 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003701 break;
3702 }
3703 }
3704
3705 /*
3706 * Handle specific association to the META tag
3707 */
William M. Bracke978ae22007-03-21 06:16:02 +00003708 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003709 htmlCheckMeta(ctxt, atts);
3710
3711 /*
3712 * SAX: Start of Element !
3713 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003714 if (!discardtag) {
3715 htmlnamePush(ctxt, name);
3716 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3717 if (nbatts != 0)
3718 ctxt->sax->startElement(ctxt->userData, name, atts);
3719 else
3720 ctxt->sax->startElement(ctxt->userData, name, NULL);
3721 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003722 }
Owen Taylor3473f882001-02-23 17:55:21 +00003723
3724 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003725 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003726 if (atts[i] != NULL)
3727 xmlFree((xmlChar *) atts[i]);
3728 }
Owen Taylor3473f882001-02-23 17:55:21 +00003729 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003730
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003731 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003732}
3733
3734/**
3735 * htmlParseEndTag:
3736 * @ctxt: an HTML parser context
3737 *
3738 * parse an end of tag
3739 *
3740 * [42] ETag ::= '</' Name S? '>'
3741 *
3742 * With namespace
3743 *
3744 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003745 *
3746 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003747 */
3748
Daniel Veillardf420ac52001-07-04 16:04:09 +00003749static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003750htmlParseEndTag(htmlParserCtxtPtr ctxt)
3751{
3752 const xmlChar *name;
3753 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003754 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003755
3756 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003757 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3758 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003759 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003760 }
3761 SKIP(2);
3762
3763 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003764 if (name == NULL)
3765 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003766 /*
3767 * We should definitely be at the ending "S? '>'" part
3768 */
3769 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003770 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003771 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3772 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003773 if (ctxt->recovery) {
3774 /*
3775 * We're not at the ending > !!
3776 * Error, unless in recover mode where we search forwards
3777 * until we find a >
3778 */
3779 while (CUR != '\0' && CUR != '>') NEXT;
3780 NEXT;
3781 }
Owen Taylor3473f882001-02-23 17:55:21 +00003782 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003783 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003784
3785 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003786 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3787 * out now.
3788 */
3789 if ((ctxt->depth > 0) &&
3790 (xmlStrEqual(name, BAD_CAST "html") ||
3791 xmlStrEqual(name, BAD_CAST "body") ||
3792 xmlStrEqual(name, BAD_CAST "head"))) {
3793 ctxt->depth--;
3794 return (0);
3795 }
3796
3797 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003798 * If the name read is not one of the element in the parsing stack
3799 * then return, it's just an error.
3800 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003801 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3802 if (xmlStrEqual(name, ctxt->nameTab[i]))
3803 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003804 }
3805 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003806 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3807 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003808 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003809 }
3810
3811
3812 /*
3813 * Check for auto-closure of HTML elements.
3814 */
3815
3816 htmlAutoCloseOnClose(ctxt, name);
3817
3818 /*
3819 * Well formedness constraints, opening and closing must match.
3820 * With the exception that the autoclose may have popped stuff out
3821 * of the stack.
3822 */
3823 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003824 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003825 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3826 "Opening and ending tag mismatch: %s and %s\n",
3827 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003828 }
3829 }
3830
3831 /*
3832 * SAX: End of Tag
3833 */
3834 oldname = ctxt->name;
3835 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003836 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3837 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003838 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003839 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003840 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003841 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003842 }
3843
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003844 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003845}
3846
3847
3848/**
3849 * htmlParseReference:
3850 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003851 *
Owen Taylor3473f882001-02-23 17:55:21 +00003852 * parse and handle entity references in content,
3853 * this will end-up in a call to character() since this is either a
3854 * CharRef, or a predefined entity.
3855 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003856static void
Owen Taylor3473f882001-02-23 17:55:21 +00003857htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003858 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003859 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003860 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003861 if (CUR != '&') return;
3862
3863 if (NXT(1) == '#') {
3864 unsigned int c;
3865 int bits, i = 0;
3866
3867 c = htmlParseCharRef(ctxt);
3868 if (c == 0)
3869 return;
3870
3871 if (c < 0x80) { out[i++]= c; bits= -6; }
3872 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3873 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3874 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003875
Owen Taylor3473f882001-02-23 17:55:21 +00003876 for ( ; bits >= 0; bits-= 6) {
3877 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3878 }
3879 out[i] = 0;
3880
3881 htmlCheckParagraph(ctxt);
3882 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3883 ctxt->sax->characters(ctxt->userData, out, i);
3884 } else {
3885 ent = htmlParseEntityRef(ctxt, &name);
3886 if (name == NULL) {
3887 htmlCheckParagraph(ctxt);
3888 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3889 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3890 return;
3891 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003892 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003893 htmlCheckParagraph(ctxt);
3894 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3895 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3896 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3897 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3898 }
3899 } else {
3900 unsigned int c;
3901 int bits, i = 0;
3902
3903 c = ent->value;
3904 if (c < 0x80)
3905 { out[i++]= c; bits= -6; }
3906 else if (c < 0x800)
3907 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3908 else if (c < 0x10000)
3909 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003910 else
Owen Taylor3473f882001-02-23 17:55:21 +00003911 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003912
Owen Taylor3473f882001-02-23 17:55:21 +00003913 for ( ; bits >= 0; bits-= 6) {
3914 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3915 }
3916 out[i] = 0;
3917
3918 htmlCheckParagraph(ctxt);
3919 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3920 ctxt->sax->characters(ctxt->userData, out, i);
3921 }
Owen Taylor3473f882001-02-23 17:55:21 +00003922 }
3923}
3924
3925/**
3926 * htmlParseContent:
3927 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003928 *
3929 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003930 */
3931
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003932static void
Owen Taylor3473f882001-02-23 17:55:21 +00003933htmlParseContent(htmlParserCtxtPtr ctxt) {
3934 xmlChar *currentNode;
3935 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003936 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003937
3938 currentNode = xmlStrdup(ctxt->name);
3939 depth = ctxt->nameNr;
3940 while (1) {
3941 long cons = ctxt->nbChars;
3942
3943 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02003944
3945 if (ctxt->instate == XML_PARSER_EOF)
3946 break;
3947
Owen Taylor3473f882001-02-23 17:55:21 +00003948 /*
3949 * Our tag or one of it's parent or children is ending.
3950 */
3951 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003952 if (htmlParseEndTag(ctxt) &&
3953 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3954 if (currentNode != NULL)
3955 xmlFree(currentNode);
3956 return;
3957 }
3958 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003959 }
3960
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003961 else if ((CUR == '<') &&
3962 ((IS_ASCII_LETTER(NXT(1))) ||
3963 (NXT(1) == '_') || (NXT(1) == ':'))) {
3964 name = htmlParseHTMLName_nonInvasive(ctxt);
3965 if (name == NULL) {
3966 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3967 "htmlParseStartTag: invalid element name\n",
3968 NULL, NULL);
3969 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003970 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003971 NEXT;
3972
3973 if (currentNode != NULL)
3974 xmlFree(currentNode);
3975 return;
3976 }
3977
3978 if (ctxt->name != NULL) {
3979 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3980 htmlAutoClose(ctxt, name);
3981 continue;
3982 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003983 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003984 }
3985
Owen Taylor3473f882001-02-23 17:55:21 +00003986 /*
3987 * Has this node been popped out during parsing of
3988 * the next element
3989 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003990 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3991 (!xmlStrEqual(currentNode, ctxt->name)))
3992 {
Owen Taylor3473f882001-02-23 17:55:21 +00003993 if (currentNode != NULL) xmlFree(currentNode);
3994 return;
3995 }
3996
Daniel Veillardf9533d12001-03-03 10:04:57 +00003997 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3998 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003999 /*
4000 * Handle SCRIPT/STYLE separately
4001 */
4002 htmlParseScript(ctxt);
4003 } else {
4004 /*
4005 * Sometimes DOCTYPE arrives in the middle of the document
4006 */
4007 if ((CUR == '<') && (NXT(1) == '!') &&
4008 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4009 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4010 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4011 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004012 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4013 "Misplaced DOCTYPE declaration\n",
4014 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004015 htmlParseDocTypeDecl(ctxt);
4016 }
4017
4018 /*
4019 * First case : a comment
4020 */
4021 if ((CUR == '<') && (NXT(1) == '!') &&
4022 (NXT(2) == '-') && (NXT(3) == '-')) {
4023 htmlParseComment(ctxt);
4024 }
4025
4026 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004027 * Second case : a Processing Instruction.
4028 */
4029 else if ((CUR == '<') && (NXT(1) == '?')) {
4030 htmlParsePI(ctxt);
4031 }
4032
4033 /*
4034 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004035 */
4036 else if (CUR == '<') {
4037 htmlParseElement(ctxt);
4038 }
4039
4040 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004041 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004042 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004043 */
4044 else if (CUR == '&') {
4045 htmlParseReference(ctxt);
4046 }
4047
4048 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004049 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004050 */
4051 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004052 htmlAutoCloseOnEnd(ctxt);
4053 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004054 }
4055
4056 /*
4057 * Last case, text. Note that References are handled directly.
4058 */
4059 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004060 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004061 }
4062
4063 if (cons == ctxt->nbChars) {
4064 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004065 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4066 "detected an error in element content\n",
4067 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004068 }
4069 break;
4070 }
4071 }
4072 GROW;
4073 }
4074 if (currentNode != NULL) xmlFree(currentNode);
4075}
4076
4077/**
Daniel Veillard499cc922006-01-18 17:22:35 +00004078 * htmlParseContent:
4079 * @ctxt: an HTML parser context
4080 *
4081 * Parse a content: comment, sub-element, reference or text.
4082 */
4083
4084void
4085__htmlParseContent(void *ctxt) {
4086 if (ctxt != NULL)
4087 htmlParseContent((htmlParserCtxtPtr) ctxt);
4088}
4089
4090/**
Owen Taylor3473f882001-02-23 17:55:21 +00004091 * htmlParseElement:
4092 * @ctxt: an HTML parser context
4093 *
4094 * parse an HTML element, this is highly recursive
4095 *
4096 * [39] element ::= EmptyElemTag | STag content ETag
4097 *
4098 * [41] Attribute ::= Name Eq AttValue
4099 */
4100
4101void
4102htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004103 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004104 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004105 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004106 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004107 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004108 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004109 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004110
Daniel Veillarda03e3652004-11-02 18:45:30 +00004111 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4112 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004113 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004114 return;
4115 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004116
4117 if (ctxt->instate == XML_PARSER_EOF)
4118 return;
4119
Owen Taylor3473f882001-02-23 17:55:21 +00004120 /* Capture start position */
4121 if (ctxt->record_info) {
4122 node_info.begin_pos = ctxt->input->consumed +
4123 (CUR_PTR - ctxt->input->base);
4124 node_info.begin_line = ctxt->input->line;
4125 }
4126
Daniel Veillard597f1c12005-07-03 23:00:18 +00004127 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004128 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004129 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004130 if (CUR == '>')
4131 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004132 return;
4133 }
Owen Taylor3473f882001-02-23 17:55:21 +00004134
4135 /*
4136 * Lookup the info for that element.
4137 */
4138 info = htmlTagLookup(name);
4139 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004140 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4141 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004142 }
4143
4144 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004145 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004146 */
4147 if ((CUR == '/') && (NXT(1) == '>')) {
4148 SKIP(2);
4149 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4150 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004151 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004152 return;
4153 }
4154
4155 if (CUR == '>') {
4156 NEXT;
4157 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004158 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4159 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004160
4161 /*
4162 * end of parsing of this node.
4163 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004164 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004165 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004166 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004167 }
Owen Taylor3473f882001-02-23 17:55:21 +00004168
4169 /*
4170 * Capture end position and add node
4171 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004172 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004173 node_info.end_pos = ctxt->input->consumed +
4174 (CUR_PTR - ctxt->input->base);
4175 node_info.end_line = ctxt->input->line;
4176 node_info.node = ctxt->node;
4177 xmlParserAddNodeInfo(ctxt, &node_info);
4178 }
4179 return;
4180 }
4181
4182 /*
4183 * Check for an Empty Element from DTD definition
4184 */
4185 if ((info != NULL) && (info->empty)) {
4186 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4187 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004188 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004189 return;
4190 }
4191
4192 /*
4193 * Parse the content of the element:
4194 */
4195 currentNode = xmlStrdup(ctxt->name);
4196 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004197 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004198 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004199 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004200 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004201 if (ctxt->nameNr < depth) break;
4202 }
Owen Taylor3473f882001-02-23 17:55:21 +00004203
Owen Taylor3473f882001-02-23 17:55:21 +00004204 /*
4205 * Capture end position and add node
4206 */
4207 if ( currentNode != NULL && ctxt->record_info ) {
4208 node_info.end_pos = ctxt->input->consumed +
4209 (CUR_PTR - ctxt->input->base);
4210 node_info.end_line = ctxt->input->line;
4211 node_info.node = ctxt->node;
4212 xmlParserAddNodeInfo(ctxt, &node_info);
4213 }
William M. Brack76e95df2003-10-18 16:20:14 +00004214 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004215 htmlAutoCloseOnEnd(ctxt);
4216 }
4217
Owen Taylor3473f882001-02-23 17:55:21 +00004218 if (currentNode != NULL)
4219 xmlFree(currentNode);
4220}
4221
4222/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004223 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004224 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004225 *
Owen Taylor3473f882001-02-23 17:55:21 +00004226 * parse an HTML document (and build a tree if using the standard SAX
4227 * interface).
4228 *
4229 * Returns 0, -1 in case of error. the parser context is augmented
4230 * as a result of the parsing.
4231 */
4232
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004233int
Owen Taylor3473f882001-02-23 17:55:21 +00004234htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004235 xmlChar start[4];
4236 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004237 xmlDtdPtr dtd;
4238
Daniel Veillardd0463562001-10-13 09:15:48 +00004239 xmlInitParser();
4240
Owen Taylor3473f882001-02-23 17:55:21 +00004241 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004242
Daniel Veillarda03e3652004-11-02 18:45:30 +00004243 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4244 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4245 "htmlParseDocument: context error\n", NULL, NULL);
4246 return(XML_ERR_INTERNAL_ERROR);
4247 }
4248 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004249 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004250 GROW;
4251 /*
4252 * SAX: beginning of the document processing.
4253 */
4254 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4255 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4256
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004257 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4258 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4259 /*
4260 * Get the 4 first bytes and decode the charset
4261 * if enc != XML_CHAR_ENCODING_NONE
4262 * plug some encoding conversion routines.
4263 */
4264 start[0] = RAW;
4265 start[1] = NXT(1);
4266 start[2] = NXT(2);
4267 start[3] = NXT(3);
4268 enc = xmlDetectCharEncoding(&start[0], 4);
4269 if (enc != XML_CHAR_ENCODING_NONE) {
4270 xmlSwitchEncoding(ctxt, enc);
4271 }
4272 }
4273
Owen Taylor3473f882001-02-23 17:55:21 +00004274 /*
4275 * Wipe out everything which is before the first '<'
4276 */
4277 SKIP_BLANKS;
4278 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004279 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004280 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004281 }
4282
4283 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4284 ctxt->sax->startDocument(ctxt->userData);
4285
4286
4287 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004288 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004289 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004290 while (((CUR == '<') && (NXT(1) == '!') &&
4291 (NXT(2) == '-') && (NXT(3) == '-')) ||
4292 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004293 htmlParseComment(ctxt);
4294 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004295 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004296 }
Owen Taylor3473f882001-02-23 17:55:21 +00004297
4298
4299 /*
4300 * Then possibly doc type declaration(s) and more Misc
4301 * (doctypedecl Misc*)?
4302 */
4303 if ((CUR == '<') && (NXT(1) == '!') &&
4304 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4305 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4306 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4307 (UPP(8) == 'E')) {
4308 htmlParseDocTypeDecl(ctxt);
4309 }
4310 SKIP_BLANKS;
4311
4312 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004313 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004314 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004315 while (((CUR == '<') && (NXT(1) == '!') &&
4316 (NXT(2) == '-') && (NXT(3) == '-')) ||
4317 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004318 htmlParseComment(ctxt);
4319 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004320 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004321 }
Owen Taylor3473f882001-02-23 17:55:21 +00004322
4323 /*
4324 * Time to start parsing the tree itself
4325 */
4326 htmlParseContent(ctxt);
4327
4328 /*
4329 * autoclose
4330 */
4331 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004332 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004333
4334
4335 /*
4336 * SAX: end of the document processing.
4337 */
4338 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4339 ctxt->sax->endDocument(ctxt->userData);
4340
4341 if (ctxt->myDoc != NULL) {
4342 dtd = xmlGetIntSubset(ctxt->myDoc);
4343 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004344 ctxt->myDoc->intSubset =
4345 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004346 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4347 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4348 }
4349 if (! ctxt->wellFormed) return(-1);
4350 return(0);
4351}
4352
4353
4354/************************************************************************
4355 * *
4356 * Parser contexts handling *
4357 * *
4358 ************************************************************************/
4359
4360/**
William M. Brackedb65a72004-02-06 07:36:04 +00004361 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004362 * @ctxt: an HTML parser context
4363 *
4364 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004365 *
4366 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004367 */
4368
Daniel Veillardf403d292003-10-05 13:51:35 +00004369static int
Owen Taylor3473f882001-02-23 17:55:21 +00004370htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4371{
4372 htmlSAXHandler *sax;
4373
Daniel Veillardf403d292003-10-05 13:51:35 +00004374 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004375 memset(ctxt, 0, sizeof(htmlParserCtxt));
4376
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004377 ctxt->dict = xmlDictCreate();
4378 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004379 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4380 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004381 }
Owen Taylor3473f882001-02-23 17:55:21 +00004382 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4383 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004384 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4385 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004386 }
4387 else
4388 memset(sax, 0, sizeof(htmlSAXHandler));
4389
4390 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004391 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004392 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4393 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004394 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004395 ctxt->inputNr = 0;
4396 ctxt->inputMax = 0;
4397 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004398 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004399 }
4400 ctxt->inputNr = 0;
4401 ctxt->inputMax = 5;
4402 ctxt->input = NULL;
4403 ctxt->version = NULL;
4404 ctxt->encoding = NULL;
4405 ctxt->standalone = -1;
4406 ctxt->instate = XML_PARSER_START;
4407
4408 /* Allocate the Node stack */
4409 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4410 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004411 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004412 ctxt->nodeNr = 0;
4413 ctxt->nodeMax = 0;
4414 ctxt->node = NULL;
4415 ctxt->inputNr = 0;
4416 ctxt->inputMax = 0;
4417 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004418 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004419 }
4420 ctxt->nodeNr = 0;
4421 ctxt->nodeMax = 10;
4422 ctxt->node = NULL;
4423
4424 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004425 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004426 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004427 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004428 ctxt->nameNr = 0;
Eugene Pimenovef9c6362010-03-15 11:37:48 +01004429 ctxt->nameMax = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004430 ctxt->name = NULL;
4431 ctxt->nodeNr = 0;
4432 ctxt->nodeMax = 0;
4433 ctxt->node = NULL;
4434 ctxt->inputNr = 0;
4435 ctxt->inputMax = 0;
4436 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004437 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004438 }
4439 ctxt->nameNr = 0;
4440 ctxt->nameMax = 10;
4441 ctxt->name = NULL;
4442
Daniel Veillard092643b2003-09-25 14:29:29 +00004443 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004444 else {
4445 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004446 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004447 }
4448 ctxt->userData = ctxt;
4449 ctxt->myDoc = NULL;
4450 ctxt->wellFormed = 1;
4451 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004452 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004453 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004454 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004455 ctxt->vctxt.userData = ctxt;
4456 ctxt->vctxt.error = xmlParserValidityError;
4457 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004458 ctxt->record_info = 0;
4459 ctxt->validate = 0;
4460 ctxt->nbChars = 0;
4461 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004462 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004463 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004464 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004465}
4466
4467/**
4468 * htmlFreeParserCtxt:
4469 * @ctxt: an HTML parser context
4470 *
4471 * Free all the memory used by a parser context. However the parsed
4472 * document in ctxt->myDoc is not freed.
4473 */
4474
4475void
4476htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4477{
4478 xmlFreeParserCtxt(ctxt);
4479}
4480
4481/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004482 * htmlNewParserCtxt:
4483 *
4484 * Allocate and initialize a new parser context.
4485 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004486 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004487 */
4488
Daniel Veillard34c647c2006-09-21 06:53:59 +00004489htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004490htmlNewParserCtxt(void)
4491{
4492 xmlParserCtxtPtr ctxt;
4493
4494 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4495 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004496 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004497 return(NULL);
4498 }
4499 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004500 if (htmlInitParserCtxt(ctxt) < 0) {
4501 htmlFreeParserCtxt(ctxt);
4502 return(NULL);
4503 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004504 return(ctxt);
4505}
4506
4507/**
4508 * htmlCreateMemoryParserCtxt:
4509 * @buffer: a pointer to a char array
4510 * @size: the size of the array
4511 *
4512 * Create a parser context for an HTML in-memory document.
4513 *
4514 * Returns the new parser context or NULL
4515 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004516htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004517htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4518 xmlParserCtxtPtr ctxt;
4519 xmlParserInputPtr input;
4520 xmlParserInputBufferPtr buf;
4521
4522 if (buffer == NULL)
4523 return(NULL);
4524 if (size <= 0)
4525 return(NULL);
4526
4527 ctxt = htmlNewParserCtxt();
4528 if (ctxt == NULL)
4529 return(NULL);
4530
4531 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4532 if (buf == NULL) return(NULL);
4533
4534 input = xmlNewInputStream(ctxt);
4535 if (input == NULL) {
4536 xmlFreeParserCtxt(ctxt);
4537 return(NULL);
4538 }
4539
4540 input->filename = NULL;
4541 input->buf = buf;
4542 input->base = input->buf->buffer->content;
4543 input->cur = input->buf->buffer->content;
4544 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4545
4546 inputPush(ctxt, input);
4547 return(ctxt);
4548}
4549
4550/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004551 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004552 * @cur: a pointer to an array of xmlChar
4553 * @encoding: a free form C string describing the HTML document encoding, or NULL
4554 *
4555 * Create a parser context for an HTML document.
4556 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004557 * TODO: check the need to add encoding handling there
4558 *
Owen Taylor3473f882001-02-23 17:55:21 +00004559 * Returns the new parser context or NULL
4560 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004561static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004562htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004563 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004564 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004565
Daniel Veillard1d995272002-07-22 16:43:32 +00004566 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004567 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004568 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004569 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004570 if (ctxt == NULL)
4571 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004572
4573 if (encoding != NULL) {
4574 xmlCharEncoding enc;
4575 xmlCharEncodingHandlerPtr handler;
4576
4577 if (ctxt->input->encoding != NULL)
4578 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004579 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004580
4581 enc = xmlParseCharEncoding(encoding);
4582 /*
4583 * registered set of known encodings
4584 */
4585 if (enc != XML_CHAR_ENCODING_ERROR) {
4586 xmlSwitchEncoding(ctxt, enc);
4587 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004588 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02004589 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00004590 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004591 }
4592 } else {
4593 /*
4594 * fallback for unknown encodings
4595 */
4596 handler = xmlFindCharEncodingHandler((const char *) encoding);
4597 if (handler != NULL) {
4598 xmlSwitchToEncoding(ctxt, handler);
4599 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004600 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4601 "Unsupported encoding %s\n",
4602 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004603 }
4604 }
4605 }
4606 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004607}
4608
Daniel Veillard73b013f2003-09-30 12:36:01 +00004609#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004610/************************************************************************
4611 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02004612 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00004613 * *
4614 ************************************************************************/
4615
/**
 * htmlParseLookupSequence:
 * @ctxt:  an HTML parser context
 * @first:  the first char to lookup
 * @next:  the next char to lookup or zero
 * @third:  the next char to lookup or zero
 * @iscomment:  flag to force checking inside comments; when non-zero the
 *              "<!--" ... "-->" skipping is disabled
 * @ignoreattrval:  when non-zero, bytes located between a quote (' or ")
 *                  and the next identical quote are not matched
 *
 * Try to find if a sequence (first, next, third) or just (first next) or
 * (first) is available in the input stream.
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
 * to avoid rescanning sequences of bytes, it DOES change the state of the
 * parser, do not use liberally.
 * On a match ctxt->checkIndex is reset to 0. On failure it is only updated
 * when the scan did not stop inside a comment or a quoted value.
 * This is basically similar to xmlParseLookupSequence()
 *
 * Returns the index to the current parsing point if the full sequence
 *      is available, -1 otherwise.
 */
static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
                        xmlChar next, xmlChar third, int iscomment,
                        int ignoreattrval)
{
    int base, len;
    htmlParserInputPtr in;
    const xmlChar *buf;
    int incomment = 0;      /* currently between "<!--" and "-->" */
    int invalue = 0;        /* currently inside a quoted value */
    char valdellim = 0x0;   /* quote character which opened the value */

    in = ctxt->input;
    if (in == NULL)
        return (-1);

    /* offset of the current parsing position in the input */
    base = in->cur - in->base;
    if (base < 0)
        return (-1);

    /* resume where a previous unsuccessful lookup stopped */
    if (ctxt->checkIndex > base)
        base = ctxt->checkIndex;

    if (in->buf == NULL) {
        buf = in->base;
        len = in->length;
    } else {
        buf = in->buf->buffer->content;
        len = in->buf->buffer->use;
    }

    /* take into account the sequence length */
    if (third)
        len -= 2;
    else if (next)
        len--;
    for (; base < len; base++) {
        /* detect the start of a comment unless the caller disabled it */
        if ((!incomment) && (base + 4 < len) && (!iscomment)) {
            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
                incomment = 1;
                /* do not increment past <! - some people use <!--> */
                base += 2;
            }
        }
        if (ignoreattrval) {
            if (buf[base] == '"' || buf[base] == '\'') {
                if (invalue) {
                    /* only the quote which opened the value closes it */
                    if (buf[base] == valdellim) {
                        invalue = 0;
                        continue;
                    }
                } else {
                    valdellim = buf[base];
                    invalue = 1;
                    continue;
                }
            } else if (invalue) {
                continue;
            }
        }
        if (incomment) {
            /* not enough data left to hold the comment terminator */
            if (base + 3 > len)
                return (-1);
            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
                (buf[base + 2] == '>')) {
                incomment = 0;
                base += 2;
            }
            continue;
        }
        if (buf[base] == first) {
            if (third != 0) {
                if ((buf[base + 1] != next) || (buf[base + 2] != third))
                    continue;
            } else if (next != 0) {
                if (buf[base + 1] != next)
                    continue;
            }
            /* found: the next lookup starts from the parsing point again */
            ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
            if (next == 0)
                xmlGenericError(xmlGenericErrorContext,
                                "HPP: lookup '%c' found at %d\n",
                                first, base);
            else if (third == 0)
                xmlGenericError(xmlGenericErrorContext,
                                "HPP: lookup '%c%c' found at %d\n",
                                first, next, base);
            else
                xmlGenericError(xmlGenericErrorContext,
                                "HPP: lookup '%c%c%c' found at %d\n",
                                first, next, third, base);
#endif
            return (base - (in->cur - in->base));
        }
    }
    /*
     * Remember the scan position only when no comment or quoted value is
     * open, since that state is not kept between calls.
     */
    if ((!incomment) && (!invalue))
        ctxt->checkIndex = base;
#ifdef DEBUG_PUSH
    if (next == 0)
        xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c' failed\n", first);
    else if (third == 0)
        xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c' failed\n", first, next);
    else
        xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c%c' failed\n", first, next,
                        third);
#endif
    return (-1);
}
4747
4748/**
Markus Kull56a03032009-08-24 19:00:23 +02004749 * htmlParseLookupChars:
4750 * @ctxt: an HTML parser context
4751 * @stop: Array of chars, which stop the lookup.
4752 * @stopLen: Length of stop-Array
4753 *
4754 * Try to find if any char of the stop-Array is available in the input
4755 * stream.
4756 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4757 * to avoid rescanning sequences of bytes, it DOES change the state of the
4758 * parser, do not use liberally.
4759 *
4760 * Returns the index to the current parsing point if a stopChar
4761 * is available, -1 otherwise.
4762 */
4763static int
4764htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
4765 int stopLen)
4766{
4767 int base, len;
4768 htmlParserInputPtr in;
4769 const xmlChar *buf;
4770 int incomment = 0;
4771 int i;
4772
4773 in = ctxt->input;
4774 if (in == NULL)
4775 return (-1);
4776
4777 base = in->cur - in->base;
4778 if (base < 0)
4779 return (-1);
4780
4781 if (ctxt->checkIndex > base)
4782 base = ctxt->checkIndex;
4783
4784 if (in->buf == NULL) {
4785 buf = in->base;
4786 len = in->length;
4787 } else {
4788 buf = in->buf->buffer->content;
4789 len = in->buf->buffer->use;
4790 }
4791
4792 for (; base < len; base++) {
4793 if (!incomment && (base + 4 < len)) {
4794 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4795 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4796 incomment = 1;
4797 /* do not increment past <! - some people use <!--> */
4798 base += 2;
4799 }
4800 }
4801 if (incomment) {
4802 if (base + 3 > len)
4803 return (-1);
4804 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4805 (buf[base + 2] == '>')) {
4806 incomment = 0;
4807 base += 2;
4808 }
4809 continue;
4810 }
4811 for (i = 0; i < stopLen; ++i) {
4812 if (buf[base] == stop[i]) {
4813 ctxt->checkIndex = 0;
4814 return (base - (in->cur - in->base));
4815 }
4816 }
4817 }
4818 ctxt->checkIndex = base;
4819 return (-1);
4820}
4821
4822/**
Owen Taylor3473f882001-02-23 17:55:21 +00004823 * htmlParseTryOrFinish:
4824 * @ctxt: an HTML parser context
4825 * @terminate: last chunk indicator
4826 *
4827 * Try to progress on parsing
4828 *
4829 * Returns zero if no parsing was possible
4830 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004831static int
Owen Taylor3473f882001-02-23 17:55:21 +00004832htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4833 int ret = 0;
4834 htmlParserInputPtr in;
4835 int avail = 0;
4836 xmlChar cur, next;
4837
4838#ifdef DEBUG_PUSH
4839 switch (ctxt->instate) {
4840 case XML_PARSER_EOF:
4841 xmlGenericError(xmlGenericErrorContext,
4842 "HPP: try EOF\n"); break;
4843 case XML_PARSER_START:
4844 xmlGenericError(xmlGenericErrorContext,
4845 "HPP: try START\n"); break;
4846 case XML_PARSER_MISC:
4847 xmlGenericError(xmlGenericErrorContext,
4848 "HPP: try MISC\n");break;
4849 case XML_PARSER_COMMENT:
4850 xmlGenericError(xmlGenericErrorContext,
4851 "HPP: try COMMENT\n");break;
4852 case XML_PARSER_PROLOG:
4853 xmlGenericError(xmlGenericErrorContext,
4854 "HPP: try PROLOG\n");break;
4855 case XML_PARSER_START_TAG:
4856 xmlGenericError(xmlGenericErrorContext,
4857 "HPP: try START_TAG\n");break;
4858 case XML_PARSER_CONTENT:
4859 xmlGenericError(xmlGenericErrorContext,
4860 "HPP: try CONTENT\n");break;
4861 case XML_PARSER_CDATA_SECTION:
4862 xmlGenericError(xmlGenericErrorContext,
4863 "HPP: try CDATA_SECTION\n");break;
4864 case XML_PARSER_END_TAG:
4865 xmlGenericError(xmlGenericErrorContext,
4866 "HPP: try END_TAG\n");break;
4867 case XML_PARSER_ENTITY_DECL:
4868 xmlGenericError(xmlGenericErrorContext,
4869 "HPP: try ENTITY_DECL\n");break;
4870 case XML_PARSER_ENTITY_VALUE:
4871 xmlGenericError(xmlGenericErrorContext,
4872 "HPP: try ENTITY_VALUE\n");break;
4873 case XML_PARSER_ATTRIBUTE_VALUE:
4874 xmlGenericError(xmlGenericErrorContext,
4875 "HPP: try ATTRIBUTE_VALUE\n");break;
4876 case XML_PARSER_DTD:
4877 xmlGenericError(xmlGenericErrorContext,
4878 "HPP: try DTD\n");break;
4879 case XML_PARSER_EPILOG:
4880 xmlGenericError(xmlGenericErrorContext,
4881 "HPP: try EPILOG\n");break;
4882 case XML_PARSER_PI:
4883 xmlGenericError(xmlGenericErrorContext,
4884 "HPP: try PI\n");break;
4885 case XML_PARSER_SYSTEM_LITERAL:
4886 xmlGenericError(xmlGenericErrorContext,
4887 "HPP: try SYSTEM_LITERAL\n");break;
4888 }
4889#endif
4890
4891 while (1) {
4892
4893 in = ctxt->input;
4894 if (in == NULL) break;
4895 if (in->buf == NULL)
4896 avail = in->length - (in->cur - in->base);
4897 else
4898 avail = in->buf->buffer->use - (in->cur - in->base);
4899 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004900 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004901 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004902 /*
4903 * SAX: end of the document processing.
4904 */
4905 ctxt->instate = XML_PARSER_EOF;
4906 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4907 ctxt->sax->endDocument(ctxt->userData);
4908 }
4909 }
4910 if (avail < 1)
4911 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004912 cur = in->cur[0];
4913 if (cur == 0) {
4914 SKIP(1);
4915 continue;
4916 }
4917
Owen Taylor3473f882001-02-23 17:55:21 +00004918 switch (ctxt->instate) {
4919 case XML_PARSER_EOF:
4920 /*
4921 * Document parsing is done !
4922 */
4923 goto done;
4924 case XML_PARSER_START:
4925 /*
4926 * Very first chars read from the document flow.
4927 */
4928 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004929 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004930 SKIP_BLANKS;
4931 if (in->buf == NULL)
4932 avail = in->length - (in->cur - in->base);
4933 else
4934 avail = in->buf->buffer->use - (in->cur - in->base);
4935 }
4936 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4937 ctxt->sax->setDocumentLocator(ctxt->userData,
4938 &xmlDefaultSAXLocator);
4939 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4940 (!ctxt->disableSAX))
4941 ctxt->sax->startDocument(ctxt->userData);
4942
4943 cur = in->cur[0];
4944 next = in->cur[1];
4945 if ((cur == '<') && (next == '!') &&
4946 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4947 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4948 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4949 (UPP(8) == 'E')) {
4950 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004951 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004952 goto done;
4953#ifdef DEBUG_PUSH
4954 xmlGenericError(xmlGenericErrorContext,
4955 "HPP: Parsing internal subset\n");
4956#endif
4957 htmlParseDocTypeDecl(ctxt);
4958 ctxt->instate = XML_PARSER_PROLOG;
4959#ifdef DEBUG_PUSH
4960 xmlGenericError(xmlGenericErrorContext,
4961 "HPP: entering PROLOG\n");
4962#endif
4963 } else {
4964 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004965#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004966 xmlGenericError(xmlGenericErrorContext,
4967 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004968#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004969 }
Owen Taylor3473f882001-02-23 17:55:21 +00004970 break;
4971 case XML_PARSER_MISC:
4972 SKIP_BLANKS;
4973 if (in->buf == NULL)
4974 avail = in->length - (in->cur - in->base);
4975 else
4976 avail = in->buf->buffer->use - (in->cur - in->base);
4977 if (avail < 2)
4978 goto done;
4979 cur = in->cur[0];
4980 next = in->cur[1];
4981 if ((cur == '<') && (next == '!') &&
4982 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4983 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004984 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004985 goto done;
4986#ifdef DEBUG_PUSH
4987 xmlGenericError(xmlGenericErrorContext,
4988 "HPP: Parsing Comment\n");
4989#endif
4990 htmlParseComment(ctxt);
4991 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004992 } else if ((cur == '<') && (next == '?')) {
4993 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004994 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004995 goto done;
4996#ifdef DEBUG_PUSH
4997 xmlGenericError(xmlGenericErrorContext,
4998 "HPP: Parsing PI\n");
4999#endif
5000 htmlParsePI(ctxt);
5001 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005002 } else if ((cur == '<') && (next == '!') &&
5003 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5004 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5005 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5006 (UPP(8) == 'E')) {
5007 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005008 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005009 goto done;
5010#ifdef DEBUG_PUSH
5011 xmlGenericError(xmlGenericErrorContext,
5012 "HPP: Parsing internal subset\n");
5013#endif
5014 htmlParseDocTypeDecl(ctxt);
5015 ctxt->instate = XML_PARSER_PROLOG;
5016#ifdef DEBUG_PUSH
5017 xmlGenericError(xmlGenericErrorContext,
5018 "HPP: entering PROLOG\n");
5019#endif
5020 } else if ((cur == '<') && (next == '!') &&
5021 (avail < 9)) {
5022 goto done;
5023 } else {
5024 ctxt->instate = XML_PARSER_START_TAG;
5025#ifdef DEBUG_PUSH
5026 xmlGenericError(xmlGenericErrorContext,
5027 "HPP: entering START_TAG\n");
5028#endif
5029 }
5030 break;
5031 case XML_PARSER_PROLOG:
5032 SKIP_BLANKS;
5033 if (in->buf == NULL)
5034 avail = in->length - (in->cur - in->base);
5035 else
5036 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02005037 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00005038 goto done;
5039 cur = in->cur[0];
5040 next = in->cur[1];
5041 if ((cur == '<') && (next == '!') &&
5042 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5043 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005044 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005045 goto done;
5046#ifdef DEBUG_PUSH
5047 xmlGenericError(xmlGenericErrorContext,
5048 "HPP: Parsing Comment\n");
5049#endif
5050 htmlParseComment(ctxt);
5051 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005052 } else if ((cur == '<') && (next == '?')) {
5053 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005054 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005055 goto done;
5056#ifdef DEBUG_PUSH
5057 xmlGenericError(xmlGenericErrorContext,
5058 "HPP: Parsing PI\n");
5059#endif
5060 htmlParsePI(ctxt);
5061 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005062 } else if ((cur == '<') && (next == '!') &&
5063 (avail < 4)) {
5064 goto done;
5065 } else {
5066 ctxt->instate = XML_PARSER_START_TAG;
5067#ifdef DEBUG_PUSH
5068 xmlGenericError(xmlGenericErrorContext,
5069 "HPP: entering START_TAG\n");
5070#endif
5071 }
5072 break;
5073 case XML_PARSER_EPILOG:
5074 if (in->buf == NULL)
5075 avail = in->length - (in->cur - in->base);
5076 else
5077 avail = in->buf->buffer->use - (in->cur - in->base);
5078 if (avail < 1)
5079 goto done;
5080 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005081 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005082 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005083 goto done;
5084 }
5085 if (avail < 2)
5086 goto done;
5087 next = in->cur[1];
5088 if ((cur == '<') && (next == '!') &&
5089 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5090 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005091 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005092 goto done;
5093#ifdef DEBUG_PUSH
5094 xmlGenericError(xmlGenericErrorContext,
5095 "HPP: Parsing Comment\n");
5096#endif
5097 htmlParseComment(ctxt);
5098 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005099 } else if ((cur == '<') && (next == '?')) {
5100 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005101 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005102 goto done;
5103#ifdef DEBUG_PUSH
5104 xmlGenericError(xmlGenericErrorContext,
5105 "HPP: Parsing PI\n");
5106#endif
5107 htmlParsePI(ctxt);
5108 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005109 } else if ((cur == '<') && (next == '!') &&
5110 (avail < 4)) {
5111 goto done;
5112 } else {
5113 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005114 ctxt->wellFormed = 0;
5115 ctxt->instate = XML_PARSER_EOF;
5116#ifdef DEBUG_PUSH
5117 xmlGenericError(xmlGenericErrorContext,
5118 "HPP: entering EOF\n");
5119#endif
5120 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5121 ctxt->sax->endDocument(ctxt->userData);
5122 goto done;
5123 }
5124 break;
5125 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005126 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005127 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005128 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005129
5130 if (avail < 2)
5131 goto done;
5132 cur = in->cur[0];
5133 if (cur != '<') {
5134 ctxt->instate = XML_PARSER_CONTENT;
5135#ifdef DEBUG_PUSH
5136 xmlGenericError(xmlGenericErrorContext,
5137 "HPP: entering CONTENT\n");
5138#endif
5139 break;
5140 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005141 if (in->cur[1] == '/') {
5142 ctxt->instate = XML_PARSER_END_TAG;
5143 ctxt->checkIndex = 0;
5144#ifdef DEBUG_PUSH
5145 xmlGenericError(xmlGenericErrorContext,
5146 "HPP: entering END_TAG\n");
5147#endif
5148 break;
5149 }
Owen Taylor3473f882001-02-23 17:55:21 +00005150 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005151 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005152 goto done;
5153
Daniel Veillard597f1c12005-07-03 23:00:18 +00005154 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005155 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005156 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005157 (name == NULL)) {
5158 if (CUR == '>')
5159 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005160 break;
5161 }
Owen Taylor3473f882001-02-23 17:55:21 +00005162
5163 /*
5164 * Lookup the info for that element.
5165 */
5166 info = htmlTagLookup(name);
5167 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005168 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5169 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005170 }
5171
5172 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005173 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005174 */
5175 if ((CUR == '/') && (NXT(1) == '>')) {
5176 SKIP(2);
5177 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5178 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005179 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005180 ctxt->instate = XML_PARSER_CONTENT;
5181#ifdef DEBUG_PUSH
5182 xmlGenericError(xmlGenericErrorContext,
5183 "HPP: entering CONTENT\n");
5184#endif
5185 break;
5186 }
5187
5188 if (CUR == '>') {
5189 NEXT;
5190 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005191 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5192 "Couldn't find end of Start Tag %s\n",
5193 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005194
5195 /*
5196 * end of parsing of this node.
5197 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005198 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005199 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005200 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005201 }
Owen Taylor3473f882001-02-23 17:55:21 +00005202
5203 ctxt->instate = XML_PARSER_CONTENT;
5204#ifdef DEBUG_PUSH
5205 xmlGenericError(xmlGenericErrorContext,
5206 "HPP: entering CONTENT\n");
5207#endif
5208 break;
5209 }
5210
5211 /*
5212 * Check for an Empty Element from DTD definition
5213 */
5214 if ((info != NULL) && (info->empty)) {
5215 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5216 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005217 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005218 }
5219 ctxt->instate = XML_PARSER_CONTENT;
5220#ifdef DEBUG_PUSH
5221 xmlGenericError(xmlGenericErrorContext,
5222 "HPP: entering CONTENT\n");
5223#endif
5224 break;
5225 }
5226 case XML_PARSER_CONTENT: {
5227 long cons;
5228 /*
5229 * Handle preparsed entities and charRef
5230 */
5231 if (ctxt->token != 0) {
5232 xmlChar chr[2] = { 0 , 0 } ;
5233
5234 chr[0] = (xmlChar) ctxt->token;
5235 htmlCheckParagraph(ctxt);
5236 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5237 ctxt->sax->characters(ctxt->userData, chr, 1);
5238 ctxt->token = 0;
5239 ctxt->checkIndex = 0;
5240 }
5241 if ((avail == 1) && (terminate)) {
5242 cur = in->cur[0];
5243 if ((cur != '<') && (cur != '&')) {
5244 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005245 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005246 if (ctxt->sax->ignorableWhitespace != NULL)
5247 ctxt->sax->ignorableWhitespace(
5248 ctxt->userData, &cur, 1);
5249 } else {
5250 htmlCheckParagraph(ctxt);
5251 if (ctxt->sax->characters != NULL)
5252 ctxt->sax->characters(
5253 ctxt->userData, &cur, 1);
5254 }
5255 }
5256 ctxt->token = 0;
5257 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005258 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005259 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005260 }
Owen Taylor3473f882001-02-23 17:55:21 +00005261 }
5262 if (avail < 2)
5263 goto done;
5264 cur = in->cur[0];
5265 next = in->cur[1];
5266 cons = ctxt->nbChars;
5267 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5268 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5269 /*
5270 * Handle SCRIPT/STYLE separately
5271 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005272 if (!terminate) {
5273 int idx;
5274 xmlChar val;
5275
Jiri Netolicky446e1262009-08-07 17:05:36 +02005276 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
Daniel Veillard68716a72006-10-16 09:32:17 +00005277 if (idx < 0)
5278 goto done;
5279 val = in->cur[idx + 2];
5280 if (val == 0) /* bad cut of input */
5281 goto done;
5282 }
Owen Taylor3473f882001-02-23 17:55:21 +00005283 htmlParseScript(ctxt);
5284 if ((cur == '<') && (next == '/')) {
5285 ctxt->instate = XML_PARSER_END_TAG;
5286 ctxt->checkIndex = 0;
5287#ifdef DEBUG_PUSH
5288 xmlGenericError(xmlGenericErrorContext,
5289 "HPP: entering END_TAG\n");
5290#endif
5291 break;
5292 }
5293 } else {
5294 /*
5295 * Sometimes DOCTYPE arrives in the middle of the document
5296 */
5297 if ((cur == '<') && (next == '!') &&
5298 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5299 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5300 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5301 (UPP(8) == 'E')) {
5302 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005303 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005304 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005305 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5306 "Misplaced DOCTYPE declaration\n",
5307 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005308 htmlParseDocTypeDecl(ctxt);
5309 } else if ((cur == '<') && (next == '!') &&
5310 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5311 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005312 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005313 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005314 goto done;
5315#ifdef DEBUG_PUSH
5316 xmlGenericError(xmlGenericErrorContext,
5317 "HPP: Parsing Comment\n");
5318#endif
5319 htmlParseComment(ctxt);
5320 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005321 } else if ((cur == '<') && (next == '?')) {
5322 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005323 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005324 goto done;
5325#ifdef DEBUG_PUSH
5326 xmlGenericError(xmlGenericErrorContext,
5327 "HPP: Parsing PI\n");
5328#endif
5329 htmlParsePI(ctxt);
5330 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005331 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5332 goto done;
5333 } else if ((cur == '<') && (next == '/')) {
5334 ctxt->instate = XML_PARSER_END_TAG;
5335 ctxt->checkIndex = 0;
5336#ifdef DEBUG_PUSH
5337 xmlGenericError(xmlGenericErrorContext,
5338 "HPP: entering END_TAG\n");
5339#endif
5340 break;
5341 } else if (cur == '<') {
5342 ctxt->instate = XML_PARSER_START_TAG;
5343 ctxt->checkIndex = 0;
5344#ifdef DEBUG_PUSH
5345 xmlGenericError(xmlGenericErrorContext,
5346 "HPP: entering START_TAG\n");
5347#endif
5348 break;
5349 } else if (cur == '&') {
5350 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005351 (htmlParseLookupChars(ctxt,
5352 BAD_CAST "; >/", 4) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005353 goto done;
5354#ifdef DEBUG_PUSH
5355 xmlGenericError(xmlGenericErrorContext,
5356 "HPP: Parsing Reference\n");
5357#endif
5358 /* TODO: check generation of subtrees if noent !!! */
5359 htmlParseReference(ctxt);
5360 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005361 /*
5362 * check that the text sequence is complete
5363 * before handing out the data to the parser
5364 * to avoid problems with erroneous end of
5365 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005366 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005367 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005368 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005369 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005370 ctxt->checkIndex = 0;
5371#ifdef DEBUG_PUSH
5372 xmlGenericError(xmlGenericErrorContext,
5373 "HPP: Parsing char data\n");
5374#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005375 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005376 }
5377 }
5378 if (cons == ctxt->nbChars) {
5379 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005380 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5381 "detected an error in element content\n",
5382 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005383 }
5384 NEXT;
5385 break;
5386 }
5387
5388 break;
5389 }
5390 case XML_PARSER_END_TAG:
5391 if (avail < 2)
5392 goto done;
5393 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005394 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005395 goto done;
5396 htmlParseEndTag(ctxt);
5397 if (ctxt->nameNr == 0) {
5398 ctxt->instate = XML_PARSER_EPILOG;
5399 } else {
5400 ctxt->instate = XML_PARSER_CONTENT;
5401 }
5402 ctxt->checkIndex = 0;
5403#ifdef DEBUG_PUSH
5404 xmlGenericError(xmlGenericErrorContext,
5405 "HPP: entering CONTENT\n");
5406#endif
5407 break;
5408 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005409 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5410 "HPP: internal error, state == CDATA\n",
5411 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005412 ctxt->instate = XML_PARSER_CONTENT;
5413 ctxt->checkIndex = 0;
5414#ifdef DEBUG_PUSH
5415 xmlGenericError(xmlGenericErrorContext,
5416 "HPP: entering CONTENT\n");
5417#endif
5418 break;
5419 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005420 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5421 "HPP: internal error, state == DTD\n",
5422 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005423 ctxt->instate = XML_PARSER_CONTENT;
5424 ctxt->checkIndex = 0;
5425#ifdef DEBUG_PUSH
5426 xmlGenericError(xmlGenericErrorContext,
5427 "HPP: entering CONTENT\n");
5428#endif
5429 break;
5430 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005431 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5432 "HPP: internal error, state == COMMENT\n",
5433 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005434 ctxt->instate = XML_PARSER_CONTENT;
5435 ctxt->checkIndex = 0;
5436#ifdef DEBUG_PUSH
5437 xmlGenericError(xmlGenericErrorContext,
5438 "HPP: entering CONTENT\n");
5439#endif
5440 break;
5441 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005442 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5443 "HPP: internal error, state == PI\n",
5444 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005445 ctxt->instate = XML_PARSER_CONTENT;
5446 ctxt->checkIndex = 0;
5447#ifdef DEBUG_PUSH
5448 xmlGenericError(xmlGenericErrorContext,
5449 "HPP: entering CONTENT\n");
5450#endif
5451 break;
5452 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005453 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5454 "HPP: internal error, state == ENTITY_DECL\n",
5455 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005456 ctxt->instate = XML_PARSER_CONTENT;
5457 ctxt->checkIndex = 0;
5458#ifdef DEBUG_PUSH
5459 xmlGenericError(xmlGenericErrorContext,
5460 "HPP: entering CONTENT\n");
5461#endif
5462 break;
5463 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005464 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5465 "HPP: internal error, state == ENTITY_VALUE\n",
5466 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005467 ctxt->instate = XML_PARSER_CONTENT;
5468 ctxt->checkIndex = 0;
5469#ifdef DEBUG_PUSH
5470 xmlGenericError(xmlGenericErrorContext,
5471 "HPP: entering DTD\n");
5472#endif
5473 break;
5474 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005475 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5476 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5477 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005478 ctxt->instate = XML_PARSER_START_TAG;
5479 ctxt->checkIndex = 0;
5480#ifdef DEBUG_PUSH
5481 xmlGenericError(xmlGenericErrorContext,
5482 "HPP: entering START_TAG\n");
5483#endif
5484 break;
5485 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005486 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5487 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5488 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005489 ctxt->instate = XML_PARSER_CONTENT;
5490 ctxt->checkIndex = 0;
5491#ifdef DEBUG_PUSH
5492 xmlGenericError(xmlGenericErrorContext,
5493 "HPP: entering CONTENT\n");
5494#endif
5495 break;
5496 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005497 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5498 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5499 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005500 ctxt->instate = XML_PARSER_CONTENT;
5501 ctxt->checkIndex = 0;
5502#ifdef DEBUG_PUSH
5503 xmlGenericError(xmlGenericErrorContext,
5504 "HPP: entering CONTENT\n");
5505#endif
5506 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005507 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005508 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5509 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5510 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005511 ctxt->instate = XML_PARSER_CONTENT;
5512 ctxt->checkIndex = 0;
5513#ifdef DEBUG_PUSH
5514 xmlGenericError(xmlGenericErrorContext,
5515 "HPP: entering CONTENT\n");
5516#endif
5517 break;
5518
Owen Taylor3473f882001-02-23 17:55:21 +00005519 }
5520 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005521done:
Owen Taylor3473f882001-02-23 17:55:21 +00005522 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005523 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005524 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005525 /*
5526 * SAX: end of the document processing.
5527 */
5528 ctxt->instate = XML_PARSER_EOF;
5529 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5530 ctxt->sax->endDocument(ctxt->userData);
5531 }
5532 }
5533 if ((ctxt->myDoc != NULL) &&
5534 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5535 (ctxt->instate == XML_PARSER_EPILOG))) {
5536 xmlDtdPtr dtd;
5537 dtd = xmlGetIntSubset(ctxt->myDoc);
5538 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02005539 ctxt->myDoc->intSubset =
5540 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005541 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5542 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5543 }
5544#ifdef DEBUG_PUSH
5545 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5546#endif
5547 return(ret);
5548}
5549
5550/**
Owen Taylor3473f882001-02-23 17:55:21 +00005551 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005552 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005553 * @chunk: an char array
5554 * @size: the size in byte of the chunk
5555 * @terminate: last chunk indicator
5556 *
5557 * Parse a Chunk of memory
5558 *
5559 * Returns zero if no error, the xmlParserErrors otherwise.
5560 */
5561int
5562htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5563 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005564 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5565 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5566 "htmlParseChunk: context error\n", NULL, NULL);
5567 return(XML_ERR_INTERNAL_ERROR);
5568 }
Owen Taylor3473f882001-02-23 17:55:21 +00005569 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5570 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5571 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5572 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005573 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02005574
5575 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00005576 if (res < 0) {
5577 ctxt->errNo = XML_PARSER_EOF;
5578 ctxt->disableSAX = 1;
5579 return (XML_PARSER_EOF);
5580 }
Owen Taylor3473f882001-02-23 17:55:21 +00005581 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5582 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005583 ctxt->input->end =
5584 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005585#ifdef DEBUG_PUSH
5586 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5587#endif
5588
Daniel Veillard14f752c2003-08-09 11:44:50 +00005589#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005590 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5591 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005592#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005593 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005594 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5595 xmlParserInputBufferPtr in = ctxt->input->buf;
5596 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5597 (in->raw != NULL)) {
5598 int nbchars;
Daniel Veillarde77db162009-08-22 11:32:38 +02005599
Daniel Veillard14f752c2003-08-09 11:44:50 +00005600 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5601 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005602 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5603 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005604 return(XML_ERR_INVALID_ENCODING);
5605 }
5606 }
5607 }
Owen Taylor3473f882001-02-23 17:55:21 +00005608 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005609 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005610 if (terminate) {
5611 if ((ctxt->instate != XML_PARSER_EOF) &&
5612 (ctxt->instate != XML_PARSER_EPILOG) &&
5613 (ctxt->instate != XML_PARSER_MISC)) {
5614 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005615 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02005616 }
Owen Taylor3473f882001-02-23 17:55:21 +00005617 if (ctxt->instate != XML_PARSER_EOF) {
5618 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5619 ctxt->sax->endDocument(ctxt->userData);
5620 }
5621 ctxt->instate = XML_PARSER_EOF;
5622 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005623 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00005624}
5625
5626/************************************************************************
5627 * *
5628 * User entry points *
5629 * *
5630 ************************************************************************/
5631
5632/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005633 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005634 * @sax: a SAX handler
5635 * @user_data: The user data returned on SAX callbacks
5636 * @chunk: a pointer to an array of chars
5637 * @size: number of chars in the array
5638 * @filename: an optional file name or URI
5639 * @enc: an optional encoding
5640 *
5641 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005642 * The value of @filename is used for fetching external entities
5643 * and error/warning reports.
5644 *
5645 * Returns the new parser context or NULL
5646 */
5647htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005648htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00005649 const char *chunk, int size, const char *filename,
5650 xmlCharEncoding enc) {
5651 htmlParserCtxtPtr ctxt;
5652 htmlParserInputPtr inputStream;
5653 xmlParserInputBufferPtr buf;
5654
Daniel Veillardd0463562001-10-13 09:15:48 +00005655 xmlInitParser();
5656
Owen Taylor3473f882001-02-23 17:55:21 +00005657 buf = xmlAllocParserInputBuffer(enc);
5658 if (buf == NULL) return(NULL);
5659
Daniel Veillardf403d292003-10-05 13:51:35 +00005660 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005661 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005662 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005663 return(NULL);
5664 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005665 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5666 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005667 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005668 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005669 xmlFree(ctxt->sax);
5670 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5671 if (ctxt->sax == NULL) {
5672 xmlFree(buf);
5673 xmlFree(ctxt);
5674 return(NULL);
5675 }
5676 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5677 if (user_data != NULL)
5678 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02005679 }
Owen Taylor3473f882001-02-23 17:55:21 +00005680 if (filename == NULL) {
5681 ctxt->directory = NULL;
5682 } else {
5683 ctxt->directory = xmlParserGetDirectory(filename);
5684 }
5685
5686 inputStream = htmlNewInputStream(ctxt);
5687 if (inputStream == NULL) {
5688 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005689 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005690 return(NULL);
5691 }
5692
5693 if (filename == NULL)
5694 inputStream->filename = NULL;
5695 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005696 inputStream->filename = (char *)
5697 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005698 inputStream->buf = buf;
5699 inputStream->base = inputStream->buf->buffer->content;
5700 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillarde77db162009-08-22 11:32:38 +02005701 inputStream->end =
Daniel Veillard5f704af2003-03-05 10:01:43 +00005702 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005703
5704 inputPush(ctxt, inputStream);
5705
5706 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02005707 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005708 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5709 int cur = ctxt->input->cur - ctxt->input->base;
5710
Daniel Veillarde77db162009-08-22 11:32:38 +02005711 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005712
5713 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5714 ctxt->input->cur = ctxt->input->base + cur;
5715 ctxt->input->end =
5716 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005717#ifdef DEBUG_PUSH
5718 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5719#endif
5720 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005721 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005722
5723 return(ctxt);
5724}
William M. Brack21e4ef22005-01-02 09:53:13 +00005725#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005726
5727/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005728 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005729 * @cur: a pointer to an array of xmlChar
5730 * @encoding: a free form C string describing the HTML document encoding, or NULL
5731 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005732 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005733 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005734 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5735 * to handle parse events. If sax is NULL, fallback to the default DOM
5736 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005737 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005738 * Returns the resulting document tree unless SAX is NULL or the document is
5739 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005740 */
5741
5742htmlDocPtr
5743htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5744 htmlDocPtr ret;
5745 htmlParserCtxtPtr ctxt;
5746
Daniel Veillardd0463562001-10-13 09:15:48 +00005747 xmlInitParser();
5748
Owen Taylor3473f882001-02-23 17:55:21 +00005749 if (cur == NULL) return(NULL);
5750
5751
5752 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5753 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02005754 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005755 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005756 ctxt->sax = sax;
5757 ctxt->userData = userData;
5758 }
5759
5760 htmlParseDocument(ctxt);
5761 ret = ctxt->myDoc;
5762 if (sax != NULL) {
5763 ctxt->sax = NULL;
5764 ctxt->userData = NULL;
5765 }
5766 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005767
Owen Taylor3473f882001-02-23 17:55:21 +00005768 return(ret);
5769}
5770
5771/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005772 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005773 * @cur: a pointer to an array of xmlChar
5774 * @encoding: a free form C string describing the HTML document encoding, or NULL
5775 *
5776 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005777 *
Owen Taylor3473f882001-02-23 17:55:21 +00005778 * Returns the resulting document tree
5779 */
5780
5781htmlDocPtr
5782htmlParseDoc(xmlChar *cur, const char *encoding) {
5783 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5784}
5785
5786
5787/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005788 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005789 * @filename: the filename
5790 * @encoding: a free form C string describing the HTML document encoding, or NULL
5791 *
Daniel Veillarde77db162009-08-22 11:32:38 +02005792 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00005793 * Automatic support for ZLIB/Compress compressed document is provided
5794 * by default if found at compile-time.
5795 *
5796 * Returns the new parser context or NULL
5797 */
5798htmlParserCtxtPtr
5799htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5800{
5801 htmlParserCtxtPtr ctxt;
5802 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005803 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005804 /* htmlCharEncoding enc; */
5805 xmlChar *content, *content_line = (xmlChar *) "charset=";
5806
Daniel Veillarda03e3652004-11-02 18:45:30 +00005807 if (filename == NULL)
5808 return(NULL);
5809
Daniel Veillardf403d292003-10-05 13:51:35 +00005810 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005811 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005812 return(NULL);
5813 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005814 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5815 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005816#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005817 if (xmlDefaultSAXHandler.error != NULL) {
5818 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5819 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005820#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005821 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005822 return(NULL);
5823 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005824
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005825 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5826 xmlFree(canonicFilename);
5827 if (inputStream == NULL) {
5828 xmlFreeParserCtxt(ctxt);
5829 return(NULL);
5830 }
Owen Taylor3473f882001-02-23 17:55:21 +00005831
5832 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005833
Owen Taylor3473f882001-02-23 17:55:21 +00005834 /* set encoding */
5835 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005836 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Daniel Veillarde77db162009-08-22 11:32:38 +02005837 if (content) {
Owen Taylor3473f882001-02-23 17:55:21 +00005838 strcpy ((char *)content, (char *)content_line);
5839 strcat ((char *)content, (char *)encoding);
5840 htmlCheckEncoding (ctxt, content);
5841 xmlFree (content);
5842 }
5843 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005844
Owen Taylor3473f882001-02-23 17:55:21 +00005845 return(ctxt);
5846}
5847
5848/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005849 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005850 * @filename: the filename
5851 * @encoding: a free form C string describing the HTML document encoding, or NULL
5852 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005853 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005854 *
5855 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5856 * compressed document is provided by default if found at compile-time.
5857 * It use the given SAX function block to handle the parsing callback.
5858 * If sax is NULL, fallback to the default DOM tree building routines.
5859 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005860 * Returns the resulting document tree unless SAX is NULL or the document is
5861 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005862 */
5863
5864htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005865htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00005866 void *userData) {
5867 htmlDocPtr ret;
5868 htmlParserCtxtPtr ctxt;
5869 htmlSAXHandlerPtr oldsax = NULL;
5870
Daniel Veillardd0463562001-10-13 09:15:48 +00005871 xmlInitParser();
5872
Owen Taylor3473f882001-02-23 17:55:21 +00005873 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5874 if (ctxt == NULL) return(NULL);
5875 if (sax != NULL) {
5876 oldsax = ctxt->sax;
5877 ctxt->sax = sax;
5878 ctxt->userData = userData;
5879 }
5880
5881 htmlParseDocument(ctxt);
5882
5883 ret = ctxt->myDoc;
5884 if (sax != NULL) {
5885 ctxt->sax = oldsax;
5886 ctxt->userData = NULL;
5887 }
5888 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005889
Owen Taylor3473f882001-02-23 17:55:21 +00005890 return(ret);
5891}
5892
5893/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005894 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005895 * @filename: the filename
5896 * @encoding: a free form C string describing the HTML document encoding, or NULL
5897 *
5898 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5899 * compressed document is provided by default if found at compile-time.
5900 *
5901 * Returns the resulting document tree
5902 */
5903
5904htmlDocPtr
5905htmlParseFile(const char *filename, const char *encoding) {
5906 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5907}
5908
5909/**
5910 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02005911 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00005912 *
5913 * Set and return the previous value for handling HTML omitted tags.
5914 *
5915 * Returns the last value for 0 for no handling, 1 for auto insertion.
5916 */
5917
5918int
5919htmlHandleOmittedElem(int val) {
5920 int old = htmlOmittedDefaultValue;
5921
5922 htmlOmittedDefaultValue = val;
5923 return(old);
5924}
5925
Daniel Veillard930dfb62003-02-05 10:17:38 +00005926/**
5927 * htmlElementAllowedHere:
5928 * @parent: HTML parent element
5929 * @elt: HTML element
5930 *
5931 * Checks whether an HTML element may be a direct child of a parent element.
5932 * Note - doesn't check for deprecated elements
5933 *
5934 * Returns 1 if allowed; 0 otherwise.
5935 */
5936int
5937htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5938 const char** p ;
5939
5940 if ( ! elt || ! parent || ! parent->subelts )
5941 return 0 ;
5942
5943 for ( p = parent->subelts; *p; ++p )
5944 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5945 return 1 ;
5946
5947 return 0 ;
5948}
5949/**
5950 * htmlElementStatusHere:
5951 * @parent: HTML parent element
5952 * @elt: HTML element
5953 *
5954 * Checks whether an HTML element may be a direct child of a parent element.
5955 * and if so whether it is valid or deprecated.
5956 *
5957 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5958 */
5959htmlStatus
5960htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5961 if ( ! parent || ! elt )
5962 return HTML_INVALID ;
5963 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5964 return HTML_INVALID ;
5965
5966 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5967}
5968/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005969 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005970 * @elt: HTML element
5971 * @attr: HTML attribute
5972 * @legacy: whether to allow deprecated attributes
5973 *
5974 * Checks whether an attribute is valid for an element
5975 * Has full knowledge of Required and Deprecated attributes
5976 *
5977 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5978 */
5979htmlStatus
5980htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5981 const char** p ;
5982
5983 if ( !elt || ! attr )
5984 return HTML_INVALID ;
5985
5986 if ( elt->attrs_req )
5987 for ( p = elt->attrs_req; *p; ++p)
5988 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5989 return HTML_REQUIRED ;
5990
5991 if ( elt->attrs_opt )
5992 for ( p = elt->attrs_opt; *p; ++p)
5993 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5994 return HTML_VALID ;
5995
5996 if ( legacy && elt->attrs_depr )
5997 for ( p = elt->attrs_depr; *p; ++p)
5998 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5999 return HTML_DEPRECATED ;
6000
6001 return HTML_INVALID ;
6002}
6003/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006004 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00006005 * @node: an htmlNodePtr in a tree
6006 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00006007 * for Element nodes)
6008 *
6009 * Checks whether the tree node is valid. Experimental (the author
6010 * only uses the HTML enhancements in a SAX parser)
6011 *
6012 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6013 * legacy allowed) or htmlElementStatusHere (otherwise).
6014 * for Attribute nodes, a return from htmlAttrAllowed
6015 * for other nodes, HTML_NA (no checks performed)
6016 */
6017htmlStatus
6018htmlNodeStatus(const htmlNodePtr node, int legacy) {
6019 if ( ! node )
6020 return HTML_INVALID ;
6021
6022 switch ( node->type ) {
6023 case XML_ELEMENT_NODE:
6024 return legacy
6025 ? ( htmlElementAllowedHere (
6026 htmlTagLookup(node->parent->name) , node->name
6027 ) ? HTML_VALID : HTML_INVALID )
6028 : htmlElementStatusHere(
6029 htmlTagLookup(node->parent->name) ,
6030 htmlTagLookup(node->name) )
6031 ;
6032 case XML_ATTRIBUTE_NODE:
6033 return htmlAttrAllowed(
6034 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6035 default: return HTML_NA ;
6036 }
6037}
Daniel Veillard9475a352003-09-26 12:47:50 +00006038/************************************************************************
6039 * *
6040 * New set (2.6.0) of simpler and more flexible APIs *
6041 * *
6042 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  Wrapped in do { } while (0) so that the macro expands
 * to a single statement and is safe inside if/else constructs.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6054
6055/**
6056 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00006057 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00006058 *
6059 * Reset a parser context
6060 */
6061void
6062htmlCtxtReset(htmlParserCtxtPtr ctxt)
6063{
6064 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00006065 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02006066
Daniel Veillarda03e3652004-11-02 18:45:30 +00006067 if (ctxt == NULL)
6068 return;
6069
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006070 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00006071 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00006072
6073 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6074 xmlFreeInputStream(input);
6075 }
6076 ctxt->inputNr = 0;
6077 ctxt->input = NULL;
6078
6079 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00006080 if (ctxt->spaceTab != NULL) {
6081 ctxt->spaceTab[0] = -1;
6082 ctxt->space = &ctxt->spaceTab[0];
6083 } else {
6084 ctxt->space = NULL;
6085 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006086
6087
6088 ctxt->nodeNr = 0;
6089 ctxt->node = NULL;
6090
6091 ctxt->nameNr = 0;
6092 ctxt->name = NULL;
6093
6094 DICT_FREE(ctxt->version);
6095 ctxt->version = NULL;
6096 DICT_FREE(ctxt->encoding);
6097 ctxt->encoding = NULL;
6098 DICT_FREE(ctxt->directory);
6099 ctxt->directory = NULL;
6100 DICT_FREE(ctxt->extSubURI);
6101 ctxt->extSubURI = NULL;
6102 DICT_FREE(ctxt->extSubSystem);
6103 ctxt->extSubSystem = NULL;
6104 if (ctxt->myDoc != NULL)
6105 xmlFreeDoc(ctxt->myDoc);
6106 ctxt->myDoc = NULL;
6107
6108 ctxt->standalone = -1;
6109 ctxt->hasExternalSubset = 0;
6110 ctxt->hasPErefs = 0;
6111 ctxt->html = 1;
6112 ctxt->external = 0;
6113 ctxt->instate = XML_PARSER_START;
6114 ctxt->token = 0;
6115
6116 ctxt->wellFormed = 1;
6117 ctxt->nsWellFormed = 1;
6118 ctxt->valid = 1;
6119 ctxt->vctxt.userData = ctxt;
6120 ctxt->vctxt.error = xmlParserValidityError;
6121 ctxt->vctxt.warning = xmlParserValidityWarning;
6122 ctxt->record_info = 0;
6123 ctxt->nbChars = 0;
6124 ctxt->checkIndex = 0;
6125 ctxt->inSubset = 0;
6126 ctxt->errNo = XML_ERR_OK;
6127 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006128 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006129 ctxt->catalogs = NULL;
6130 xmlInitNodeInfoSeq(&ctxt->node_seq);
6131
6132 if (ctxt->attsDefault != NULL) {
6133 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6134 ctxt->attsDefault = NULL;
6135 }
6136 if (ctxt->attsSpecial != NULL) {
6137 xmlHashFree(ctxt->attsSpecial, NULL);
6138 ctxt->attsSpecial = NULL;
6139 }
6140}
6141
6142/**
6143 * htmlCtxtUseOptions:
6144 * @ctxt: an HTML parser context
6145 * @options: a combination of htmlParserOption(s)
6146 *
6147 * Applies the options to the parser context
6148 *
6149 * Returns 0 in case of success, the set of unknown or unimplemented options
6150 * in case of error.
6151 */
6152int
6153htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6154{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006155 if (ctxt == NULL)
6156 return(-1);
6157
Daniel Veillard9475a352003-09-26 12:47:50 +00006158 if (options & HTML_PARSE_NOWARNING) {
6159 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006160 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006161 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006162 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006163 }
6164 if (options & HTML_PARSE_NOERROR) {
6165 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006166 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006167 ctxt->sax->fatalError = NULL;
6168 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006169 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006170 }
6171 if (options & HTML_PARSE_PEDANTIC) {
6172 ctxt->pedantic = 1;
6173 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006174 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006175 } else
6176 ctxt->pedantic = 0;
6177 if (options & XML_PARSE_NOBLANKS) {
6178 ctxt->keepBlanks = 0;
6179 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6180 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006181 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006182 } else
6183 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006184 if (options & HTML_PARSE_RECOVER) {
6185 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006186 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006187 } else
6188 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006189 if (options & HTML_PARSE_COMPACT) {
6190 ctxt->options |= HTML_PARSE_COMPACT;
6191 options -= HTML_PARSE_COMPACT;
6192 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006193 if (options & XML_PARSE_HUGE) {
6194 ctxt->options |= XML_PARSE_HUGE;
6195 options -= XML_PARSE_HUGE;
6196 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006197 ctxt->dictNames = 0;
6198 return (options);
6199}
6200
6201/**
6202 * htmlDoRead:
6203 * @ctxt: an HTML parser context
6204 * @URL: the base URL to use for the document
6205 * @encoding: the document encoding, or NULL
6206 * @options: a combination of htmlParserOption(s)
6207 * @reuse: keep the context for reuse
6208 *
6209 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006210 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006211 * Returns the resulting document tree or NULL
6212 */
6213static htmlDocPtr
6214htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6215 int options, int reuse)
6216{
6217 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006218
Daniel Veillard9475a352003-09-26 12:47:50 +00006219 htmlCtxtUseOptions(ctxt, options);
6220 ctxt->html = 1;
6221 if (encoding != NULL) {
6222 xmlCharEncodingHandlerPtr hdlr;
6223
6224 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006225 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006226 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006227 if (ctxt->input->encoding != NULL)
6228 xmlFree((xmlChar *) ctxt->input->encoding);
6229 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6230 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006231 }
6232 if ((URL != NULL) && (ctxt->input != NULL) &&
6233 (ctxt->input->filename == NULL))
6234 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6235 htmlParseDocument(ctxt);
6236 ret = ctxt->myDoc;
6237 ctxt->myDoc = NULL;
6238 if (!reuse) {
6239 if ((ctxt->dictNames) &&
6240 (ret != NULL) &&
6241 (ret->dict == ctxt->dict))
6242 ctxt->dict = NULL;
6243 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006244 }
6245 return (ret);
6246}
6247
6248/**
6249 * htmlReadDoc:
6250 * @cur: a pointer to a zero terminated string
6251 * @URL: the base URL to use for the document
6252 * @encoding: the document encoding, or NULL
6253 * @options: a combination of htmlParserOption(s)
6254 *
6255 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006256 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006257 * Returns the resulting document tree
6258 */
6259htmlDocPtr
6260htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6261{
6262 htmlParserCtxtPtr ctxt;
6263
6264 if (cur == NULL)
6265 return (NULL);
6266
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006267 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006268 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006269 if (ctxt == NULL)
6270 return (NULL);
6271 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6272}
6273
6274/**
6275 * htmlReadFile:
6276 * @filename: a file or URL
6277 * @encoding: the document encoding, or NULL
6278 * @options: a combination of htmlParserOption(s)
6279 *
6280 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006281 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006282 * Returns the resulting document tree
6283 */
6284htmlDocPtr
6285htmlReadFile(const char *filename, const char *encoding, int options)
6286{
6287 htmlParserCtxtPtr ctxt;
6288
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006289 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006290 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6291 if (ctxt == NULL)
6292 return (NULL);
6293 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6294}
6295
6296/**
6297 * htmlReadMemory:
6298 * @buffer: a pointer to a char array
6299 * @size: the size of the array
6300 * @URL: the base URL to use for the document
6301 * @encoding: the document encoding, or NULL
6302 * @options: a combination of htmlParserOption(s)
6303 *
6304 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006305 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006306 * Returns the resulting document tree
6307 */
6308htmlDocPtr
6309htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6310{
6311 htmlParserCtxtPtr ctxt;
6312
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006313 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006314 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6315 if (ctxt == NULL)
6316 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006317 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006318 if (ctxt->sax != NULL)
6319 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006320 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6321}
6322
6323/**
6324 * htmlReadFd:
6325 * @fd: an open file descriptor
6326 * @URL: the base URL to use for the document
6327 * @encoding: the document encoding, or NULL
6328 * @options: a combination of htmlParserOption(s)
6329 *
6330 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006331 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006332 * Returns the resulting document tree
6333 */
6334htmlDocPtr
6335htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6336{
6337 htmlParserCtxtPtr ctxt;
6338 xmlParserInputBufferPtr input;
6339 xmlParserInputPtr stream;
6340
6341 if (fd < 0)
6342 return (NULL);
6343
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006344 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006345 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6346 if (input == NULL)
6347 return (NULL);
6348 ctxt = xmlNewParserCtxt();
6349 if (ctxt == NULL) {
6350 xmlFreeParserInputBuffer(input);
6351 return (NULL);
6352 }
6353 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6354 if (stream == NULL) {
6355 xmlFreeParserInputBuffer(input);
6356 xmlFreeParserCtxt(ctxt);
6357 return (NULL);
6358 }
6359 inputPush(ctxt, stream);
6360 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6361}
6362
6363/**
6364 * htmlReadIO:
6365 * @ioread: an I/O read function
6366 * @ioclose: an I/O close function
6367 * @ioctx: an I/O handler
6368 * @URL: the base URL to use for the document
6369 * @encoding: the document encoding, or NULL
6370 * @options: a combination of htmlParserOption(s)
6371 *
6372 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006373 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006374 * Returns the resulting document tree
6375 */
6376htmlDocPtr
6377htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6378 void *ioctx, const char *URL, const char *encoding, int options)
6379{
6380 htmlParserCtxtPtr ctxt;
6381 xmlParserInputBufferPtr input;
6382 xmlParserInputPtr stream;
6383
6384 if (ioread == NULL)
6385 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006386 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006387
6388 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6389 XML_CHAR_ENCODING_NONE);
6390 if (input == NULL)
6391 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006392 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006393 if (ctxt == NULL) {
6394 xmlFreeParserInputBuffer(input);
6395 return (NULL);
6396 }
6397 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6398 if (stream == NULL) {
6399 xmlFreeParserInputBuffer(input);
6400 xmlFreeParserCtxt(ctxt);
6401 return (NULL);
6402 }
6403 inputPush(ctxt, stream);
6404 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6405}
6406
6407/**
6408 * htmlCtxtReadDoc:
6409 * @ctxt: an HTML parser context
6410 * @cur: a pointer to a zero terminated string
6411 * @URL: the base URL to use for the document
6412 * @encoding: the document encoding, or NULL
6413 * @options: a combination of htmlParserOption(s)
6414 *
6415 * parse an XML in-memory document and build a tree.
6416 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006417 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006418 * Returns the resulting document tree
6419 */
6420htmlDocPtr
6421htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6422 const char *URL, const char *encoding, int options)
6423{
6424 xmlParserInputPtr stream;
6425
6426 if (cur == NULL)
6427 return (NULL);
6428 if (ctxt == NULL)
6429 return (NULL);
6430
6431 htmlCtxtReset(ctxt);
6432
6433 stream = xmlNewStringInputStream(ctxt, cur);
6434 if (stream == NULL) {
6435 return (NULL);
6436 }
6437 inputPush(ctxt, stream);
6438 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6439}
6440
6441/**
6442 * htmlCtxtReadFile:
6443 * @ctxt: an HTML parser context
6444 * @filename: a file or URL
6445 * @encoding: the document encoding, or NULL
6446 * @options: a combination of htmlParserOption(s)
6447 *
6448 * parse an XML file from the filesystem or the network.
6449 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006450 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006451 * Returns the resulting document tree
6452 */
6453htmlDocPtr
6454htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6455 const char *encoding, int options)
6456{
6457 xmlParserInputPtr stream;
6458
6459 if (filename == NULL)
6460 return (NULL);
6461 if (ctxt == NULL)
6462 return (NULL);
6463
6464 htmlCtxtReset(ctxt);
6465
Daniel Veillard29614c72004-11-26 10:47:26 +00006466 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006467 if (stream == NULL) {
6468 return (NULL);
6469 }
6470 inputPush(ctxt, stream);
6471 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6472}
6473
6474/**
6475 * htmlCtxtReadMemory:
6476 * @ctxt: an HTML parser context
6477 * @buffer: a pointer to a char array
6478 * @size: the size of the array
6479 * @URL: the base URL to use for the document
6480 * @encoding: the document encoding, or NULL
6481 * @options: a combination of htmlParserOption(s)
6482 *
6483 * parse an XML in-memory document and build a tree.
6484 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006485 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006486 * Returns the resulting document tree
6487 */
6488htmlDocPtr
6489htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6490 const char *URL, const char *encoding, int options)
6491{
6492 xmlParserInputBufferPtr input;
6493 xmlParserInputPtr stream;
6494
6495 if (ctxt == NULL)
6496 return (NULL);
6497 if (buffer == NULL)
6498 return (NULL);
6499
6500 htmlCtxtReset(ctxt);
6501
6502 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6503 if (input == NULL) {
6504 return(NULL);
6505 }
6506
6507 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6508 if (stream == NULL) {
6509 xmlFreeParserInputBuffer(input);
6510 return(NULL);
6511 }
6512
6513 inputPush(ctxt, stream);
6514 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6515}
6516
6517/**
6518 * htmlCtxtReadFd:
6519 * @ctxt: an HTML parser context
6520 * @fd: an open file descriptor
6521 * @URL: the base URL to use for the document
6522 * @encoding: the document encoding, or NULL
6523 * @options: a combination of htmlParserOption(s)
6524 *
6525 * parse an XML from a file descriptor and build a tree.
6526 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006527 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006528 * Returns the resulting document tree
6529 */
6530htmlDocPtr
6531htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6532 const char *URL, const char *encoding, int options)
6533{
6534 xmlParserInputBufferPtr input;
6535 xmlParserInputPtr stream;
6536
6537 if (fd < 0)
6538 return (NULL);
6539 if (ctxt == NULL)
6540 return (NULL);
6541
6542 htmlCtxtReset(ctxt);
6543
6544
6545 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6546 if (input == NULL)
6547 return (NULL);
6548 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6549 if (stream == NULL) {
6550 xmlFreeParserInputBuffer(input);
6551 return (NULL);
6552 }
6553 inputPush(ctxt, stream);
6554 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6555}
6556
6557/**
6558 * htmlCtxtReadIO:
6559 * @ctxt: an HTML parser context
6560 * @ioread: an I/O read function
6561 * @ioclose: an I/O close function
6562 * @ioctx: an I/O handler
6563 * @URL: the base URL to use for the document
6564 * @encoding: the document encoding, or NULL
6565 * @options: a combination of htmlParserOption(s)
6566 *
6567 * parse an HTML document from I/O functions and source and build a tree.
6568 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006569 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006570 * Returns the resulting document tree
6571 */
6572htmlDocPtr
6573htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6574 xmlInputCloseCallback ioclose, void *ioctx,
6575 const char *URL,
6576 const char *encoding, int options)
6577{
6578 xmlParserInputBufferPtr input;
6579 xmlParserInputPtr stream;
6580
6581 if (ioread == NULL)
6582 return (NULL);
6583 if (ctxt == NULL)
6584 return (NULL);
6585
6586 htmlCtxtReset(ctxt);
6587
6588 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6589 XML_CHAR_ENCODING_NONE);
6590 if (input == NULL)
6591 return (NULL);
6592 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6593 if (stream == NULL) {
6594 xmlFreeParserInputBuffer(input);
6595 return (NULL);
6596 }
6597 inputPush(ctxt, stream);
6598 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6599}
6600
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006601#define bottom_HTMLparser
6602#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006603#endif /* LIBXML_HTML_ENABLED */