blob: da3b173dce7c4629800a85a2b6e6777a4ea63644 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020062 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000063 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200150 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167 ctxt->html = 3;
168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 if (ctxt->nameNr >= ctxt->nameMax) {
171 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000172 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000173 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 ctxt->nameMax *
175 sizeof(ctxt->nameTab[0]));
176 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000177 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000178 return (0);
179 }
180 }
181 ctxt->nameTab[ctxt->nameNr] = value;
182 ctxt->name = value;
183 return (ctxt->nameNr++);
184}
185/**
186 * htmlnamePop:
187 * @ctxt: an HTML parser context
188 *
189 * Pops the top element name from the name stack
190 *
191 * Returns the name just removed
192 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000193static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194htmlnamePop(htmlParserCtxtPtr ctxt)
195{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000197
Daniel Veillard1c732d22002-11-30 11:22:59 +0000198 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000199 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000200 ctxt->nameNr--;
201 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 if (ctxt->nameNr > 0)
204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205 else
206 ctxt->name = NULL;
207 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000208 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000209 return (ret);
210}
Owen Taylor3473f882001-02-23 17:55:21 +0000211
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * All of them expect a parser context variable named "ctxt" to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them:
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the current xmlChar value, as an int. It reads a single
 *            xmlChar without any decoding, so it should be used only to
 *            compare to ASCII values, otherwise it would break when
 *            running with UTF-8 encoding.
 *   RAW      same as CUR, except that it yields -1 while ctxt->token is set.
 *   UPPER    the current xmlChar converted to uppercase with toupper().
 *   NXT(n)   the n'th next xmlChar. Same as CUR, it should be used only
 *            to compare on ASCII based substrings.
 *   UPP(n)   the n'th next xmlChar converted to uppercase. Same as CUR,
 *            it should be used only to compare on ASCII based substrings.
 *   SKIP(n)  skip n xmlChars, advancing nbChars and the column by n. The
 *            line number is not touched, so it must only be used to skip
 *            ASCII defined strings without newlines.
 *   SHRINK, GROW  conditionally shrink/grow the input buffer. Both expand
 *            to a bare "if" statement: use them only as a full statement
 *            ("GROW;"), never right before an "else".
 *
 * Clean macros, not dependent on an ASCII context:
 *
 *   CURRENT  the current xmlChar value, as an int.
 *   NEXT     skip to the next character, through xmlNextChar().
 *   NEXTL(l) skip the current character which is l xmlChars long,
 *            updating line/column and clearing ctxt->token.
 *   CUR_CHAR(l)       the current character through htmlCurrentChar(),
 *                     its length being stored in l.
 *   CUR_SCHAR(s, l)   same thing on the string s through
 *                     xmlStringCurrentChar().
 *   COPY_BUF(l,b,i,v) append the character v, which is l xmlChars long,
 *                     to the buffer b at index i and advance i.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * The column (or the line on '\n') and nbChars advance by exactly one
 * character, whatever the byte length l of that character is.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/* Wrapped in do/while so that it behaves as one statement everywhere. */
#define COPY_BUF(l,b,i,v)						\
    do {								\
	if ((l) == 1) (b)[(i)++] = (xmlChar) (v);			\
	else (i) += xmlCopyChar((l),&(b)[(i)],(v));			\
    } while (0)
293
294/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200295 * htmlFindEncoding:
296 * @the HTML parser context
297 *
298 * Ty to find and encoding in the current data available in the input
299 * buffer this is needed to try to switch to the proper encoding when
300 * one face a character error.
301 * That's an heuristic, since it's operating outside of parsing it could
302 * try to use a meta which had been commented out, that's the reason it
303 * should only be used in case of error, not as a default.
304 *
305 * Returns an encoding string or NULL if not found, the string need to
306 * be freed
307 */
308static xmlChar *
309htmlFindEncoding(xmlParserCtxtPtr ctxt) {
310 const xmlChar *start, *cur, *end;
311
312 if ((ctxt == NULL) || (ctxt->input == NULL) ||
313 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
314 (ctxt->input->buf->encoder != NULL))
315 return(NULL);
316 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
317 return(NULL);
318
319 start = ctxt->input->cur;
320 end = ctxt->input->end;
321 /* we also expect the input buffer to be zero terminated */
322 if (*end != 0)
323 return(NULL);
324
325 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
326 if (cur == NULL)
327 return(NULL);
328 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
329 if (cur == NULL)
330 return(NULL);
331 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
332 if (cur == NULL)
333 return(NULL);
334 cur += 8;
335 start = cur;
336 while (((*cur >= 'A') && (*cur <= 'Z')) ||
337 ((*cur >= 'a') && (*cur <= 'z')) ||
338 ((*cur >= '0') && (*cur <= '9')) ||
339 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
340 cur++;
341 if (cur == start)
342 return(NULL);
343 return(xmlStrndup(start, cur - start));
344}
345
346/**
Owen Taylor3473f882001-02-23 17:55:21 +0000347 * htmlCurrentChar:
348 * @ctxt: the HTML parser context
349 * @len: pointer to the length of the char read
350 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000351 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000352 * bytes in the input buffer. Implement the end of line normalization:
353 * 2.11 End-of-Line Handling
354 * If the encoding is unspecified, in the case we find an ISO-Latin-1
355 * char, then the encoding converter is plugged in automatically.
356 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000357 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000358 */
359
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000360static int
Owen Taylor3473f882001-02-23 17:55:21 +0000361htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
362 if (ctxt->instate == XML_PARSER_EOF)
363 return(0);
364
365 if (ctxt->token != 0) {
366 *len = 0;
367 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200368 }
Owen Taylor3473f882001-02-23 17:55:21 +0000369 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
370 /*
371 * We are supposed to handle UTF8, check it's valid
372 * From rfc2044: encoding of the Unicode values on UTF-8:
373 *
374 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
375 * 0000 0000-0000 007F 0xxxxxxx
376 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200377 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000378 *
379 * Check for the 0x110000 limit too
380 */
381 const unsigned char *cur = ctxt->input->cur;
382 unsigned char c;
383 unsigned int val;
384
385 c = *cur;
386 if (c & 0x80) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200387 if (cur[1] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000388 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200389 cur = ctxt->input->cur;
390 }
Owen Taylor3473f882001-02-23 17:55:21 +0000391 if ((cur[1] & 0xc0) != 0x80)
392 goto encoding_error;
393 if ((c & 0xe0) == 0xe0) {
394
Adiel Mittmann8a103792009-08-25 11:27:13 +0200395 if (cur[2] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000396 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200397 cur = ctxt->input->cur;
398 }
Owen Taylor3473f882001-02-23 17:55:21 +0000399 if ((cur[2] & 0xc0) != 0x80)
400 goto encoding_error;
401 if ((c & 0xf0) == 0xf0) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200402 if (cur[3] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000403 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200404 cur = ctxt->input->cur;
405 }
Owen Taylor3473f882001-02-23 17:55:21 +0000406 if (((c & 0xf8) != 0xf0) ||
407 ((cur[3] & 0xc0) != 0x80))
408 goto encoding_error;
409 /* 4-byte code */
410 *len = 4;
411 val = (cur[0] & 0x7) << 18;
412 val |= (cur[1] & 0x3f) << 12;
413 val |= (cur[2] & 0x3f) << 6;
414 val |= cur[3] & 0x3f;
415 } else {
416 /* 3-byte code */
417 *len = 3;
418 val = (cur[0] & 0xf) << 12;
419 val |= (cur[1] & 0x3f) << 6;
420 val |= cur[2] & 0x3f;
421 }
422 } else {
423 /* 2-byte code */
424 *len = 2;
425 val = (cur[0] & 0x1f) << 6;
426 val |= cur[1] & 0x3f;
427 }
428 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000429 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
430 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200431 }
Owen Taylor3473f882001-02-23 17:55:21 +0000432 return(val);
433 } else {
Daniel Veillard856c6682009-08-24 18:16:56 +0200434 if ((*ctxt->input->cur == 0) &&
435 (ctxt->input->cur < ctxt->input->end)) {
436 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
437 "Char 0x%X out of allowed range\n", 0);
438 *len = 1;
439 return(' ');
440 }
Owen Taylor3473f882001-02-23 17:55:21 +0000441 /* 1-byte code */
442 *len = 1;
443 return((int) *ctxt->input->cur);
444 }
445 }
446 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000447 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000448 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000449 * XML constructs only use < 128 chars
450 */
451 *len = 1;
452 if ((int) *ctxt->input->cur < 0x80)
453 return((int) *ctxt->input->cur);
454
455 /*
456 * Humm this is bad, do an automatic flow conversion
457 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200458 {
459 xmlChar * guess;
460 xmlCharEncodingHandlerPtr handler;
461
462 guess = htmlFindEncoding(ctxt);
463 if (guess == NULL) {
464 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
465 } else {
466 if (ctxt->input->encoding != NULL)
467 xmlFree((xmlChar *) ctxt->input->encoding);
468 ctxt->input->encoding = guess;
469 handler = xmlFindCharEncodingHandler((const char *) guess);
470 if (handler != NULL) {
471 xmlSwitchToEncoding(ctxt, handler);
472 } else {
473 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
474 "Unsupported encoding %s", guess, NULL);
475 }
476 }
477 ctxt->charset = XML_CHAR_ENCODING_UTF8;
478 }
479
Owen Taylor3473f882001-02-23 17:55:21 +0000480 return(xmlCurrentChar(ctxt, len));
481
482encoding_error:
483 /*
484 * If we detect an UTF8 error that probably mean that the
485 * input encoding didn't get properly advertized in the
486 * declaration header. Report the error and switch the encoding
487 * to ISO-Latin-1 (if you don't like this policy, just declare the
488 * encoding !)
489 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000490 {
491 char buffer[150];
492
Daniel Veillard861101d2007-06-12 08:38:57 +0000493 if (ctxt->input->end - ctxt->input->cur >= 4) {
494 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
495 ctxt->input->cur[0], ctxt->input->cur[1],
496 ctxt->input->cur[2], ctxt->input->cur[3]);
497 } else {
498 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
499 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000500 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
501 "Input is not proper UTF-8, indicate encoding !\n",
502 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000503 }
504
Daniel Veillarde77db162009-08-22 11:32:38 +0200505 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000506 *len = 1;
507 return((int) *ctxt->input->cur);
508}
509
510/**
Owen Taylor3473f882001-02-23 17:55:21 +0000511 * htmlSkipBlankChars:
512 * @ctxt: the HTML parser context
513 *
514 * skip all blanks character found at that point in the input streams.
515 *
516 * Returns the number of space chars skipped
517 */
518
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000519static int
Owen Taylor3473f882001-02-23 17:55:21 +0000520htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
521 int res = 0;
522
William M. Brack76e95df2003-10-18 16:20:14 +0000523 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000524 if ((*ctxt->input->cur == 0) &&
525 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
526 xmlPopInput(ctxt);
527 } else {
528 if (*(ctxt->input->cur) == '\n') {
529 ctxt->input->line++; ctxt->input->col = 1;
530 } else ctxt->input->col++;
531 ctxt->input->cur++;
532 ctxt->nbChars++;
533 if (*ctxt->input->cur == 0)
534 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
535 }
536 res++;
537 }
538 return(res);
539}
540
541
542
543/************************************************************************
544 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200545 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000546 * *
547 ************************************************************************/
548
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000561
562/* Definitions and a couple of vars for HTML Elements */
563
564#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000565#define NB_FONTSTYLE 8
Daniel Veillard930dfb62003-02-05 10:17:38 +0000566#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000567#define NB_PHRASE 10
Daniel Veillard491e58e2007-05-02 16:15:18 +0000568#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
569#define NB_SPECIAL 16
Eugene Pimenov4b41f152010-01-20 14:25:59 +0100570#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000571#define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
Eugene Pimenov4b41f152010-01-20 14:25:59 +0100572#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000573#define NB_BLOCK NB_HEADING + NB_LIST + 14
Daniel Veillard930dfb62003-02-05 10:17:38 +0000574#define FORMCTRL "input", "select", "textarea", "label", "button"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000575#define NB_FORMCTRL 5
Daniel Veillard930dfb62003-02-05 10:17:38 +0000576#define PCDATA
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000577#define NB_PCDATA 0
Daniel Veillard930dfb62003-02-05 10:17:38 +0000578#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000579#define NB_HEADING 6
Daniel Veillard930dfb62003-02-05 10:17:38 +0000580#define LIST "ul", "ol", "dir", "menu"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000581#define NB_LIST 4
Daniel Veillard930dfb62003-02-05 10:17:38 +0000582#define MODIFIER
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000583#define NB_MODIFIER 0
Daniel Veillard930dfb62003-02-05 10:17:38 +0000584#define FLOW BLOCK,INLINE
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000585#define NB_FLOW NB_BLOCK + NB_INLINE
Daniel Veillard930dfb62003-02-05 10:17:38 +0000586#define EMPTY NULL
587
588
Daniel Veillard065abe82006-07-03 08:55:04 +0000589static const char* const html_flow[] = { FLOW, NULL } ;
590static const char* const html_inline[] = { INLINE, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000591
592/* placeholders: elts with content but no subelements */
Daniel Veillard065abe82006-07-03 08:55:04 +0000593static const char* const html_pcdata[] = { NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000594#define html_cdata html_pcdata
595
596
597/* ... and for HTML Attributes */
598
599#define COREATTRS "id", "class", "style", "title"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000600#define NB_COREATTRS 4
Daniel Veillard930dfb62003-02-05 10:17:38 +0000601#define I18N "lang", "dir"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000602#define NB_I18N 2
Daniel Veillard930dfb62003-02-05 10:17:38 +0000603#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000604#define NB_EVENTS 9
Daniel Veillard930dfb62003-02-05 10:17:38 +0000605#define ATTRS COREATTRS,I18N,EVENTS
/* Number of entries in ATTRS (COREATTRS, I18N and EVENTS together). */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
Daniel Veillard930dfb62003-02-05 10:17:38 +0000607#define CELLHALIGN "align", "char", "charoff"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000608#define NB_CELLHALIGN 3
Daniel Veillard930dfb62003-02-05 10:17:38 +0000609#define CELLVALIGN "valign"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000610#define NB_CELLVALIGN 1
Daniel Veillard930dfb62003-02-05 10:17:38 +0000611
Daniel Veillard065abe82006-07-03 08:55:04 +0000612static const char* const html_attrs[] = { ATTRS, NULL } ;
613static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
614static const char* const core_attrs[] = { COREATTRS, NULL } ;
615static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000616
617
618/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000619static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000620 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
621 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000622static const char* const target_attr[] = { "target", NULL } ;
623static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
624static const char* const alt_attr[] = { "alt", NULL } ;
625static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
626static const char* const href_attrs[] = { "href", NULL } ;
627static const char* const clear_attrs[] = { "clear", NULL } ;
628static const char* const inline_p[] = { INLINE, "p", NULL } ;
629
630static const char* const flow_param[] = { FLOW, "param", NULL } ;
631static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000632 "archive", "alt", "name", "height", "width", "align",
633 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000634static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000635 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000636static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000637 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000638static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
639static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
640static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
641static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000642 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000643static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000644 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
645
646
Daniel Veillard065abe82006-07-03 08:55:04 +0000647static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
648static const char* const col_elt[] = { "col", NULL } ;
649static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
650static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
651static const char* const dl_contents[] = { "dt", "dd", NULL } ;
652static const char* const compact_attr[] = { "compact", NULL } ;
653static const char* const label_attr[] = { "label", NULL } ;
654static const char* const fieldset_contents[] = { FLOW, "legend" } ;
655static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
656static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
657static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
658static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
659static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
660static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
661static const char* const head_attrs[] = { I18N, "profile", NULL } ;
662static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
663static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
664static const char* const version_attr[] = { "version", NULL } ;
665static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
666static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
667static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000668static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000669static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
670static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
671static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
672static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
673static const char* const align_attr[] = { "align", NULL } ;
674static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
675static const char* const map_contents[] = { BLOCK, "area", NULL } ;
676static const char* const name_attr[] = { "name", NULL } ;
677static const char* const action_attr[] = { "action", NULL } ;
678static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
679static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
680static const char* const content_attr[] = { "content", NULL } ;
681static const char* const type_attr[] = { "type", NULL } ;
682static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
683static const char* const object_contents[] = { FLOW, "param", NULL } ;
684static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
685static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
686static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
687static const char* const option_elt[] = { "option", NULL } ;
688static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
689static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
690static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
691static const char* const width_attr[] = { "width", NULL } ;
692static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
693static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
694static const char* const language_attr[] = { "language", NULL } ;
695static const char* const select_content[] = { "optgroup", "option", NULL } ;
696static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
697static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200698static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000699static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
700static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
701static const char* const tr_elt[] = { "tr", NULL } ;
702static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
703static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
704static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
705static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
706static const char* const tr_contents[] = { "th", "td", NULL } ;
707static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
708static const char* const li_elt[] = { "li", NULL } ;
709static const char* const ul_depr[] = { "type", "compact", NULL} ;
710static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000711
712#define DECL (const char**)
713
Daniel Veillard22090732001-07-16 00:06:07 +0000714static const htmlElemDesc
715html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000716{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
717 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
718},
719{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
720 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
721},
722{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
723 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
724},
725{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
726 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
727},
728{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
729 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
730},
731{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
732 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
733},
734{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
735 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
736},
737{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
738 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
739},
740{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
741 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
742},
743{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
744 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
745},
746{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
747 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
748},
749{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
750 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
751},
752{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
753 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
754},
755{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
756 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
757},
758{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
759 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
760},
761{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
762 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
763},
764{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
765 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
766},
767{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
768 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
769},
770{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
771 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
772},
773{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
774 EMPTY , NULL , DECL col_attrs , NULL, NULL
775},
776{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
777 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
778},
779{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
780 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
781},
782{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
783 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
784},
785{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
786 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
787},
788{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
789 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
790},
791{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
792 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
793},
794{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000795 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000796},
797{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
798 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
799},
800{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
801 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
802},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000803{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000804 EMPTY, NULL, DECL embed_attrs, NULL, NULL
805},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000806{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
807 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
808},
809{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
810 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
811},
812{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
813 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
814},
815{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
816 EMPTY, NULL, NULL, DECL frame_attrs, NULL
817},
818{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
819 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
820},
821{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
822 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
823},
824{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
825 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
826},
827{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
828 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
829},
830{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
831 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
832},
833{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
834 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
835},
836{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
837 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
838},
839{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
840 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
841},
842{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
843 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
844},
845{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
846 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
847},
848{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
849 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
850},
851{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
852 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
853},
854{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000855 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000856},
857{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
858 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
859},
860{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
861 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
862},
863{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
864 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
865},
866{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
867 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
868},
869{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
870 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
871},
872{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
873 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
874},
875{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
876 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
877},
878{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
879 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
880},
881{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000882 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000883},
884{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
885 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
886},
887{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
888 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
889},
890{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
891 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
892},
893{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
894 DECL html_flow, "div", DECL html_attrs, NULL, NULL
895},
896{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
897 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
898},
899{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
900 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
901},
902{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000903 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000904},
905{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
906 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
907},
908{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
909 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
910},
911{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000912 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000913},
914{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
915 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
916},
917{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
918 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
919},
920{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
921 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
922},
923{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
924 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
925},
926{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
927 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
928},
929{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
930 DECL select_content, NULL, DECL select_attrs, NULL, NULL
931},
932{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
933 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
934},
935{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
936 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
937},
938{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
939 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
940},
941{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
942 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
943},
944{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
945 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
946},
947{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
948 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
949},
950{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
951 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
952},
953{ "table", 0, 0, 0, 0, 0, 0, 0, "",
954 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
955},
956{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
957 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
958},
959{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
960 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
961},
962{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
963 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
964},
965{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
966 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
967},
968{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
969 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
970},
971{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
972 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
973},
974{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
975 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
976},
977{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
978 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
979},
980{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
981 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
982},
983{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
984 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
985},
986{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
987 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
988},
989{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
990 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
991}
Owen Taylor3473f882001-02-23 17:55:21 +0000992};
993
994/*
Owen Taylor3473f882001-02-23 17:55:21 +0000995 * start tags that imply the end of current element
996 */
Daniel Veillard065abe82006-07-03 08:55:04 +0000997static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000998"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
999 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1000 "listing", "xmp", "head", NULL,
1001"head", "p", NULL,
1002"title", "p", NULL,
1003"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +00001004"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001005"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1006 "pre", "listing", "xmp", "head", "li", NULL,
1007"hr", "p", "head", NULL,
1008"h1", "p", "head", NULL,
1009"h2", "p", "head", NULL,
1010"h3", "p", "head", NULL,
1011"h4", "p", "head", NULL,
1012"h5", "p", "head", NULL,
1013"h6", "p", "head", NULL,
1014"dir", "p", "head", NULL,
1015"address", "p", "head", "ul", NULL,
1016"pre", "p", "head", "ul", NULL,
1017"listing", "p", "head", NULL,
1018"xmp", "p", "head", NULL,
1019"blockquote", "p", "head", NULL,
1020"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1021 "xmp", "head", NULL,
1022"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1023 "head", "dd", NULL,
1024"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1025 "head", "dt", NULL,
1026"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1027 "listing", "xmp", NULL,
1028"ol", "p", "head", "ul", NULL,
1029"menu", "p", "head", "ul", NULL,
Daniel Veillard6339c1a2009-08-24 11:59:51 +02001030"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001031"div", "p", "head", NULL,
1032"noscript", "p", "head", NULL,
1033"center", "font", "b", "i", "p", "head", NULL,
1034"a", "a", NULL,
1035"caption", "p", NULL,
1036"colgroup", "caption", "colgroup", "col", "p", NULL,
1037"col", "caption", "col", "p", NULL,
1038"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1039 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001040"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Daniel Veillarde77db162009-08-22 11:32:38 +02001041"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001042"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1043"thead", "caption", "col", "colgroup", NULL,
1044"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1045 "tbody", "p", NULL,
1046"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1047 "tfoot", "tbody", "p", NULL,
1048"optgroup", "option", NULL,
1049"option", "option", NULL,
1050"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1051 "pre", "listing", "xmp", "a", NULL,
1052NULL
1053};
1054
/*
 * HTML elements that are not supposed to hold CDATA content directly
 * and for which a p element will be implied.  NULL-terminated.
 *
 * TODO: extend this list by reading what the HTML SGML DTD says about
 *       implied paragraphs.
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL
};
1067
/*
 * The list of HTML attributes which are of content %Script;
 * (the intrinsic event handlers of HTML 4).
 *
 * The array has no NULL terminator: its length is taken with sizeof.
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest" */
    "onchange",
    "onselect"
};
1093
/*
 * Priorities used when the parser meets an unexpected end tag in a
 * broken HTML page.  Every element gets a priority; an end tag may only
 * close open elements whose priority is lower than or equal to its own.
 */

/* One (element name, priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/* The entry whose name is NULL ends the table and holds the priority
 * given to every element not listed here. */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001121
/* Pointers to the first string of each group of htmlStartClose; unused
 * slots are NULL.  Filled once by htmlInitAutoClose(). */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlStartCloseIndex has been filled. */
static int htmlStartCloseIndexinitialized = 0;
1124
1125/************************************************************************
1126 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001127 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001128 * *
1129 ************************************************************************/
1130
1131/**
1132 * htmlInitAutoClose:
1133 *
1134 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1135 * This is not reentrant. Call xmlInitParser() once before processing in
1136 * case of use in multithreaded programs.
1137 */
1138void
1139htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001140 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001141
1142 if (htmlStartCloseIndexinitialized) return;
1143
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001144 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1145 indx = 0;
1146 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001147 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001148 while (htmlStartClose[i] != NULL) i++;
1149 i++;
1150 }
1151 htmlStartCloseIndexinitialized = 1;
1152}
1153
1154/**
1155 * htmlTagLookup:
1156 * @tag: The tag name in lowercase
1157 *
1158 * Lookup the HTML tag in the ElementTable
1159 *
1160 * Returns the related htmlElemDescPtr or NULL if not found.
1161 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001162const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001163htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001164 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001165
1166 for (i = 0; i < (sizeof(html40ElementTable) /
1167 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001168 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001169 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001170 }
1171 return(NULL);
1172}
1173
1174/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001175 * htmlGetEndPriority:
1176 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001177 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001178 * Return value: The "endtag" priority.
1179 **/
1180static int
1181htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001182 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001183
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001184 while ((htmlEndPriority[i].name != NULL) &&
1185 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1186 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001187
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001188 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001189}
1190
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001191
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001192/**
Owen Taylor3473f882001-02-23 17:55:21 +00001193 * htmlCheckAutoClose:
1194 * @newtag: The new tag name
1195 * @oldtag: The old tag name
1196 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001197 * Checks whether the new tag is one of the registered valid tags for
1198 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001199 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1200 *
1201 * Returns 0 if no, 1 if yes.
1202 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001203static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001204htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1205{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001206 int i, indx;
1207 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001208
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001209 if (htmlStartCloseIndexinitialized == 0)
1210 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001211
1212 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001213 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001214 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001215 if (closed == NULL)
1216 return (0);
1217 if (xmlStrEqual(BAD_CAST * closed, newtag))
1218 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001219 }
1220
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001221 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001222 i++;
1223 while (htmlStartClose[i] != NULL) {
1224 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001225 return (1);
1226 }
1227 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001228 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001229 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001230}
1231
1232/**
1233 * htmlAutoCloseOnClose:
1234 * @ctxt: an HTML parser context
1235 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001236 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001237 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001238 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001239 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001240static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001241htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1242{
1243 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001244 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001245
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001246 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001247
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001248 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001249
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001250 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1251 break;
1252 /*
1253 * A missplaced endtag can only close elements with lower
1254 * or equal priority, so if we find an element with higher
1255 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001256 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001257 */
1258 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1259 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001260 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001261 if (i < 0)
1262 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001263
1264 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001265 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001266 if ((info != NULL) && (info->endTag == 3)) {
1267 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1268 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001269 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001270 }
1271 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1272 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001273 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001274 }
1275}
1276
1277/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001278 * htmlAutoCloseOnEnd:
1279 * @ctxt: an HTML parser context
1280 *
1281 * Close all remaining tags at the end of the stream
1282 */
1283static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001284htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1285{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001286 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001287
William M. Brack899e64a2003-09-26 18:03:42 +00001288 if (ctxt->nameNr == 0)
1289 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001290 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001291 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1292 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001293 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001294 }
1295}
1296
1297/**
Owen Taylor3473f882001-02-23 17:55:21 +00001298 * htmlAutoClose:
1299 * @ctxt: an HTML parser context
1300 * @newtag: The new tag name or NULL
1301 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001302 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001303 * The list is kept in htmlStartClose array. This function is
1304 * called when a new tag has been detected and generates the
1305 * appropriates closes if possible/needed.
1306 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001307 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001308 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001309static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001310htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1311{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001312 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001313 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001314 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1315 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001316 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001317 }
1318 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001319 htmlAutoCloseOnEnd(ctxt);
1320 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001321 }
1322 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001323 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1324 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1325 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001326 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1327 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001328 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001329 }
Owen Taylor3473f882001-02-23 17:55:21 +00001330}
1331
1332/**
1333 * htmlAutoCloseTag:
1334 * @doc: the HTML document
1335 * @name: The tag name
1336 * @elem: the HTML element
1337 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001338 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001339 * The list is kept in htmlStartClose array. This function checks
1340 * if the element or one of it's children would autoclose the
1341 * given tag.
1342 *
1343 * Returns 1 if autoclose, 0 otherwise
1344 */
1345int
1346htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1347 htmlNodePtr child;
1348
1349 if (elem == NULL) return(1);
1350 if (xmlStrEqual(name, elem->name)) return(0);
1351 if (htmlCheckAutoClose(elem->name, name)) return(1);
1352 child = elem->children;
1353 while (child != NULL) {
1354 if (htmlAutoCloseTag(doc, name, child)) return(1);
1355 child = child->next;
1356 }
1357 return(0);
1358}
1359
1360/**
1361 * htmlIsAutoClosed:
1362 * @doc: the HTML document
1363 * @elem: the HTML element
1364 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001365 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001366 * The list is kept in htmlStartClose array. This function checks
1367 * if a tag is autoclosed by one of it's child
1368 *
1369 * Returns 1 if autoclosed, 0 otherwise
1370 */
1371int
1372htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1373 htmlNodePtr child;
1374
1375 if (elem == NULL) return(1);
1376 child = elem->children;
1377 while (child != NULL) {
1378 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1379 child = child->next;
1380 }
1381 return(0);
1382}
1383
1384/**
1385 * htmlCheckImplied:
1386 * @ctxt: an HTML parser context
1387 * @newtag: The new tag name
1388 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001389 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001390 * called when a new tag has been detected and generates the
1391 * appropriates implicit tags if missing
1392 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001393static void
Owen Taylor3473f882001-02-23 17:55:21 +00001394htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardb468f742009-08-24 18:45:33 +02001395 int i;
1396
Daniel Veillarde20fb5a2010-01-29 20:47:08 +01001397 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1398 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001399 if (!htmlOmittedDefaultValue)
1400 return;
1401 if (xmlStrEqual(newtag, BAD_CAST"html"))
1402 return;
1403 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001404 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001405 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1406 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1407 }
1408 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1409 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001410 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001411 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1412 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1413 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1414 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1415 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1416 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001417 if (ctxt->html >= 3) {
1418 /* we already saw or generated an <head> before */
1419 return;
1420 }
1421 /*
1422 * dropped OBJECT ... i you put it first BODY will be
1423 * assumed !
1424 */
1425 htmlnamePush(ctxt, BAD_CAST"head");
1426 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1427 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001428 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1429 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1430 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001431 if (ctxt->html >= 10) {
1432 /* we already saw or generated a <body> before */
1433 return;
1434 }
Owen Taylor3473f882001-02-23 17:55:21 +00001435 for (i = 0;i < ctxt->nameNr;i++) {
1436 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1437 return;
1438 }
1439 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1440 return;
1441 }
1442 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001443
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001444 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001445 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1446 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1447 }
1448}
1449
1450/**
1451 * htmlCheckParagraph
1452 * @ctxt: an HTML parser context
1453 *
1454 * Check whether a p element need to be implied before inserting
1455 * characters in the current element.
1456 *
1457 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1458 * in case of error.
1459 */
1460
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001461static int
Owen Taylor3473f882001-02-23 17:55:21 +00001462htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1463 const xmlChar *tag;
1464 int i;
1465
1466 if (ctxt == NULL)
1467 return(-1);
1468 tag = ctxt->name;
1469 if (tag == NULL) {
1470 htmlAutoClose(ctxt, BAD_CAST"p");
1471 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001472 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001473 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1474 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1475 return(1);
1476 }
1477 if (!htmlOmittedDefaultValue)
1478 return(0);
1479 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1480 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001481 htmlAutoClose(ctxt, BAD_CAST"p");
1482 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001483 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001484 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1485 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1486 return(1);
1487 }
1488 }
1489 return(0);
1490}
1491
1492/**
1493 * htmlIsScriptAttribute:
1494 * @name: an attribute name
1495 *
1496 * Check if an attribute is of content type Script
1497 *
1498 * Returns 1 is the attribute is a script 0 otherwise
1499 */
1500int
1501htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001502 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001503
1504 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001505 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001506 /*
1507 * all script attributes start with 'on'
1508 */
1509 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001510 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001511 for (i = 0;
1512 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1513 i++) {
1514 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1515 return(1);
1516 }
1517 return(0);
1518}
1519
1520/************************************************************************
1521 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001522 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001523 * *
1524 ************************************************************************/
1525
1526
Daniel Veillard22090732001-07-16 00:06:07 +00001527static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001528/*
1529 * the 4 absolute ones, plus apostrophe.
1530 */
1531{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1532{ 38, "amp", "ampersand, U+0026 ISOnum" },
1533{ 39, "apos", "single quote" },
1534{ 60, "lt", "less-than sign, U+003C ISOnum" },
1535{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1536
1537/*
1538 * A bunch still in the 128-255 range
1539 * Replacing them depend really on the charset used.
1540 */
1541{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1542{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1543{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1544{ 163, "pound","pound sign, U+00A3 ISOnum" },
1545{ 164, "curren","currency sign, U+00A4 ISOnum" },
1546{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1547{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1548{ 167, "sect", "section sign, U+00A7 ISOnum" },
1549{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1550{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1551{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1552{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1553{ 172, "not", "not sign, U+00AC ISOnum" },
1554{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1555{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1556{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1557{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1558{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1559{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1560{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1561{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1562{ 181, "micro","micro sign, U+00B5 ISOnum" },
1563{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1564{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1565{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1566{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1567{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1568{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1569{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1570{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1571{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1572{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1573{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1574{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1575{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1576{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1577{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1578{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1579{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1580{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1581{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1582{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1583{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1584{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1585{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1586{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1587{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1588{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1589{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1590{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1591{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1592{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1593{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1594{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1595{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1596{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1597{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1598{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1599{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1600{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1601{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1602{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1603{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1604{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1605{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1606{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1607{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1608{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1609{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1610{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1611{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1612{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1613{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1614{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1615{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1616{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1617{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1618{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1619{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1620{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1621{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1622{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1623{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1624{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1625{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1626{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1627{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1628{ 247, "divide","division sign, U+00F7 ISOnum" },
1629{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1630{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1631{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1632{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1633{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1634{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1635{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1636{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1637
1638{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1639{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1640{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1641{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1642{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1643
1644/*
1645 * Anything below should really be kept as entities references
1646 */
1647{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1648
1649{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1650{ 732, "tilde","small tilde, U+02DC ISOdia" },
1651
1652{ 913, "Alpha","greek capital letter alpha, U+0391" },
1653{ 914, "Beta", "greek capital letter beta, U+0392" },
1654{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1655{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1656{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1657{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1658{ 919, "Eta", "greek capital letter eta, U+0397" },
1659{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1660{ 921, "Iota", "greek capital letter iota, U+0399" },
1661{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001662{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001663{ 924, "Mu", "greek capital letter mu, U+039C" },
1664{ 925, "Nu", "greek capital letter nu, U+039D" },
1665{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1666{ 927, "Omicron","greek capital letter omicron, U+039F" },
1667{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1668{ 929, "Rho", "greek capital letter rho, U+03A1" },
1669{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1670{ 932, "Tau", "greek capital letter tau, U+03A4" },
1671{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1672{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1673{ 935, "Chi", "greek capital letter chi, U+03A7" },
1674{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1675{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1676
1677{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1678{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1679{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1680{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1681{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1682{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1683{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1684{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1685{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1686{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1687{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1688{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1689{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1690{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1691{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1692{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1693{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1694{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1695{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1696{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1697{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1698{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1699{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1700{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1701{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1702{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1703{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1704{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1705
1706{ 8194, "ensp", "en space, U+2002 ISOpub" },
1707{ 8195, "emsp", "em space, U+2003 ISOpub" },
1708{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1709{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1710{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1711{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1712{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1713{ 8211, "ndash","en dash, U+2013 ISOpub" },
1714{ 8212, "mdash","em dash, U+2014 ISOpub" },
1715{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1716{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1717{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1718{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1719{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1720{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1721{ 8224, "dagger","dagger, U+2020 ISOpub" },
1722{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1723
1724{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1725{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1726
1727{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1728
1729{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1730{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1731
1732{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1733{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1734
1735{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1736{ 8260, "frasl","fraction slash, U+2044 NEW" },
1737
1738{ 8364, "euro", "euro sign, U+20AC NEW" },
1739
1740{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1741{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1742{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1743{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1744{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1745{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1746{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1747{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1748{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1749{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1750{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1751{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1752{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1753{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1754{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1755{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1756
1757{ 8704, "forall","for all, U+2200 ISOtech" },
1758{ 8706, "part", "partial differential, U+2202 ISOtech" },
1759{ 8707, "exist","there exists, U+2203 ISOtech" },
1760{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1761{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1762{ 8712, "isin", "element of, U+2208 ISOtech" },
1763{ 8713, "notin","not an element of, U+2209 ISOtech" },
1764{ 8715, "ni", "contains as member, U+220B ISOtech" },
1765{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001766{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001767{ 8722, "minus","minus sign, U+2212 ISOtech" },
1768{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1769{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1770{ 8733, "prop", "proportional to, U+221D ISOtech" },
1771{ 8734, "infin","infinity, U+221E ISOtech" },
1772{ 8736, "ang", "angle, U+2220 ISOamso" },
1773{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1774{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1775{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1776{ 8746, "cup", "union = cup, U+222A ISOtech" },
1777{ 8747, "int", "integral, U+222B ISOtech" },
1778{ 8756, "there4","therefore, U+2234 ISOtech" },
1779{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1780{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1781{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1782{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1783{ 8801, "equiv","identical to, U+2261 ISOtech" },
1784{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1785{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1786{ 8834, "sub", "subset of, U+2282 ISOtech" },
1787{ 8835, "sup", "superset of, U+2283 ISOtech" },
1788{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1789{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1790{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1791{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1792{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1793{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1794{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1795{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1796{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1797{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1798{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1799{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1800{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1801{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1802
1803{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1804{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1805{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1806{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1807
1808};
1809
1810/************************************************************************
1811 * *
1812 * Commodity functions to handle entities *
1813 * *
1814 ************************************************************************/
1815
/*
 * Macro used to grow the current buffer.
 *
 * Doubles the companion variable <buffer>_size and reallocates @buffer
 * to that many xmlChar. The macro is not self-contained: it expects a
 * variable named ctxt to be in scope for error reporting, and on
 * allocation failure it frees @buffer and makes the enclosing function
 * return NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {							\
	buffer = tmp;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1830
1831/**
1832 * htmlEntityLookup:
1833 * @name: the entity name
1834 *
1835 * Lookup the given entity in EntitiesTable
1836 *
1837 * TODO: the linear scan is really ugly, an hash table is really needed.
1838 *
1839 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1840 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001841const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001842htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001843 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001844
1845 for (i = 0;i < (sizeof(html40EntitiesTable)/
1846 sizeof(html40EntitiesTable[0]));i++) {
1847 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001848 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001849 }
1850 }
1851 return(NULL);
1852}
1853
1854/**
1855 * htmlEntityValueLookup:
1856 * @value: the entity's unicode value
1857 *
1858 * Lookup the given entity in EntitiesTable
1859 *
1860 * TODO: the linear scan is really ugly, an hash table is really needed.
1861 *
1862 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1863 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001864const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001865htmlEntityValueLookup(unsigned int value) {
1866 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001867
1868 for (i = 0;i < (sizeof(html40EntitiesTable)/
1869 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001870 if (html40EntitiesTable[i].value >= value) {
1871 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001872 break;
William M. Brack78637da2003-07-31 14:47:38 +00001873 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001874 }
Owen Taylor3473f882001-02-23 17:55:21 +00001875 }
1876 return(NULL);
1877}
1878
1879/**
1880 * UTF8ToHtml:
1881 * @out: a pointer to an array of bytes to store the result
1882 * @outlen: the length of @out
1883 * @in: a pointer to an array of UTF-8 chars
1884 * @inlen: the length of @in
1885 *
1886 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1887 * plus HTML entities block of chars out.
1888 *
1889 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1890 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001891 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001892 * The value of @outlen after return is the number of octets consumed.
1893 */
1894int
1895UTF8ToHtml(unsigned char* out, int *outlen,
1896 const unsigned char* in, int *inlen) {
1897 const unsigned char* processed = in;
1898 const unsigned char* outend;
1899 const unsigned char* outstart = out;
1900 const unsigned char* instart = in;
1901 const unsigned char* inend;
1902 unsigned int c, d;
1903 int trailing;
1904
Daniel Veillardce682bc2004-11-05 17:22:25 +00001905 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001906 if (in == NULL) {
1907 /*
1908 * initialization nothing to do
1909 */
1910 *outlen = 0;
1911 *inlen = 0;
1912 return(0);
1913 }
1914 inend = in + (*inlen);
1915 outend = out + (*outlen);
1916 while (in < inend) {
1917 d = *in++;
1918 if (d < 0x80) { c= d; trailing= 0; }
1919 else if (d < 0xC0) {
1920 /* trailing byte in leading position */
1921 *outlen = out - outstart;
1922 *inlen = processed - instart;
1923 return(-2);
1924 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1925 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1926 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1927 else {
1928 /* no chance for this in Ascii */
1929 *outlen = out - outstart;
1930 *inlen = processed - instart;
1931 return(-2);
1932 }
1933
1934 if (inend - in < trailing) {
1935 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02001936 }
Owen Taylor3473f882001-02-23 17:55:21 +00001937
1938 for ( ; trailing; trailing--) {
1939 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1940 break;
1941 c <<= 6;
1942 c |= d & 0x3F;
1943 }
1944
1945 /* assertion: c is a single UTF-4 value */
1946 if (c < 0x80) {
1947 if (out + 1 >= outend)
1948 break;
1949 *out++ = c;
1950 } else {
1951 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001952 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001953 const char *cp;
1954 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001955
1956 /*
1957 * Try to lookup a predefined HTML entity for it
1958 */
1959
1960 ent = htmlEntityValueLookup(c);
1961 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001962 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1963 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001964 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001965 else
1966 cp = ent->name;
1967 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001968 if (out + 2 + len >= outend)
1969 break;
1970 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001971 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001972 out += len;
1973 *out++ = ';';
1974 }
1975 processed = in;
1976 }
1977 *outlen = out - outstart;
1978 *inlen = processed - instart;
1979 return(0);
1980}
1981
1982/**
1983 * htmlEncodeEntities:
1984 * @out: a pointer to an array of bytes to store the result
1985 * @outlen: the length of @out
1986 * @in: a pointer to an array of UTF-8 chars
1987 * @inlen: the length of @in
1988 * @quoteChar: the quote character to escape (' or ") or zero.
1989 *
1990 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1991 * plus HTML entities block of chars out.
1992 *
1993 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1994 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001995 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001996 * The value of @outlen after return is the number of octets consumed.
1997 */
1998int
1999htmlEncodeEntities(unsigned char* out, int *outlen,
2000 const unsigned char* in, int *inlen, int quoteChar) {
2001 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002002 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00002003 const unsigned char* outstart = out;
2004 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002005 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00002006 unsigned int c, d;
2007 int trailing;
2008
Daniel Veillardce682bc2004-11-05 17:22:25 +00002009 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2010 return(-1);
2011 outend = out + (*outlen);
2012 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00002013 while (in < inend) {
2014 d = *in++;
2015 if (d < 0x80) { c= d; trailing= 0; }
2016 else if (d < 0xC0) {
2017 /* trailing byte in leading position */
2018 *outlen = out - outstart;
2019 *inlen = processed - instart;
2020 return(-2);
2021 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2022 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2023 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2024 else {
2025 /* no chance for this in Ascii */
2026 *outlen = out - outstart;
2027 *inlen = processed - instart;
2028 return(-2);
2029 }
2030
2031 if (inend - in < trailing)
2032 break;
2033
2034 while (trailing--) {
2035 if (((d= *in++) & 0xC0) != 0x80) {
2036 *outlen = out - outstart;
2037 *inlen = processed - instart;
2038 return(-2);
2039 }
2040 c <<= 6;
2041 c |= d & 0x3F;
2042 }
2043
2044 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002045 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2046 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002047 if (out >= outend)
2048 break;
2049 *out++ = c;
2050 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002051 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002052 const char *cp;
2053 char nbuf[16];
2054 int len;
2055
2056 /*
2057 * Try to lookup a predefined HTML entity for it
2058 */
2059 ent = htmlEntityValueLookup(c);
2060 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002061 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002062 cp = nbuf;
2063 }
2064 else
2065 cp = ent->name;
2066 len = strlen(cp);
2067 if (out + 2 + len > outend)
2068 break;
2069 *out++ = '&';
2070 memcpy(out, cp, len);
2071 out += len;
2072 *out++ = ';';
2073 }
2074 processed = in;
2075 }
2076 *outlen = out - outstart;
2077 *inlen = processed - instart;
2078 return(0);
2079}
2080
Owen Taylor3473f882001-02-23 17:55:21 +00002081/************************************************************************
2082 * *
2083 * Commodity functions to handle streams *
2084 * *
2085 ************************************************************************/
2086
2087/**
Owen Taylor3473f882001-02-23 17:55:21 +00002088 * htmlNewInputStream:
2089 * @ctxt: an HTML parser context
2090 *
2091 * Create a new input stream structure
2092 * Returns the new input stream or NULL
2093 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002094static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002095htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2096 htmlParserInputPtr input;
2097
2098 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2099 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002100 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002101 return(NULL);
2102 }
2103 memset(input, 0, sizeof(htmlParserInput));
2104 input->filename = NULL;
2105 input->directory = NULL;
2106 input->base = NULL;
2107 input->cur = NULL;
2108 input->buf = NULL;
2109 input->line = 1;
2110 input->col = 1;
2111 input->buf = NULL;
2112 input->free = NULL;
2113 input->version = NULL;
2114 input->consumed = 0;
2115 input->length = 0;
2116 return(input);
2117}
2118
2119
2120/************************************************************************
2121 * *
2122 * Commodity functions, cleanup needed ? *
2123 * *
2124 ************************************************************************/
/*
 * All tags allowing PCDATA from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility.
 *
 * The table is only ever read, so both the strings and the pointers to
 * them are declared const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002139
2140/**
2141 * areBlanks:
2142 * @ctxt: an HTML parser context
2143 * @str: a xmlChar *
2144 * @len: the size of @str
2145 *
2146 * Is this a sequence of blank chars that one can ignore ?
2147 *
2148 * Returns 1 if ignorable 0 otherwise.
2149 */
2150
2151static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002152 unsigned int i;
2153 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002154 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002155 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002156
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002157 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002158 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002159
2160 if (CUR == 0) return(1);
2161 if (CUR != '<') return(0);
2162 if (ctxt->name == NULL)
2163 return(1);
2164 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2165 return(1);
2166 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2167 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002168
2169 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2170 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2171 dtd = xmlGetIntSubset(ctxt->myDoc);
2172 if (dtd != NULL && dtd->ExternalID != NULL) {
2173 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2174 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2175 return(1);
2176 }
2177 }
2178
Owen Taylor3473f882001-02-23 17:55:21 +00002179 if (ctxt->node == NULL) return(0);
2180 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002181 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2182 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002183 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002184 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2185 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002186 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002187 for all tags "b" allowing PCDATA */
2188 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2189 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2190 return(0);
2191 }
2192 }
Owen Taylor3473f882001-02-23 17:55:21 +00002193 } else if (xmlNodeIsText(lastChild)) {
2194 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002195 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002196 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002197 for all tags "p" allowing PCDATA */
2198 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2199 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2200 return(0);
2201 }
2202 }
Owen Taylor3473f882001-02-23 17:55:21 +00002203 }
2204 return(1);
2205}
2206
2207/**
Owen Taylor3473f882001-02-23 17:55:21 +00002208 * htmlNewDocNoDtD:
2209 * @URI: URI for the dtd, or NULL
2210 * @ExternalID: the external ID of the DTD, or NULL
2211 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002212 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2213 * are NULL
2214 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002215 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002216 */
2217htmlDocPtr
2218htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2219 xmlDocPtr cur;
2220
2221 /*
2222 * Allocate a new document and fill the fields.
2223 */
2224 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2225 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002226 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002227 return(NULL);
2228 }
2229 memset(cur, 0, sizeof(xmlDoc));
2230
2231 cur->type = XML_HTML_DOCUMENT_NODE;
2232 cur->version = NULL;
2233 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002234 cur->doc = cur;
2235 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002236 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002237 cur->extSubset = NULL;
2238 cur->oldNs = NULL;
2239 cur->encoding = NULL;
2240 cur->standalone = 1;
2241 cur->compression = 0;
2242 cur->ids = NULL;
2243 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002244 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002245 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002246 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002247 if ((ExternalID != NULL) ||
2248 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002249 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002250 return(cur);
2251}
2252
2253/**
2254 * htmlNewDoc:
2255 * @URI: URI for the dtd, or NULL
2256 * @ExternalID: the external ID of the DTD, or NULL
2257 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002258 * Creates a new HTML document
2259 *
Owen Taylor3473f882001-02-23 17:55:21 +00002260 * Returns a new document
2261 */
2262htmlDocPtr
2263htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2264 if ((URI == NULL) && (ExternalID == NULL))
2265 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002266 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2267 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002268
2269 return(htmlNewDocNoDtD(URI, ExternalID));
2270}
2271
2272
2273/************************************************************************
2274 * *
2275 * The parser itself *
2276 * Relates to http://www.w3.org/TR/html40 *
2277 * *
2278 ************************************************************************/
2279
2280/************************************************************************
2281 * *
2282 * The parser itself *
2283 * *
2284 ************************************************************************/
2285
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002286static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002287
Owen Taylor3473f882001-02-23 17:55:21 +00002288/**
2289 * htmlParseHTMLName:
2290 * @ctxt: an HTML parser context
2291 *
2292 * parse an HTML tag or attribute name, note that we convert it to lowercase
2293 * since HTML names are not case-sensitive.
2294 *
2295 * Returns the Tag Name parsed or NULL
2296 */
2297
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002298static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002299htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002300 int i = 0;
2301 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2302
William M. Brackd1757ab2004-10-02 22:07:48 +00002303 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002304 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002305
2306 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002307 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002308 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2309 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002310 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2311 else loc[i] = CUR;
2312 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002313
Owen Taylor3473f882001-02-23 17:55:21 +00002314 NEXT;
2315 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002316
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002317 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002318}
2319
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002320
2321/**
2322 * htmlParseHTMLName_nonInvasive:
2323 * @ctxt: an HTML parser context
2324 *
2325 * parse an HTML tag or attribute name, note that we convert it to lowercase
2326 * since HTML names are not case-sensitive, this doesn't consume the data
2327 * from the stream, it's a look-ahead
2328 *
2329 * Returns the Tag Name parsed or NULL
2330 */
2331
2332static const xmlChar *
2333htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2334 int i = 0;
2335 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2336
2337 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2338 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002339
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002340 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2341 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2342 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2343 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2344 else loc[i] = NXT(1+i);
2345 i++;
2346 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002347
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002348 return(xmlDictLookup(ctxt->dict, loc, i));
2349}
2350
2351
Owen Taylor3473f882001-02-23 17:55:21 +00002352/**
2353 * htmlParseName:
2354 * @ctxt: an HTML parser context
2355 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002356 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002357 *
2358 * Returns the Name parsed or NULL
2359 */
2360
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002361static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002362htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002363 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002364 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002365 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002366
2367 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002368
2369 /*
2370 * Accelerator for simple ASCII names
2371 */
2372 in = ctxt->input->cur;
2373 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2374 ((*in >= 0x41) && (*in <= 0x5A)) ||
2375 (*in == '_') || (*in == ':')) {
2376 in++;
2377 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2378 ((*in >= 0x41) && (*in <= 0x5A)) ||
2379 ((*in >= 0x30) && (*in <= 0x39)) ||
2380 (*in == '_') || (*in == '-') ||
2381 (*in == ':') || (*in == '.'))
2382 in++;
2383 if ((*in > 0) && (*in < 0x80)) {
2384 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002385 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002386 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002387 ctxt->nbChars += count;
2388 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002389 return(ret);
2390 }
2391 }
2392 return(htmlParseNameComplex(ctxt));
2393}
2394
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002395static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002396htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002397 int len = 0, l;
2398 int c;
2399 int count = 0;
2400
2401 /*
2402 * Handler for more complex cases
2403 */
2404 GROW;
2405 c = CUR_CHAR(l);
2406 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2407 (!IS_LETTER(c) && (c != '_') &&
2408 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002409 return(NULL);
2410 }
2411
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002412 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2413 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2414 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002415 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002416 (IS_COMBINING(c)) ||
2417 (IS_EXTENDER(c)))) {
2418 if (count++ > 100) {
2419 count = 0;
2420 GROW;
2421 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002422 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002423 NEXTL(l);
2424 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002425 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002426 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002427}
2428
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002429
Owen Taylor3473f882001-02-23 17:55:21 +00002430/**
2431 * htmlParseHTMLAttribute:
2432 * @ctxt: an HTML parser context
2433 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002434 *
Owen Taylor3473f882001-02-23 17:55:21 +00002435 * parse an HTML attribute value till the stop (quote), if
2436 * stop is 0 then it stops at the first space
2437 *
2438 * Returns the attribute parsed or NULL
2439 */
2440
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002441static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002442htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2443 xmlChar *buffer = NULL;
2444 int buffer_size = 0;
2445 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002446 const xmlChar *name = NULL;
2447 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002448 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002449
2450 /*
2451 * allocate a translation buffer.
2452 */
2453 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002454 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002455 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002456 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002457 return(NULL);
2458 }
2459 out = buffer;
2460
2461 /*
2462 * Ok loop until we reach one of the ending chars
2463 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002464 while ((CUR != 0) && (CUR != stop)) {
2465 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002466 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002467 if (CUR == '&') {
2468 if (NXT(1) == '#') {
2469 unsigned int c;
2470 int bits;
2471
2472 c = htmlParseCharRef(ctxt);
2473 if (c < 0x80)
2474 { *out++ = c; bits= -6; }
2475 else if (c < 0x800)
2476 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2477 else if (c < 0x10000)
2478 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002479 else
Owen Taylor3473f882001-02-23 17:55:21 +00002480 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002481
Owen Taylor3473f882001-02-23 17:55:21 +00002482 for ( ; bits >= 0; bits-= 6) {
2483 *out++ = ((c >> bits) & 0x3F) | 0x80;
2484 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002485
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002486 if (out - buffer > buffer_size - 100) {
2487 int indx = out - buffer;
2488
2489 growBuffer(buffer);
2490 out = &buffer[indx];
2491 }
Owen Taylor3473f882001-02-23 17:55:21 +00002492 } else {
2493 ent = htmlParseEntityRef(ctxt, &name);
2494 if (name == NULL) {
2495 *out++ = '&';
2496 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002497 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002498
2499 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002500 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002501 }
2502 } else if (ent == NULL) {
2503 *out++ = '&';
2504 cur = name;
2505 while (*cur != 0) {
2506 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002507 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002508
2509 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002510 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002511 }
2512 *out++ = *cur++;
2513 }
Owen Taylor3473f882001-02-23 17:55:21 +00002514 } else {
2515 unsigned int c;
2516 int bits;
2517
2518 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002519 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002520
2521 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002522 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002523 }
Daniel Veillard48519092006-10-17 15:56:35 +00002524 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002525 if (c < 0x80)
2526 { *out++ = c; bits= -6; }
2527 else if (c < 0x800)
2528 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2529 else if (c < 0x10000)
2530 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002531 else
Owen Taylor3473f882001-02-23 17:55:21 +00002532 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002533
Owen Taylor3473f882001-02-23 17:55:21 +00002534 for ( ; bits >= 0; bits-= 6) {
2535 *out++ = ((c >> bits) & 0x3F) | 0x80;
2536 }
Owen Taylor3473f882001-02-23 17:55:21 +00002537 }
2538 }
2539 } else {
2540 unsigned int c;
2541 int bits, l;
2542
2543 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002544 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002545
2546 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002547 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002548 }
2549 c = CUR_CHAR(l);
2550 if (c < 0x80)
2551 { *out++ = c; bits= -6; }
2552 else if (c < 0x800)
2553 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2554 else if (c < 0x10000)
2555 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002556 else
Owen Taylor3473f882001-02-23 17:55:21 +00002557 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002558
Owen Taylor3473f882001-02-23 17:55:21 +00002559 for ( ; bits >= 0; bits-= 6) {
2560 *out++ = ((c >> bits) & 0x3F) | 0x80;
2561 }
2562 NEXT;
2563 }
2564 }
Daniel Veillard13cee4e2009-09-05 14:52:55 +02002565 *out = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002566 return(buffer);
2567}
2568
2569/**
Owen Taylor3473f882001-02-23 17:55:21 +00002570 * htmlParseEntityRef:
2571 * @ctxt: an HTML parser context
2572 * @str: location to store the entity name
2573 *
2574 * parse an HTML ENTITY references
2575 *
2576 * [68] EntityRef ::= '&' Name ';'
2577 *
2578 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2579 * if non-NULL *str will have to be freed by the caller.
2580 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002581const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002582htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2583 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002584 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002585
2586 if (str != NULL) *str = NULL;
2587 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002588
2589 if (CUR == '&') {
2590 NEXT;
2591 name = htmlParseName(ctxt);
2592 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002593 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2594 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002595 } else {
2596 GROW;
2597 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002598 if (str != NULL)
2599 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002600
2601 /*
2602 * Lookup the entity in the table.
2603 */
2604 ent = htmlEntityLookup(name);
2605 if (ent != NULL) /* OK that's ugly !!! */
2606 NEXT;
2607 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002608 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2609 "htmlParseEntityRef: expecting ';'\n",
2610 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002611 if (str != NULL)
2612 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002613 }
2614 }
2615 }
2616 return(ent);
2617}
2618
2619/**
2620 * htmlParseAttValue:
2621 * @ctxt: an HTML parser context
2622 *
2623 * parse a value for an attribute
2624 * Note: the parser won't do substitution of entities here, this
2625 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002626 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002627 *
2628 * Returns the AttValue parsed or NULL.
2629 */
2630
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002631static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002632htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2633 xmlChar *ret = NULL;
2634
2635 if (CUR == '"') {
2636 NEXT;
2637 ret = htmlParseHTMLAttribute(ctxt, '"');
2638 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002639 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2640 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002641 } else
2642 NEXT;
2643 } else if (CUR == '\'') {
2644 NEXT;
2645 ret = htmlParseHTMLAttribute(ctxt, '\'');
2646 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002647 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2648 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002649 } else
2650 NEXT;
2651 } else {
2652 /*
2653 * That's an HTMLism, the attribute value may not be quoted
2654 */
2655 ret = htmlParseHTMLAttribute(ctxt, 0);
2656 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002657 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2658 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002659 }
2660 }
2661 return(ret);
2662}
2663
2664/**
2665 * htmlParseSystemLiteral:
2666 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002667 *
Owen Taylor3473f882001-02-23 17:55:21 +00002668 * parse an HTML Literal
2669 *
2670 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2671 *
2672 * Returns the SystemLiteral parsed or NULL
2673 */
2674
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002675static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002676htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2677 const xmlChar *q;
2678 xmlChar *ret = NULL;
2679
2680 if (CUR == '"') {
2681 NEXT;
2682 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002683 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002684 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002685 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002686 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2687 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002688 } else {
2689 ret = xmlStrndup(q, CUR_PTR - q);
2690 NEXT;
2691 }
2692 } else if (CUR == '\'') {
2693 NEXT;
2694 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002695 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002696 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002697 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002698 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2699 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002700 } else {
2701 ret = xmlStrndup(q, CUR_PTR - q);
2702 NEXT;
2703 }
2704 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002705 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2706 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002707 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002708
Owen Taylor3473f882001-02-23 17:55:21 +00002709 return(ret);
2710}
2711
2712/**
2713 * htmlParsePubidLiteral:
2714 * @ctxt: an HTML parser context
2715 *
2716 * parse an HTML public literal
2717 *
2718 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2719 *
2720 * Returns the PubidLiteral parsed or NULL.
2721 */
2722
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002723static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002724htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2725 const xmlChar *q;
2726 xmlChar *ret = NULL;
2727 /*
2728 * Name ::= (Letter | '_') (NameChar)*
2729 */
2730 if (CUR == '"') {
2731 NEXT;
2732 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002733 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002734 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002735 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2736 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002737 } else {
2738 ret = xmlStrndup(q, CUR_PTR - q);
2739 NEXT;
2740 }
2741 } else if (CUR == '\'') {
2742 NEXT;
2743 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002744 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002745 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002746 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002747 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2748 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002749 } else {
2750 ret = xmlStrndup(q, CUR_PTR - q);
2751 NEXT;
2752 }
2753 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002754 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2755 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002756 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002757
Owen Taylor3473f882001-02-23 17:55:21 +00002758 return(ret);
2759}
2760
2761/**
2762 * htmlParseScript:
2763 * @ctxt: an HTML parser context
2764 *
2765 * parse the content of an HTML SCRIPT or STYLE element
2766 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2767 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2768 * http://www.w3.org/TR/html4/types.html#type-script
2769 * http://www.w3.org/TR/html4/types.html#h-6.15
2770 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2771 *
2772 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2773 * element and the value of intrinsic event attributes. User agents must
2774 * not evaluate script data as HTML markup but instead must pass it on as
2775 * data to a script engine.
2776 * NOTES:
2777 * - The content is passed like CDATA
2778 * - the attributes for style and scripting "onXXX" are also described
2779 * as CDATA but SGML allows entities references in attributes so their
2780 * processing is identical as other attributes
2781 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002782static void
Owen Taylor3473f882001-02-23 17:55:21 +00002783htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002784 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002785 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002786 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002787
2788 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002789 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002790 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002791 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002792 /*
2793 * One should break here, the specification is clear:
2794 * Authors should therefore escape "</" within the content.
2795 * Escape mechanisms are specific to each scripting or
2796 * style sheet language.
2797 *
2798 * In recovery mode, only break if end tag match the
2799 * current tag, effectively ignoring all tags inside the
2800 * script/style block and treating the entire block as
2801 * CDATA.
2802 */
2803 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002804 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2805 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002806 {
2807 break; /* while */
2808 } else {
2809 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002810 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002811 ctxt->name, NULL);
2812 }
2813 } else {
2814 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002815 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002816 {
2817 break; /* while */
2818 }
2819 }
Owen Taylor3473f882001-02-23 17:55:21 +00002820 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002821 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002822 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2823 if (ctxt->sax->cdataBlock!= NULL) {
2824 /*
2825 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2826 */
2827 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002828 } else if (ctxt->sax->characters != NULL) {
2829 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002830 }
2831 nbchar = 0;
2832 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002833 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002834 NEXTL(l);
2835 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002836 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002837
Daniel Veillard68716a72006-10-16 09:32:17 +00002838 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002839 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2840 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002841 NEXT;
2842 }
2843
2844 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2845 if (ctxt->sax->cdataBlock!= NULL) {
2846 /*
2847 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2848 */
2849 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002850 } else if (ctxt->sax->characters != NULL) {
2851 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002852 }
2853 }
2854}
2855
2856
2857/**
2858 * htmlParseCharData:
2859 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002860 *
2861 * parse a CharData section.
2862 * if we are within a CDATA section ']]>' marks an end of section.
2863 *
2864 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2865 */
2866
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002867static void
2868htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002869 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2870 int nbchar = 0;
2871 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002872 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002873
2874 SHRINK;
2875 cur = CUR_CHAR(l);
2876 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002877 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002878 (cur != 0)) {
2879 if (!(IS_CHAR(cur))) {
2880 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2881 "Invalid char in CDATA 0x%X\n", cur);
2882 } else {
2883 COPY_BUF(l,buf,nbchar,cur);
2884 }
Owen Taylor3473f882001-02-23 17:55:21 +00002885 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2886 /*
2887 * Ok the segment is to be consumed as chars.
2888 */
2889 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2890 if (areBlanks(ctxt, buf, nbchar)) {
2891 if (ctxt->sax->ignorableWhitespace != NULL)
2892 ctxt->sax->ignorableWhitespace(ctxt->userData,
2893 buf, nbchar);
2894 } else {
2895 htmlCheckParagraph(ctxt);
2896 if (ctxt->sax->characters != NULL)
2897 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2898 }
2899 }
2900 nbchar = 0;
2901 }
2902 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002903 chunk++;
2904 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2905 chunk = 0;
2906 SHRINK;
2907 GROW;
2908 }
Owen Taylor3473f882001-02-23 17:55:21 +00002909 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002910 if (cur == 0) {
2911 SHRINK;
2912 GROW;
2913 cur = CUR_CHAR(l);
2914 }
Owen Taylor3473f882001-02-23 17:55:21 +00002915 }
2916 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002917 buf[nbchar] = 0;
2918
Owen Taylor3473f882001-02-23 17:55:21 +00002919 /*
2920 * Ok the segment is to be consumed as chars.
2921 */
2922 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2923 if (areBlanks(ctxt, buf, nbchar)) {
2924 if (ctxt->sax->ignorableWhitespace != NULL)
2925 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2926 } else {
2927 htmlCheckParagraph(ctxt);
2928 if (ctxt->sax->characters != NULL)
2929 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2930 }
2931 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002932 } else {
2933 /*
2934 * Loop detection
2935 */
2936 if (cur == 0)
2937 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002938 }
2939}
2940
2941/**
2942 * htmlParseExternalID:
2943 * @ctxt: an HTML parser context
2944 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002945 *
2946 * Parse an External ID or a Public ID
2947 *
Owen Taylor3473f882001-02-23 17:55:21 +00002948 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2949 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2950 *
2951 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2952 *
2953 * Returns the function returns SystemLiteral and in the second
2954 * case publicID receives PubidLiteral, is strict is off
2955 * it is possible to return NULL and have publicID set.
2956 */
2957
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002958static xmlChar *
2959htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002960 xmlChar *URI = NULL;
2961
2962 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2963 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2964 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2965 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002966 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002967 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2968 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002969 }
2970 SKIP_BLANKS;
2971 URI = htmlParseSystemLiteral(ctxt);
2972 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002973 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2974 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002975 }
2976 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2977 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2978 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2979 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002980 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002981 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2982 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002983 }
2984 SKIP_BLANKS;
2985 *publicID = htmlParsePubidLiteral(ctxt);
2986 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002987 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2988 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2989 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002990 }
2991 SKIP_BLANKS;
2992 if ((CUR == '"') || (CUR == '\'')) {
2993 URI = htmlParseSystemLiteral(ctxt);
2994 }
2995 }
2996 return(URI);
2997}
2998
2999/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003000 * xmlParsePI:
3001 * @ctxt: an XML parser context
3002 *
3003 * parse an XML Processing Instruction.
3004 *
3005 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3006 */
3007static void
3008htmlParsePI(htmlParserCtxtPtr ctxt) {
3009 xmlChar *buf = NULL;
3010 int len = 0;
3011 int size = HTML_PARSER_BUFFER_SIZE;
3012 int cur, l;
3013 const xmlChar *target;
3014 xmlParserInputState state;
3015 int count = 0;
3016
3017 if ((RAW == '<') && (NXT(1) == '?')) {
3018 state = ctxt->instate;
3019 ctxt->instate = XML_PARSER_PI;
3020 /*
3021 * this is a Processing Instruction.
3022 */
3023 SKIP(2);
3024 SHRINK;
3025
3026 /*
3027 * Parse the target name and check for special support like
3028 * namespace.
3029 */
3030 target = htmlParseName(ctxt);
3031 if (target != NULL) {
3032 if (RAW == '>') {
3033 SKIP(1);
3034
3035 /*
3036 * SAX: PI detected.
3037 */
3038 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3039 (ctxt->sax->processingInstruction != NULL))
3040 ctxt->sax->processingInstruction(ctxt->userData,
3041 target, NULL);
3042 ctxt->instate = state;
3043 return;
3044 }
3045 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3046 if (buf == NULL) {
3047 htmlErrMemory(ctxt, NULL);
3048 ctxt->instate = state;
3049 return;
3050 }
3051 cur = CUR;
3052 if (!IS_BLANK(cur)) {
3053 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3054 "ParsePI: PI %s space expected\n", target, NULL);
3055 }
3056 SKIP_BLANKS;
3057 cur = CUR_CHAR(l);
3058 while (IS_CHAR(cur) && (cur != '>')) {
3059 if (len + 5 >= size) {
3060 xmlChar *tmp;
3061
3062 size *= 2;
3063 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3064 if (tmp == NULL) {
3065 htmlErrMemory(ctxt, NULL);
3066 xmlFree(buf);
3067 ctxt->instate = state;
3068 return;
3069 }
3070 buf = tmp;
3071 }
3072 count++;
3073 if (count > 50) {
3074 GROW;
3075 count = 0;
3076 }
3077 COPY_BUF(l,buf,len,cur);
3078 NEXTL(l);
3079 cur = CUR_CHAR(l);
3080 if (cur == 0) {
3081 SHRINK;
3082 GROW;
3083 cur = CUR_CHAR(l);
3084 }
3085 }
3086 buf[len] = 0;
3087 if (cur != '>') {
3088 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3089 "ParsePI: PI %s never end ...\n", target, NULL);
3090 } else {
3091 SKIP(1);
3092
3093 /*
3094 * SAX: PI detected.
3095 */
3096 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3097 (ctxt->sax->processingInstruction != NULL))
3098 ctxt->sax->processingInstruction(ctxt->userData,
3099 target, buf);
3100 }
3101 xmlFree(buf);
3102 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003103 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003104 "PI is not started correctly", NULL, NULL);
3105 }
3106 ctxt->instate = state;
3107 }
3108}
3109
3110/**
Owen Taylor3473f882001-02-23 17:55:21 +00003111 * htmlParseComment:
3112 * @ctxt: an HTML parser context
3113 *
3114 * Parse an XML (SGML) comment <!-- .... -->
3115 *
3116 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3117 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003118static void
Owen Taylor3473f882001-02-23 17:55:21 +00003119htmlParseComment(htmlParserCtxtPtr ctxt) {
3120 xmlChar *buf = NULL;
3121 int len;
3122 int size = HTML_PARSER_BUFFER_SIZE;
3123 int q, ql;
3124 int r, rl;
3125 int cur, l;
3126 xmlParserInputState state;
3127
3128 /*
3129 * Check that there is a comment right here.
3130 */
3131 if ((RAW != '<') || (NXT(1) != '!') ||
3132 (NXT(2) != '-') || (NXT(3) != '-')) return;
3133
3134 state = ctxt->instate;
3135 ctxt->instate = XML_PARSER_COMMENT;
3136 SHRINK;
3137 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003138 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003139 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003140 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003141 ctxt->instate = state;
3142 return;
3143 }
3144 q = CUR_CHAR(ql);
3145 NEXTL(ql);
3146 r = CUR_CHAR(rl);
3147 NEXTL(rl);
3148 cur = CUR_CHAR(l);
3149 len = 0;
3150 while (IS_CHAR(cur) &&
3151 ((cur != '>') ||
3152 (r != '-') || (q != '-'))) {
3153 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003154 xmlChar *tmp;
3155
Owen Taylor3473f882001-02-23 17:55:21 +00003156 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003157 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3158 if (tmp == NULL) {
3159 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003160 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003161 ctxt->instate = state;
3162 return;
3163 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003164 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003165 }
3166 COPY_BUF(ql,buf,len,q);
3167 q = r;
3168 ql = rl;
3169 r = cur;
3170 rl = l;
3171 NEXTL(l);
3172 cur = CUR_CHAR(l);
3173 if (cur == 0) {
3174 SHRINK;
3175 GROW;
3176 cur = CUR_CHAR(l);
3177 }
3178 }
3179 buf[len] = 0;
3180 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003181 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3182 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003183 xmlFree(buf);
3184 } else {
3185 NEXT;
3186 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3187 (!ctxt->disableSAX))
3188 ctxt->sax->comment(ctxt->userData, buf);
3189 xmlFree(buf);
3190 }
3191 ctxt->instate = state;
3192}
3193
3194/**
3195 * htmlParseCharRef:
3196 * @ctxt: an HTML parser context
3197 *
3198 * parse Reference declarations
3199 *
3200 * [66] CharRef ::= '&#' [0-9]+ ';' |
3201 * '&#x' [0-9a-fA-F]+ ';'
3202 *
3203 * Returns the value parsed (as an int)
3204 */
3205int
3206htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3207 int val = 0;
3208
Daniel Veillarda03e3652004-11-02 18:45:30 +00003209 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3210 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3211 "htmlParseCharRef: context error\n",
3212 NULL, NULL);
3213 return(0);
3214 }
Owen Taylor3473f882001-02-23 17:55:21 +00003215 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003216 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003217 SKIP(3);
3218 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003219 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003220 val = val * 16 + (CUR - '0');
3221 else if ((CUR >= 'a') && (CUR <= 'f'))
3222 val = val * 16 + (CUR - 'a') + 10;
3223 else if ((CUR >= 'A') && (CUR <= 'F'))
3224 val = val * 16 + (CUR - 'A') + 10;
3225 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003226 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003227 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003228 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003229 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003230 }
3231 NEXT;
3232 }
3233 if (CUR == ';')
3234 NEXT;
3235 } else if ((CUR == '&') && (NXT(1) == '#')) {
3236 SKIP(2);
3237 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003238 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003239 val = val * 10 + (CUR - '0');
3240 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003241 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003242 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003243 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003244 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003245 }
3246 NEXT;
3247 }
3248 if (CUR == ';')
3249 NEXT;
3250 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003251 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3252 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003253 }
3254 /*
3255 * Check the value IS_CHAR ...
3256 */
3257 if (IS_CHAR(val)) {
3258 return(val);
3259 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003260 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3261 "htmlParseCharRef: invalid xmlChar value %d\n",
3262 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003263 }
3264 return(0);
3265}
3266
3267
3268/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003269 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003270 * @ctxt: an HTML parser context
3271 *
3272 * parse a DOCTYPE declaration
3273 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003274 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003275 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3276 */
3277
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003278static void
Owen Taylor3473f882001-02-23 17:55:21 +00003279htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003280 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003281 xmlChar *ExternalID = NULL;
3282 xmlChar *URI = NULL;
3283
3284 /*
3285 * We know that '<!DOCTYPE' has been detected.
3286 */
3287 SKIP(9);
3288
3289 SKIP_BLANKS;
3290
3291 /*
3292 * Parse the DOCTYPE name.
3293 */
3294 name = htmlParseName(ctxt);
3295 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003296 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3297 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3298 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003299 }
3300 /*
3301 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3302 */
3303
3304 SKIP_BLANKS;
3305
3306 /*
3307 * Check for SystemID and ExternalID
3308 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003309 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003310 SKIP_BLANKS;
3311
3312 /*
3313 * We should be at the end of the DOCTYPE declaration.
3314 */
3315 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003316 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3317 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003318 /* We shouldn't try to resynchronize ... */
3319 }
3320 NEXT;
3321
3322 /*
3323 * Create or update the document accordingly to the DOCTYPE
3324 */
3325 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3326 (!ctxt->disableSAX))
3327 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3328
3329 /*
3330 * Cleanup, since we don't use all those identifiers
3331 */
3332 if (URI != NULL) xmlFree(URI);
3333 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003334}
3335
3336/**
3337 * htmlParseAttribute:
3338 * @ctxt: an HTML parser context
3339 * @value: a xmlChar ** used to store the value of the attribute
3340 *
3341 * parse an attribute
3342 *
3343 * [41] Attribute ::= Name Eq AttValue
3344 *
3345 * [25] Eq ::= S? '=' S?
3346 *
3347 * With namespace:
3348 *
3349 * [NS 11] Attribute ::= QName Eq AttValue
3350 *
3351 * Also the case QName == xmlns:??? is handled independently as a namespace
3352 * definition.
3353 *
3354 * Returns the attribute name, and the value in *value.
3355 */
3356
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003357static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003358htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003359 const xmlChar *name;
3360 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003361
3362 *value = NULL;
3363 name = htmlParseHTMLName(ctxt);
3364 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003365 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3366 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003367 return(NULL);
3368 }
3369
3370 /*
3371 * read the value
3372 */
3373 SKIP_BLANKS;
3374 if (CUR == '=') {
3375 NEXT;
3376 SKIP_BLANKS;
3377 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003378 } else if (htmlIsBooleanAttr(name)) {
3379 /*
3380 * assume a minimized attribute
3381 */
3382 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003383 }
3384
3385 *value = val;
3386 return(name);
3387}
3388
3389/**
3390 * htmlCheckEncoding:
3391 * @ctxt: an HTML parser context
3392 * @attvalue: the attribute value
3393 *
3394 * Checks an http-equiv attribute from a Meta tag to detect
3395 * the encoding
3396 * If a new encoding is detected the parser is switched to decode
3397 * it and pass UTF8
3398 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003399static void
Owen Taylor3473f882001-02-23 17:55:21 +00003400htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3401 const xmlChar *encoding;
3402
3403 if ((ctxt == NULL) || (attvalue == NULL))
3404 return;
3405
Daniel Veillarde77db162009-08-22 11:32:38 +02003406 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003407 if (ctxt->input->encoding != NULL)
3408 return;
3409
3410 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3411 if (encoding != NULL) {
3412 encoding += 8;
3413 } else {
3414 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3415 if (encoding != NULL)
3416 encoding += 9;
3417 }
3418 if (encoding != NULL) {
3419 xmlCharEncoding enc;
3420 xmlCharEncodingHandlerPtr handler;
3421
3422 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3423
3424 if (ctxt->input->encoding != NULL)
3425 xmlFree((xmlChar *) ctxt->input->encoding);
3426 ctxt->input->encoding = xmlStrdup(encoding);
3427
3428 enc = xmlParseCharEncoding((const char *) encoding);
3429 /*
3430 * registered set of known encodings
3431 */
3432 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003433 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003434 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3435 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3436 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3437 (ctxt->input->buf != NULL) &&
3438 (ctxt->input->buf->encoder == NULL)) {
3439 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3440 "htmlCheckEncoding: wrong encoding meta\n",
3441 NULL, NULL);
3442 } else {
3443 xmlSwitchEncoding(ctxt, enc);
3444 }
Owen Taylor3473f882001-02-23 17:55:21 +00003445 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3446 } else {
3447 /*
3448 * fallback for unknown encodings
3449 */
3450 handler = xmlFindCharEncodingHandler((const char *) encoding);
3451 if (handler != NULL) {
3452 xmlSwitchToEncoding(ctxt, handler);
3453 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3454 } else {
3455 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3456 }
3457 }
3458
3459 if ((ctxt->input->buf != NULL) &&
3460 (ctxt->input->buf->encoder != NULL) &&
3461 (ctxt->input->buf->raw != NULL) &&
3462 (ctxt->input->buf->buffer != NULL)) {
3463 int nbchars;
3464 int processed;
3465
3466 /*
3467 * convert as much as possible to the parser reading buffer.
3468 */
3469 processed = ctxt->input->cur - ctxt->input->base;
3470 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3471 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3472 ctxt->input->buf->buffer,
3473 ctxt->input->buf->raw);
3474 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003475 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3476 "htmlCheckEncoding: encoder error\n",
3477 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003478 }
3479 ctxt->input->base =
3480 ctxt->input->cur = ctxt->input->buf->buffer->content;
Eugene Pimenov1e60fbc2010-03-10 18:10:49 +01003481 ctxt->input->end =
3482 &ctxt->input->base[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00003483 }
3484 }
3485}
3486
3487/**
3488 * htmlCheckMeta:
3489 * @ctxt: an HTML parser context
3490 * @atts: the attributes values
3491 *
3492 * Checks an attributes from a Meta tag
3493 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003494static void
Owen Taylor3473f882001-02-23 17:55:21 +00003495htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3496 int i;
3497 const xmlChar *att, *value;
3498 int http = 0;
3499 const xmlChar *content = NULL;
3500
3501 if ((ctxt == NULL) || (atts == NULL))
3502 return;
3503
3504 i = 0;
3505 att = atts[i++];
3506 while (att != NULL) {
3507 value = atts[i++];
3508 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3509 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3510 http = 1;
3511 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3512 content = value;
3513 att = atts[i++];
3514 }
3515 if ((http) && (content != NULL))
3516 htmlCheckEncoding(ctxt, content);
3517
3518}
3519
3520/**
3521 * htmlParseStartTag:
3522 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003523 *
Owen Taylor3473f882001-02-23 17:55:21 +00003524 * parse a start of tag either for rule element or
3525 * EmptyElement. In both case we don't parse the tag closing chars.
3526 *
3527 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3528 *
3529 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3530 *
3531 * With namespace:
3532 *
3533 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3534 *
3535 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3536 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003537 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003538 */
3539
Daniel Veillard597f1c12005-07-03 23:00:18 +00003540static int
Owen Taylor3473f882001-02-23 17:55:21 +00003541htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003542 const xmlChar *name;
3543 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003544 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003545 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003546 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003547 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003548 int meta = 0;
3549 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003550 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003551
Daniel Veillarde77db162009-08-22 11:32:38 +02003552 if (ctxt->instate == XML_PARSER_EOF)
3553 return(-1);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003554 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3555 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3556 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003557 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003558 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003559 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003560 NEXT;
3561
Daniel Veillard30e76072006-03-09 14:13:55 +00003562 atts = ctxt->atts;
3563 maxatts = ctxt->maxatts;
3564
Owen Taylor3473f882001-02-23 17:55:21 +00003565 GROW;
3566 name = htmlParseHTMLName(ctxt);
3567 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003568 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3569 "htmlParseStartTag: invalid element name\n",
3570 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003571 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003572 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3573 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003574 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003575 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003576 }
3577 if (xmlStrEqual(name, BAD_CAST"meta"))
3578 meta = 1;
3579
3580 /*
3581 * Check for auto-closure of HTML elements.
3582 */
3583 htmlAutoClose(ctxt, name);
3584
3585 /*
3586 * Check for implied HTML elements.
3587 */
3588 htmlCheckImplied(ctxt, name);
3589
3590 /*
3591 * Avoid html at any level > 0, head at any level != 1
3592 * or any attempt to recurse body
3593 */
3594 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003595 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3596 "htmlParseStartTag: misplaced <html> tag\n",
3597 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003598 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003599 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003600 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003601 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003602 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003603 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3604 "htmlParseStartTag: misplaced <head> tag\n",
3605 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003606 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003607 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003608 }
3609 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003610 int indx;
3611 for (indx = 0;indx < ctxt->nameNr;indx++) {
3612 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003613 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3614 "htmlParseStartTag: misplaced <body> tag\n",
3615 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003616 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003617 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003618 }
3619 }
3620 }
3621
3622 /*
3623 * Now parse the attributes, it ends up with the ending
3624 *
3625 * (S Attribute)* S?
3626 */
3627 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003628 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003629 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003630 ((CUR != '/') || (NXT(1) != '>'))) {
3631 long cons = ctxt->nbChars;
3632
3633 GROW;
3634 attname = htmlParseAttribute(ctxt, &attvalue);
3635 if (attname != NULL) {
3636
3637 /*
3638 * Well formedness requires at most one declaration of an attribute
3639 */
3640 for (i = 0; i < nbatts;i += 2) {
3641 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003642 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3643 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003644 if (attvalue != NULL)
3645 xmlFree(attvalue);
3646 goto failed;
3647 }
3648 }
3649
3650 /*
3651 * Add the pair to atts
3652 */
3653 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003654 maxatts = 22; /* allow for 10 attrs by default */
3655 atts = (const xmlChar **)
3656 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003657 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003658 htmlErrMemory(ctxt, NULL);
3659 if (attvalue != NULL)
3660 xmlFree(attvalue);
3661 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003662 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003663 ctxt->atts = atts;
3664 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003665 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003666 const xmlChar **n;
3667
Owen Taylor3473f882001-02-23 17:55:21 +00003668 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003669 n = (const xmlChar **) xmlRealloc((void *) atts,
3670 maxatts * sizeof(const xmlChar *));
3671 if (n == NULL) {
3672 htmlErrMemory(ctxt, NULL);
3673 if (attvalue != NULL)
3674 xmlFree(attvalue);
3675 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003676 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003677 atts = n;
3678 ctxt->atts = atts;
3679 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003680 }
3681 atts[nbatts++] = attname;
3682 atts[nbatts++] = attvalue;
3683 atts[nbatts] = NULL;
3684 atts[nbatts + 1] = NULL;
3685 }
3686 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003687 if (attvalue != NULL)
3688 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003689 /* Dump the bogus attribute string up to the next blank or
3690 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003691 while ((IS_CHAR_CH(CUR)) &&
3692 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003693 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003694 NEXT;
3695 }
3696
3697failed:
3698 SKIP_BLANKS;
3699 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003700 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3701 "htmlParseStartTag: problem parsing attributes\n",
3702 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003703 break;
3704 }
3705 }
3706
3707 /*
3708 * Handle specific association to the META tag
3709 */
William M. Bracke978ae22007-03-21 06:16:02 +00003710 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003711 htmlCheckMeta(ctxt, atts);
3712
3713 /*
3714 * SAX: Start of Element !
3715 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003716 if (!discardtag) {
3717 htmlnamePush(ctxt, name);
3718 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3719 if (nbatts != 0)
3720 ctxt->sax->startElement(ctxt->userData, name, atts);
3721 else
3722 ctxt->sax->startElement(ctxt->userData, name, NULL);
3723 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003724 }
Owen Taylor3473f882001-02-23 17:55:21 +00003725
3726 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003727 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003728 if (atts[i] != NULL)
3729 xmlFree((xmlChar *) atts[i]);
3730 }
Owen Taylor3473f882001-02-23 17:55:21 +00003731 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003732
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003733 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003734}
3735
3736/**
3737 * htmlParseEndTag:
3738 * @ctxt: an HTML parser context
3739 *
3740 * parse an end of tag
3741 *
3742 * [42] ETag ::= '</' Name S? '>'
3743 *
3744 * With namespace
3745 *
3746 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003747 *
3748 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003749 */
3750
Daniel Veillardf420ac52001-07-04 16:04:09 +00003751static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003752htmlParseEndTag(htmlParserCtxtPtr ctxt)
3753{
3754 const xmlChar *name;
3755 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003756 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003757
3758 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003759 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3760 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003761 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003762 }
3763 SKIP(2);
3764
3765 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003766 if (name == NULL)
3767 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003768 /*
3769 * We should definitely be at the ending "S? '>'" part
3770 */
3771 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003772 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003773 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3774 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003775 if (ctxt->recovery) {
3776 /*
3777 * We're not at the ending > !!
3778 * Error, unless in recover mode where we search forwards
3779 * until we find a >
3780 */
3781 while (CUR != '\0' && CUR != '>') NEXT;
3782 NEXT;
3783 }
Owen Taylor3473f882001-02-23 17:55:21 +00003784 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003785 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003786
3787 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003788 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3789 * out now.
3790 */
3791 if ((ctxt->depth > 0) &&
3792 (xmlStrEqual(name, BAD_CAST "html") ||
3793 xmlStrEqual(name, BAD_CAST "body") ||
3794 xmlStrEqual(name, BAD_CAST "head"))) {
3795 ctxt->depth--;
3796 return (0);
3797 }
3798
3799 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003800 * If the name read is not one of the element in the parsing stack
3801 * then return, it's just an error.
3802 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003803 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3804 if (xmlStrEqual(name, ctxt->nameTab[i]))
3805 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003806 }
3807 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003808 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3809 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003810 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003811 }
3812
3813
3814 /*
3815 * Check for auto-closure of HTML elements.
3816 */
3817
3818 htmlAutoCloseOnClose(ctxt, name);
3819
3820 /*
3821 * Well formedness constraints, opening and closing must match.
3822 * With the exception that the autoclose may have popped stuff out
3823 * of the stack.
3824 */
3825 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003826 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003827 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3828 "Opening and ending tag mismatch: %s and %s\n",
3829 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003830 }
3831 }
3832
3833 /*
3834 * SAX: End of Tag
3835 */
3836 oldname = ctxt->name;
3837 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003838 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3839 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003840 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003841 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003842 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003843 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003844 }
3845
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003846 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003847}
3848
3849
3850/**
3851 * htmlParseReference:
3852 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003853 *
Owen Taylor3473f882001-02-23 17:55:21 +00003854 * parse and handle entity references in content,
3855 * this will end-up in a call to character() since this is either a
3856 * CharRef, or a predefined entity.
3857 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003858static void
Owen Taylor3473f882001-02-23 17:55:21 +00003859htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003860 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003861 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003862 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003863 if (CUR != '&') return;
3864
3865 if (NXT(1) == '#') {
3866 unsigned int c;
3867 int bits, i = 0;
3868
3869 c = htmlParseCharRef(ctxt);
3870 if (c == 0)
3871 return;
3872
3873 if (c < 0x80) { out[i++]= c; bits= -6; }
3874 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3875 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3876 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003877
Owen Taylor3473f882001-02-23 17:55:21 +00003878 for ( ; bits >= 0; bits-= 6) {
3879 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3880 }
3881 out[i] = 0;
3882
3883 htmlCheckParagraph(ctxt);
3884 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3885 ctxt->sax->characters(ctxt->userData, out, i);
3886 } else {
3887 ent = htmlParseEntityRef(ctxt, &name);
3888 if (name == NULL) {
3889 htmlCheckParagraph(ctxt);
3890 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3891 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3892 return;
3893 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003894 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003895 htmlCheckParagraph(ctxt);
3896 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3897 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3898 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3899 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3900 }
3901 } else {
3902 unsigned int c;
3903 int bits, i = 0;
3904
3905 c = ent->value;
3906 if (c < 0x80)
3907 { out[i++]= c; bits= -6; }
3908 else if (c < 0x800)
3909 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3910 else if (c < 0x10000)
3911 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003912 else
Owen Taylor3473f882001-02-23 17:55:21 +00003913 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003914
Owen Taylor3473f882001-02-23 17:55:21 +00003915 for ( ; bits >= 0; bits-= 6) {
3916 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3917 }
3918 out[i] = 0;
3919
3920 htmlCheckParagraph(ctxt);
3921 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3922 ctxt->sax->characters(ctxt->userData, out, i);
3923 }
Owen Taylor3473f882001-02-23 17:55:21 +00003924 }
3925}
3926
3927/**
3928 * htmlParseContent:
3929 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003930 *
3931 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003932 */
3933
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003934static void
Owen Taylor3473f882001-02-23 17:55:21 +00003935htmlParseContent(htmlParserCtxtPtr ctxt) {
3936 xmlChar *currentNode;
3937 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003938 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003939
3940 currentNode = xmlStrdup(ctxt->name);
3941 depth = ctxt->nameNr;
3942 while (1) {
3943 long cons = ctxt->nbChars;
3944
3945 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02003946
3947 if (ctxt->instate == XML_PARSER_EOF)
3948 break;
3949
Owen Taylor3473f882001-02-23 17:55:21 +00003950 /*
3951 * Our tag or one of it's parent or children is ending.
3952 */
3953 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003954 if (htmlParseEndTag(ctxt) &&
3955 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3956 if (currentNode != NULL)
3957 xmlFree(currentNode);
3958 return;
3959 }
3960 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003961 }
3962
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003963 else if ((CUR == '<') &&
3964 ((IS_ASCII_LETTER(NXT(1))) ||
3965 (NXT(1) == '_') || (NXT(1) == ':'))) {
3966 name = htmlParseHTMLName_nonInvasive(ctxt);
3967 if (name == NULL) {
3968 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3969 "htmlParseStartTag: invalid element name\n",
3970 NULL, NULL);
3971 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003972 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003973 NEXT;
3974
3975 if (currentNode != NULL)
3976 xmlFree(currentNode);
3977 return;
3978 }
3979
3980 if (ctxt->name != NULL) {
3981 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3982 htmlAutoClose(ctxt, name);
3983 continue;
3984 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003985 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003986 }
3987
Owen Taylor3473f882001-02-23 17:55:21 +00003988 /*
3989 * Has this node been popped out during parsing of
3990 * the next element
3991 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003992 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3993 (!xmlStrEqual(currentNode, ctxt->name)))
3994 {
Owen Taylor3473f882001-02-23 17:55:21 +00003995 if (currentNode != NULL) xmlFree(currentNode);
3996 return;
3997 }
3998
Daniel Veillardf9533d12001-03-03 10:04:57 +00003999 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4000 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004001 /*
4002 * Handle SCRIPT/STYLE separately
4003 */
4004 htmlParseScript(ctxt);
4005 } else {
4006 /*
4007 * Sometimes DOCTYPE arrives in the middle of the document
4008 */
4009 if ((CUR == '<') && (NXT(1) == '!') &&
4010 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4011 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4012 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4013 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004014 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4015 "Misplaced DOCTYPE declaration\n",
4016 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004017 htmlParseDocTypeDecl(ctxt);
4018 }
4019
4020 /*
4021 * First case : a comment
4022 */
4023 if ((CUR == '<') && (NXT(1) == '!') &&
4024 (NXT(2) == '-') && (NXT(3) == '-')) {
4025 htmlParseComment(ctxt);
4026 }
4027
4028 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004029 * Second case : a Processing Instruction.
4030 */
4031 else if ((CUR == '<') && (NXT(1) == '?')) {
4032 htmlParsePI(ctxt);
4033 }
4034
4035 /*
4036 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004037 */
4038 else if (CUR == '<') {
4039 htmlParseElement(ctxt);
4040 }
4041
4042 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004043 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004044 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004045 */
4046 else if (CUR == '&') {
4047 htmlParseReference(ctxt);
4048 }
4049
4050 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004051 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004052 */
4053 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004054 htmlAutoCloseOnEnd(ctxt);
4055 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004056 }
4057
4058 /*
4059 * Last case, text. Note that References are handled directly.
4060 */
4061 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004062 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004063 }
4064
4065 if (cons == ctxt->nbChars) {
4066 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004067 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4068 "detected an error in element content\n",
4069 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004070 }
4071 break;
4072 }
4073 }
4074 GROW;
4075 }
4076 if (currentNode != NULL) xmlFree(currentNode);
4077}
4078
4079/**
Daniel Veillard499cc922006-01-18 17:22:35 +00004080 * htmlParseContent:
4081 * @ctxt: an HTML parser context
4082 *
4083 * Parse a content: comment, sub-element, reference or text.
4084 */
4085
4086void
4087__htmlParseContent(void *ctxt) {
4088 if (ctxt != NULL)
4089 htmlParseContent((htmlParserCtxtPtr) ctxt);
4090}
4091
4092/**
Owen Taylor3473f882001-02-23 17:55:21 +00004093 * htmlParseElement:
4094 * @ctxt: an HTML parser context
4095 *
4096 * parse an HTML element, this is highly recursive
4097 *
4098 * [39] element ::= EmptyElemTag | STag content ETag
4099 *
4100 * [41] Attribute ::= Name Eq AttValue
4101 */
4102
4103void
4104htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004105 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004106 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004107 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004108 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004109 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004110 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004111 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004112
Daniel Veillarda03e3652004-11-02 18:45:30 +00004113 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4114 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004115 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004116 return;
4117 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004118
4119 if (ctxt->instate == XML_PARSER_EOF)
4120 return;
4121
Owen Taylor3473f882001-02-23 17:55:21 +00004122 /* Capture start position */
4123 if (ctxt->record_info) {
4124 node_info.begin_pos = ctxt->input->consumed +
4125 (CUR_PTR - ctxt->input->base);
4126 node_info.begin_line = ctxt->input->line;
4127 }
4128
Daniel Veillard597f1c12005-07-03 23:00:18 +00004129 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004130 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004131 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004132 if (CUR == '>')
4133 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004134 return;
4135 }
Owen Taylor3473f882001-02-23 17:55:21 +00004136
4137 /*
4138 * Lookup the info for that element.
4139 */
4140 info = htmlTagLookup(name);
4141 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004142 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4143 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004144 }
4145
4146 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004147 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004148 */
4149 if ((CUR == '/') && (NXT(1) == '>')) {
4150 SKIP(2);
4151 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4152 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004153 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004154 return;
4155 }
4156
4157 if (CUR == '>') {
4158 NEXT;
4159 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004160 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4161 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004162
4163 /*
4164 * end of parsing of this node.
4165 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004166 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004167 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004168 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004169 }
Owen Taylor3473f882001-02-23 17:55:21 +00004170
4171 /*
4172 * Capture end position and add node
4173 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004174 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004175 node_info.end_pos = ctxt->input->consumed +
4176 (CUR_PTR - ctxt->input->base);
4177 node_info.end_line = ctxt->input->line;
4178 node_info.node = ctxt->node;
4179 xmlParserAddNodeInfo(ctxt, &node_info);
4180 }
4181 return;
4182 }
4183
4184 /*
4185 * Check for an Empty Element from DTD definition
4186 */
4187 if ((info != NULL) && (info->empty)) {
4188 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4189 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004190 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004191 return;
4192 }
4193
4194 /*
4195 * Parse the content of the element:
4196 */
4197 currentNode = xmlStrdup(ctxt->name);
4198 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004199 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004200 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004201 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004202 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004203 if (ctxt->nameNr < depth) break;
4204 }
Owen Taylor3473f882001-02-23 17:55:21 +00004205
Owen Taylor3473f882001-02-23 17:55:21 +00004206 /*
4207 * Capture end position and add node
4208 */
4209 if ( currentNode != NULL && ctxt->record_info ) {
4210 node_info.end_pos = ctxt->input->consumed +
4211 (CUR_PTR - ctxt->input->base);
4212 node_info.end_line = ctxt->input->line;
4213 node_info.node = ctxt->node;
4214 xmlParserAddNodeInfo(ctxt, &node_info);
4215 }
William M. Brack76e95df2003-10-18 16:20:14 +00004216 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004217 htmlAutoCloseOnEnd(ctxt);
4218 }
4219
Owen Taylor3473f882001-02-23 17:55:21 +00004220 if (currentNode != NULL)
4221 xmlFree(currentNode);
4222}
4223
4224/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004225 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004226 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004227 *
Owen Taylor3473f882001-02-23 17:55:21 +00004228 * parse an HTML document (and build a tree if using the standard SAX
4229 * interface).
4230 *
4231 * Returns 0, -1 in case of error. the parser context is augmented
4232 * as a result of the parsing.
4233 */
4234
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004235int
Owen Taylor3473f882001-02-23 17:55:21 +00004236htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004237 xmlChar start[4];
4238 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004239 xmlDtdPtr dtd;
4240
Daniel Veillardd0463562001-10-13 09:15:48 +00004241 xmlInitParser();
4242
Owen Taylor3473f882001-02-23 17:55:21 +00004243 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004244
Daniel Veillarda03e3652004-11-02 18:45:30 +00004245 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4246 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4247 "htmlParseDocument: context error\n", NULL, NULL);
4248 return(XML_ERR_INTERNAL_ERROR);
4249 }
4250 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004251 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004252 GROW;
4253 /*
4254 * SAX: beginning of the document processing.
4255 */
4256 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4257 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4258
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004259 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4260 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4261 /*
4262 * Get the 4 first bytes and decode the charset
4263 * if enc != XML_CHAR_ENCODING_NONE
4264 * plug some encoding conversion routines.
4265 */
4266 start[0] = RAW;
4267 start[1] = NXT(1);
4268 start[2] = NXT(2);
4269 start[3] = NXT(3);
4270 enc = xmlDetectCharEncoding(&start[0], 4);
4271 if (enc != XML_CHAR_ENCODING_NONE) {
4272 xmlSwitchEncoding(ctxt, enc);
4273 }
4274 }
4275
Owen Taylor3473f882001-02-23 17:55:21 +00004276 /*
4277 * Wipe out everything which is before the first '<'
4278 */
4279 SKIP_BLANKS;
4280 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004281 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004282 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004283 }
4284
4285 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4286 ctxt->sax->startDocument(ctxt->userData);
4287
4288
4289 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004290 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004291 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004292 while (((CUR == '<') && (NXT(1) == '!') &&
4293 (NXT(2) == '-') && (NXT(3) == '-')) ||
4294 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004295 htmlParseComment(ctxt);
4296 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004297 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004298 }
Owen Taylor3473f882001-02-23 17:55:21 +00004299
4300
4301 /*
4302 * Then possibly doc type declaration(s) and more Misc
4303 * (doctypedecl Misc*)?
4304 */
4305 if ((CUR == '<') && (NXT(1) == '!') &&
4306 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4307 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4308 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4309 (UPP(8) == 'E')) {
4310 htmlParseDocTypeDecl(ctxt);
4311 }
4312 SKIP_BLANKS;
4313
4314 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004315 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004316 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004317 while (((CUR == '<') && (NXT(1) == '!') &&
4318 (NXT(2) == '-') && (NXT(3) == '-')) ||
4319 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004320 htmlParseComment(ctxt);
4321 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004322 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004323 }
Owen Taylor3473f882001-02-23 17:55:21 +00004324
4325 /*
4326 * Time to start parsing the tree itself
4327 */
4328 htmlParseContent(ctxt);
4329
4330 /*
4331 * autoclose
4332 */
4333 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004334 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004335
4336
4337 /*
4338 * SAX: end of the document processing.
4339 */
4340 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4341 ctxt->sax->endDocument(ctxt->userData);
4342
4343 if (ctxt->myDoc != NULL) {
4344 dtd = xmlGetIntSubset(ctxt->myDoc);
4345 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004346 ctxt->myDoc->intSubset =
4347 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004348 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4349 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4350 }
4351 if (! ctxt->wellFormed) return(-1);
4352 return(0);
4353}
4354
4355
4356/************************************************************************
4357 * *
4358 * Parser contexts handling *
4359 * *
4360 ************************************************************************/
4361
4362/**
William M. Brackedb65a72004-02-06 07:36:04 +00004363 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004364 * @ctxt: an HTML parser context
4365 *
4366 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004367 *
4368 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004369 */
4370
Daniel Veillardf403d292003-10-05 13:51:35 +00004371static int
Owen Taylor3473f882001-02-23 17:55:21 +00004372htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4373{
4374 htmlSAXHandler *sax;
4375
Daniel Veillardf403d292003-10-05 13:51:35 +00004376 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004377 memset(ctxt, 0, sizeof(htmlParserCtxt));
4378
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004379 ctxt->dict = xmlDictCreate();
4380 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004381 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4382 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004383 }
Owen Taylor3473f882001-02-23 17:55:21 +00004384 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4385 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004386 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4387 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004388 }
4389 else
4390 memset(sax, 0, sizeof(htmlSAXHandler));
4391
4392 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004393 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004394 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4395 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004396 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004397 ctxt->inputNr = 0;
4398 ctxt->inputMax = 0;
4399 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004400 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004401 }
4402 ctxt->inputNr = 0;
4403 ctxt->inputMax = 5;
4404 ctxt->input = NULL;
4405 ctxt->version = NULL;
4406 ctxt->encoding = NULL;
4407 ctxt->standalone = -1;
4408 ctxt->instate = XML_PARSER_START;
4409
4410 /* Allocate the Node stack */
4411 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4412 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004413 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004414 ctxt->nodeNr = 0;
4415 ctxt->nodeMax = 0;
4416 ctxt->node = NULL;
4417 ctxt->inputNr = 0;
4418 ctxt->inputMax = 0;
4419 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004420 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004421 }
4422 ctxt->nodeNr = 0;
4423 ctxt->nodeMax = 10;
4424 ctxt->node = NULL;
4425
4426 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004427 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004428 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004429 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004430 ctxt->nameNr = 0;
4431 ctxt->nameMax = 10;
4432 ctxt->name = NULL;
4433 ctxt->nodeNr = 0;
4434 ctxt->nodeMax = 0;
4435 ctxt->node = NULL;
4436 ctxt->inputNr = 0;
4437 ctxt->inputMax = 0;
4438 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004439 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004440 }
4441 ctxt->nameNr = 0;
4442 ctxt->nameMax = 10;
4443 ctxt->name = NULL;
4444
Daniel Veillard092643b2003-09-25 14:29:29 +00004445 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004446 else {
4447 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004448 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004449 }
4450 ctxt->userData = ctxt;
4451 ctxt->myDoc = NULL;
4452 ctxt->wellFormed = 1;
4453 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004454 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004455 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004456 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004457 ctxt->vctxt.userData = ctxt;
4458 ctxt->vctxt.error = xmlParserValidityError;
4459 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004460 ctxt->record_info = 0;
4461 ctxt->validate = 0;
4462 ctxt->nbChars = 0;
4463 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004464 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004465 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004466 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004467}
4468
4469/**
4470 * htmlFreeParserCtxt:
4471 * @ctxt: an HTML parser context
4472 *
4473 * Free all the memory used by a parser context. However the parsed
4474 * document in ctxt->myDoc is not freed.
4475 */
4476
4477void
4478htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4479{
4480 xmlFreeParserCtxt(ctxt);
4481}
4482
4483/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004484 * htmlNewParserCtxt:
4485 *
4486 * Allocate and initialize a new parser context.
4487 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004488 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004489 */
4490
Daniel Veillard34c647c2006-09-21 06:53:59 +00004491htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004492htmlNewParserCtxt(void)
4493{
4494 xmlParserCtxtPtr ctxt;
4495
4496 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4497 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004498 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004499 return(NULL);
4500 }
4501 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004502 if (htmlInitParserCtxt(ctxt) < 0) {
4503 htmlFreeParserCtxt(ctxt);
4504 return(NULL);
4505 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004506 return(ctxt);
4507}
4508
4509/**
4510 * htmlCreateMemoryParserCtxt:
4511 * @buffer: a pointer to a char array
4512 * @size: the size of the array
4513 *
4514 * Create a parser context for an HTML in-memory document.
4515 *
4516 * Returns the new parser context or NULL
4517 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004518htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004519htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4520 xmlParserCtxtPtr ctxt;
4521 xmlParserInputPtr input;
4522 xmlParserInputBufferPtr buf;
4523
4524 if (buffer == NULL)
4525 return(NULL);
4526 if (size <= 0)
4527 return(NULL);
4528
4529 ctxt = htmlNewParserCtxt();
4530 if (ctxt == NULL)
4531 return(NULL);
4532
4533 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4534 if (buf == NULL) return(NULL);
4535
4536 input = xmlNewInputStream(ctxt);
4537 if (input == NULL) {
4538 xmlFreeParserCtxt(ctxt);
4539 return(NULL);
4540 }
4541
4542 input->filename = NULL;
4543 input->buf = buf;
4544 input->base = input->buf->buffer->content;
4545 input->cur = input->buf->buffer->content;
4546 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4547
4548 inputPush(ctxt, input);
4549 return(ctxt);
4550}
4551
4552/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004553 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004554 * @cur: a pointer to an array of xmlChar
4555 * @encoding: a free form C string describing the HTML document encoding, or NULL
4556 *
4557 * Create a parser context for an HTML document.
4558 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004559 * TODO: check the need to add encoding handling there
4560 *
Owen Taylor3473f882001-02-23 17:55:21 +00004561 * Returns the new parser context or NULL
4562 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004563static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004564htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004565 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004566 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004567
Daniel Veillard1d995272002-07-22 16:43:32 +00004568 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004569 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004570 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004571 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004572 if (ctxt == NULL)
4573 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004574
4575 if (encoding != NULL) {
4576 xmlCharEncoding enc;
4577 xmlCharEncodingHandlerPtr handler;
4578
4579 if (ctxt->input->encoding != NULL)
4580 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004581 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004582
4583 enc = xmlParseCharEncoding(encoding);
4584 /*
4585 * registered set of known encodings
4586 */
4587 if (enc != XML_CHAR_ENCODING_ERROR) {
4588 xmlSwitchEncoding(ctxt, enc);
4589 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004590 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02004591 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00004592 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004593 }
4594 } else {
4595 /*
4596 * fallback for unknown encodings
4597 */
4598 handler = xmlFindCharEncodingHandler((const char *) encoding);
4599 if (handler != NULL) {
4600 xmlSwitchToEncoding(ctxt, handler);
4601 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004602 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4603 "Unsupported encoding %s\n",
4604 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004605 }
4606 }
4607 }
4608 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004609}
4610
Daniel Veillard73b013f2003-09-30 12:36:01 +00004611#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004612/************************************************************************
4613 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02004614 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00004615 * *
4616 ************************************************************************/
4617
4618/**
4619 * htmlParseLookupSequence:
4620 * @ctxt: an HTML parser context
4621 * @first: the first char to lookup
4622 * @next: the next char to lookup or zero
4623 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004624 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004625 *
4626 * Try to find if a sequence (first, next, third) or just (first next) or
4627 * (first) is available in the input stream.
4628 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4629 * to avoid rescanning sequences of bytes, it DOES change the state of the
4630 * parser, do not use liberally.
4631 * This is basically similar to xmlParseLookupSequence()
4632 *
4633 * Returns the index to the current parsing point if the full sequence
4634 * is available, -1 otherwise.
4635 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004636static int
Owen Taylor3473f882001-02-23 17:55:21 +00004637htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02004638 xmlChar next, xmlChar third, int iscomment,
Daniel Veillardeeb99322009-08-25 14:42:16 +02004639 int ignoreattrval)
4640{
Owen Taylor3473f882001-02-23 17:55:21 +00004641 int base, len;
4642 htmlParserInputPtr in;
4643 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004644 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02004645 int invalue = 0;
4646 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00004647
4648 in = ctxt->input;
Daniel Veillardeeb99322009-08-25 14:42:16 +02004649 if (in == NULL)
4650 return (-1);
4651
Owen Taylor3473f882001-02-23 17:55:21 +00004652 base = in->cur - in->base;
Daniel Veillardeeb99322009-08-25 14:42:16 +02004653 if (base < 0)
4654 return (-1);
4655
Owen Taylor3473f882001-02-23 17:55:21 +00004656 if (ctxt->checkIndex > base)
4657 base = ctxt->checkIndex;
Daniel Veillardeeb99322009-08-25 14:42:16 +02004658
Owen Taylor3473f882001-02-23 17:55:21 +00004659 if (in->buf == NULL) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02004660 buf = in->base;
4661 len = in->length;
Owen Taylor3473f882001-02-23 17:55:21 +00004662 } else {
Daniel Veillardeeb99322009-08-25 14:42:16 +02004663 buf = in->buf->buffer->content;
4664 len = in->buf->buffer->use;
Owen Taylor3473f882001-02-23 17:55:21 +00004665 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02004666
Owen Taylor3473f882001-02-23 17:55:21 +00004667 /* take into account the sequence length */
Daniel Veillardeeb99322009-08-25 14:42:16 +02004668 if (third)
4669 len -= 2;
4670 else if (next)
4671 len--;
4672 for (; base < len; base++) {
4673 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
4674 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4675 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4676 incomment = 1;
4677 /* do not increment past <! - some people use <!--> */
4678 base += 2;
4679 }
4680 }
4681 if (ignoreattrval) {
4682 if (buf[base] == '"' || buf[base] == '\'') {
4683 if (invalue) {
4684 if (buf[base] == valdellim) {
4685 invalue = 0;
4686 continue;
4687 }
4688 } else {
4689 valdellim = buf[base];
4690 invalue = 1;
4691 continue;
4692 }
4693 } else if (invalue) {
4694 continue;
4695 }
4696 }
4697 if (incomment) {
4698 if (base + 3 > len)
4699 return (-1);
4700 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4701 (buf[base + 2] == '>')) {
4702 incomment = 0;
4703 base += 2;
4704 }
4705 continue;
4706 }
Owen Taylor3473f882001-02-23 17:55:21 +00004707 if (buf[base] == first) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02004708 if (third != 0) {
4709 if ((buf[base + 1] != next) || (buf[base + 2] != third))
4710 continue;
4711 } else if (next != 0) {
4712 if (buf[base + 1] != next)
4713 continue;
4714 }
4715 ctxt->checkIndex = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004716#ifdef DEBUG_PUSH
Daniel Veillardeeb99322009-08-25 14:42:16 +02004717 if (next == 0)
4718 xmlGenericError(xmlGenericErrorContext,
4719 "HPP: lookup '%c' found at %d\n",
4720 first, base);
4721 else if (third == 0)
4722 xmlGenericError(xmlGenericErrorContext,
4723 "HPP: lookup '%c%c' found at %d\n",
4724 first, next, base);
4725 else
4726 xmlGenericError(xmlGenericErrorContext,
4727 "HPP: lookup '%c%c%c' found at %d\n",
4728 first, next, third, base);
Owen Taylor3473f882001-02-23 17:55:21 +00004729#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02004730 return (base - (in->cur - in->base));
4731 }
Owen Taylor3473f882001-02-23 17:55:21 +00004732 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02004733 if ((!incomment) && (!invalue))
4734 ctxt->checkIndex = base;
Owen Taylor3473f882001-02-23 17:55:21 +00004735#ifdef DEBUG_PUSH
4736 if (next == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02004737 xmlGenericError(xmlGenericErrorContext,
4738 "HPP: lookup '%c' failed\n", first);
Owen Taylor3473f882001-02-23 17:55:21 +00004739 else if (third == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02004740 xmlGenericError(xmlGenericErrorContext,
4741 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02004742 else
Daniel Veillardeeb99322009-08-25 14:42:16 +02004743 xmlGenericError(xmlGenericErrorContext,
4744 "HPP: lookup '%c%c%c' failed\n", first, next,
4745 third);
Owen Taylor3473f882001-02-23 17:55:21 +00004746#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02004747 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004748}
4749
4750/**
Markus Kull56a03032009-08-24 19:00:23 +02004751 * htmlParseLookupChars:
4752 * @ctxt: an HTML parser context
4753 * @stop: Array of chars, which stop the lookup.
4754 * @stopLen: Length of stop-Array
4755 *
4756 * Try to find if any char of the stop-Array is available in the input
4757 * stream.
4758 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4759 * to avoid rescanning sequences of bytes, it DOES change the state of the
4760 * parser, do not use liberally.
4761 *
4762 * Returns the index to the current parsing point if a stopChar
4763 * is available, -1 otherwise.
4764 */
4765static int
4766htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
4767 int stopLen)
4768{
4769 int base, len;
4770 htmlParserInputPtr in;
4771 const xmlChar *buf;
4772 int incomment = 0;
4773 int i;
4774
4775 in = ctxt->input;
4776 if (in == NULL)
4777 return (-1);
4778
4779 base = in->cur - in->base;
4780 if (base < 0)
4781 return (-1);
4782
4783 if (ctxt->checkIndex > base)
4784 base = ctxt->checkIndex;
4785
4786 if (in->buf == NULL) {
4787 buf = in->base;
4788 len = in->length;
4789 } else {
4790 buf = in->buf->buffer->content;
4791 len = in->buf->buffer->use;
4792 }
4793
4794 for (; base < len; base++) {
4795 if (!incomment && (base + 4 < len)) {
4796 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4797 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4798 incomment = 1;
4799 /* do not increment past <! - some people use <!--> */
4800 base += 2;
4801 }
4802 }
4803 if (incomment) {
4804 if (base + 3 > len)
4805 return (-1);
4806 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4807 (buf[base + 2] == '>')) {
4808 incomment = 0;
4809 base += 2;
4810 }
4811 continue;
4812 }
4813 for (i = 0; i < stopLen; ++i) {
4814 if (buf[base] == stop[i]) {
4815 ctxt->checkIndex = 0;
4816 return (base - (in->cur - in->base));
4817 }
4818 }
4819 }
4820 ctxt->checkIndex = base;
4821 return (-1);
4822}
4823
4824/**
Owen Taylor3473f882001-02-23 17:55:21 +00004825 * htmlParseTryOrFinish:
4826 * @ctxt: an HTML parser context
4827 * @terminate: last chunk indicator
4828 *
4829 * Try to progress on parsing
4830 *
4831 * Returns zero if no parsing was possible
4832 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004833static int
Owen Taylor3473f882001-02-23 17:55:21 +00004834htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4835 int ret = 0;
4836 htmlParserInputPtr in;
4837 int avail = 0;
4838 xmlChar cur, next;
4839
4840#ifdef DEBUG_PUSH
4841 switch (ctxt->instate) {
4842 case XML_PARSER_EOF:
4843 xmlGenericError(xmlGenericErrorContext,
4844 "HPP: try EOF\n"); break;
4845 case XML_PARSER_START:
4846 xmlGenericError(xmlGenericErrorContext,
4847 "HPP: try START\n"); break;
4848 case XML_PARSER_MISC:
4849 xmlGenericError(xmlGenericErrorContext,
4850 "HPP: try MISC\n");break;
4851 case XML_PARSER_COMMENT:
4852 xmlGenericError(xmlGenericErrorContext,
4853 "HPP: try COMMENT\n");break;
4854 case XML_PARSER_PROLOG:
4855 xmlGenericError(xmlGenericErrorContext,
4856 "HPP: try PROLOG\n");break;
4857 case XML_PARSER_START_TAG:
4858 xmlGenericError(xmlGenericErrorContext,
4859 "HPP: try START_TAG\n");break;
4860 case XML_PARSER_CONTENT:
4861 xmlGenericError(xmlGenericErrorContext,
4862 "HPP: try CONTENT\n");break;
4863 case XML_PARSER_CDATA_SECTION:
4864 xmlGenericError(xmlGenericErrorContext,
4865 "HPP: try CDATA_SECTION\n");break;
4866 case XML_PARSER_END_TAG:
4867 xmlGenericError(xmlGenericErrorContext,
4868 "HPP: try END_TAG\n");break;
4869 case XML_PARSER_ENTITY_DECL:
4870 xmlGenericError(xmlGenericErrorContext,
4871 "HPP: try ENTITY_DECL\n");break;
4872 case XML_PARSER_ENTITY_VALUE:
4873 xmlGenericError(xmlGenericErrorContext,
4874 "HPP: try ENTITY_VALUE\n");break;
4875 case XML_PARSER_ATTRIBUTE_VALUE:
4876 xmlGenericError(xmlGenericErrorContext,
4877 "HPP: try ATTRIBUTE_VALUE\n");break;
4878 case XML_PARSER_DTD:
4879 xmlGenericError(xmlGenericErrorContext,
4880 "HPP: try DTD\n");break;
4881 case XML_PARSER_EPILOG:
4882 xmlGenericError(xmlGenericErrorContext,
4883 "HPP: try EPILOG\n");break;
4884 case XML_PARSER_PI:
4885 xmlGenericError(xmlGenericErrorContext,
4886 "HPP: try PI\n");break;
4887 case XML_PARSER_SYSTEM_LITERAL:
4888 xmlGenericError(xmlGenericErrorContext,
4889 "HPP: try SYSTEM_LITERAL\n");break;
4890 }
4891#endif
4892
4893 while (1) {
4894
4895 in = ctxt->input;
4896 if (in == NULL) break;
4897 if (in->buf == NULL)
4898 avail = in->length - (in->cur - in->base);
4899 else
4900 avail = in->buf->buffer->use - (in->cur - in->base);
4901 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004902 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004903 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004904 /*
4905 * SAX: end of the document processing.
4906 */
4907 ctxt->instate = XML_PARSER_EOF;
4908 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4909 ctxt->sax->endDocument(ctxt->userData);
4910 }
4911 }
4912 if (avail < 1)
4913 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004914 cur = in->cur[0];
4915 if (cur == 0) {
4916 SKIP(1);
4917 continue;
4918 }
4919
Owen Taylor3473f882001-02-23 17:55:21 +00004920 switch (ctxt->instate) {
4921 case XML_PARSER_EOF:
4922 /*
4923 * Document parsing is done !
4924 */
4925 goto done;
4926 case XML_PARSER_START:
4927 /*
4928 * Very first chars read from the document flow.
4929 */
4930 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004931 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004932 SKIP_BLANKS;
4933 if (in->buf == NULL)
4934 avail = in->length - (in->cur - in->base);
4935 else
4936 avail = in->buf->buffer->use - (in->cur - in->base);
4937 }
4938 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4939 ctxt->sax->setDocumentLocator(ctxt->userData,
4940 &xmlDefaultSAXLocator);
4941 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4942 (!ctxt->disableSAX))
4943 ctxt->sax->startDocument(ctxt->userData);
4944
4945 cur = in->cur[0];
4946 next = in->cur[1];
4947 if ((cur == '<') && (next == '!') &&
4948 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4949 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4950 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4951 (UPP(8) == 'E')) {
4952 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004953 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004954 goto done;
4955#ifdef DEBUG_PUSH
4956 xmlGenericError(xmlGenericErrorContext,
4957 "HPP: Parsing internal subset\n");
4958#endif
4959 htmlParseDocTypeDecl(ctxt);
4960 ctxt->instate = XML_PARSER_PROLOG;
4961#ifdef DEBUG_PUSH
4962 xmlGenericError(xmlGenericErrorContext,
4963 "HPP: entering PROLOG\n");
4964#endif
4965 } else {
4966 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004967#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004968 xmlGenericError(xmlGenericErrorContext,
4969 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004970#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004971 }
Owen Taylor3473f882001-02-23 17:55:21 +00004972 break;
4973 case XML_PARSER_MISC:
4974 SKIP_BLANKS;
4975 if (in->buf == NULL)
4976 avail = in->length - (in->cur - in->base);
4977 else
4978 avail = in->buf->buffer->use - (in->cur - in->base);
4979 if (avail < 2)
4980 goto done;
4981 cur = in->cur[0];
4982 next = in->cur[1];
4983 if ((cur == '<') && (next == '!') &&
4984 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4985 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004986 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004987 goto done;
4988#ifdef DEBUG_PUSH
4989 xmlGenericError(xmlGenericErrorContext,
4990 "HPP: Parsing Comment\n");
4991#endif
4992 htmlParseComment(ctxt);
4993 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004994 } else if ((cur == '<') && (next == '?')) {
4995 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004996 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004997 goto done;
4998#ifdef DEBUG_PUSH
4999 xmlGenericError(xmlGenericErrorContext,
5000 "HPP: Parsing PI\n");
5001#endif
5002 htmlParsePI(ctxt);
5003 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005004 } else if ((cur == '<') && (next == '!') &&
5005 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5006 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5007 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5008 (UPP(8) == 'E')) {
5009 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005010 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005011 goto done;
5012#ifdef DEBUG_PUSH
5013 xmlGenericError(xmlGenericErrorContext,
5014 "HPP: Parsing internal subset\n");
5015#endif
5016 htmlParseDocTypeDecl(ctxt);
5017 ctxt->instate = XML_PARSER_PROLOG;
5018#ifdef DEBUG_PUSH
5019 xmlGenericError(xmlGenericErrorContext,
5020 "HPP: entering PROLOG\n");
5021#endif
5022 } else if ((cur == '<') && (next == '!') &&
5023 (avail < 9)) {
5024 goto done;
5025 } else {
5026 ctxt->instate = XML_PARSER_START_TAG;
5027#ifdef DEBUG_PUSH
5028 xmlGenericError(xmlGenericErrorContext,
5029 "HPP: entering START_TAG\n");
5030#endif
5031 }
5032 break;
5033 case XML_PARSER_PROLOG:
5034 SKIP_BLANKS;
5035 if (in->buf == NULL)
5036 avail = in->length - (in->cur - in->base);
5037 else
5038 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02005039 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00005040 goto done;
5041 cur = in->cur[0];
5042 next = in->cur[1];
5043 if ((cur == '<') && (next == '!') &&
5044 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5045 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005046 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005047 goto done;
5048#ifdef DEBUG_PUSH
5049 xmlGenericError(xmlGenericErrorContext,
5050 "HPP: Parsing Comment\n");
5051#endif
5052 htmlParseComment(ctxt);
5053 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005054 } else if ((cur == '<') && (next == '?')) {
5055 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005056 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005057 goto done;
5058#ifdef DEBUG_PUSH
5059 xmlGenericError(xmlGenericErrorContext,
5060 "HPP: Parsing PI\n");
5061#endif
5062 htmlParsePI(ctxt);
5063 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005064 } else if ((cur == '<') && (next == '!') &&
5065 (avail < 4)) {
5066 goto done;
5067 } else {
5068 ctxt->instate = XML_PARSER_START_TAG;
5069#ifdef DEBUG_PUSH
5070 xmlGenericError(xmlGenericErrorContext,
5071 "HPP: entering START_TAG\n");
5072#endif
5073 }
5074 break;
5075 case XML_PARSER_EPILOG:
5076 if (in->buf == NULL)
5077 avail = in->length - (in->cur - in->base);
5078 else
5079 avail = in->buf->buffer->use - (in->cur - in->base);
5080 if (avail < 1)
5081 goto done;
5082 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005083 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005084 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005085 goto done;
5086 }
5087 if (avail < 2)
5088 goto done;
5089 next = in->cur[1];
5090 if ((cur == '<') && (next == '!') &&
5091 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5092 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005093 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005094 goto done;
5095#ifdef DEBUG_PUSH
5096 xmlGenericError(xmlGenericErrorContext,
5097 "HPP: Parsing Comment\n");
5098#endif
5099 htmlParseComment(ctxt);
5100 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005101 } else if ((cur == '<') && (next == '?')) {
5102 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005103 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005104 goto done;
5105#ifdef DEBUG_PUSH
5106 xmlGenericError(xmlGenericErrorContext,
5107 "HPP: Parsing PI\n");
5108#endif
5109 htmlParsePI(ctxt);
5110 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005111 } else if ((cur == '<') && (next == '!') &&
5112 (avail < 4)) {
5113 goto done;
5114 } else {
5115 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005116 ctxt->wellFormed = 0;
5117 ctxt->instate = XML_PARSER_EOF;
5118#ifdef DEBUG_PUSH
5119 xmlGenericError(xmlGenericErrorContext,
5120 "HPP: entering EOF\n");
5121#endif
5122 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5123 ctxt->sax->endDocument(ctxt->userData);
5124 goto done;
5125 }
5126 break;
5127 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005128 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005129 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005130 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005131
5132 if (avail < 2)
5133 goto done;
5134 cur = in->cur[0];
5135 if (cur != '<') {
5136 ctxt->instate = XML_PARSER_CONTENT;
5137#ifdef DEBUG_PUSH
5138 xmlGenericError(xmlGenericErrorContext,
5139 "HPP: entering CONTENT\n");
5140#endif
5141 break;
5142 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005143 if (in->cur[1] == '/') {
5144 ctxt->instate = XML_PARSER_END_TAG;
5145 ctxt->checkIndex = 0;
5146#ifdef DEBUG_PUSH
5147 xmlGenericError(xmlGenericErrorContext,
5148 "HPP: entering END_TAG\n");
5149#endif
5150 break;
5151 }
Owen Taylor3473f882001-02-23 17:55:21 +00005152 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005153 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005154 goto done;
5155
Daniel Veillard597f1c12005-07-03 23:00:18 +00005156 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005157 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005158 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005159 (name == NULL)) {
5160 if (CUR == '>')
5161 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005162 break;
5163 }
Owen Taylor3473f882001-02-23 17:55:21 +00005164
5165 /*
5166 * Lookup the info for that element.
5167 */
5168 info = htmlTagLookup(name);
5169 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005170 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5171 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005172 }
5173
5174 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005175 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005176 */
5177 if ((CUR == '/') && (NXT(1) == '>')) {
5178 SKIP(2);
5179 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5180 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005181 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005182 ctxt->instate = XML_PARSER_CONTENT;
5183#ifdef DEBUG_PUSH
5184 xmlGenericError(xmlGenericErrorContext,
5185 "HPP: entering CONTENT\n");
5186#endif
5187 break;
5188 }
5189
5190 if (CUR == '>') {
5191 NEXT;
5192 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005193 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5194 "Couldn't find end of Start Tag %s\n",
5195 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005196
5197 /*
5198 * end of parsing of this node.
5199 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005200 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005201 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005202 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005203 }
Owen Taylor3473f882001-02-23 17:55:21 +00005204
5205 ctxt->instate = XML_PARSER_CONTENT;
5206#ifdef DEBUG_PUSH
5207 xmlGenericError(xmlGenericErrorContext,
5208 "HPP: entering CONTENT\n");
5209#endif
5210 break;
5211 }
5212
5213 /*
5214 * Check for an Empty Element from DTD definition
5215 */
5216 if ((info != NULL) && (info->empty)) {
5217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5218 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005219 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005220 }
5221 ctxt->instate = XML_PARSER_CONTENT;
5222#ifdef DEBUG_PUSH
5223 xmlGenericError(xmlGenericErrorContext,
5224 "HPP: entering CONTENT\n");
5225#endif
5226 break;
5227 }
5228 case XML_PARSER_CONTENT: {
5229 long cons;
5230 /*
5231 * Handle preparsed entities and charRef
5232 */
5233 if (ctxt->token != 0) {
5234 xmlChar chr[2] = { 0 , 0 } ;
5235
5236 chr[0] = (xmlChar) ctxt->token;
5237 htmlCheckParagraph(ctxt);
5238 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5239 ctxt->sax->characters(ctxt->userData, chr, 1);
5240 ctxt->token = 0;
5241 ctxt->checkIndex = 0;
5242 }
5243 if ((avail == 1) && (terminate)) {
5244 cur = in->cur[0];
5245 if ((cur != '<') && (cur != '&')) {
5246 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005247 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005248 if (ctxt->sax->ignorableWhitespace != NULL)
5249 ctxt->sax->ignorableWhitespace(
5250 ctxt->userData, &cur, 1);
5251 } else {
5252 htmlCheckParagraph(ctxt);
5253 if (ctxt->sax->characters != NULL)
5254 ctxt->sax->characters(
5255 ctxt->userData, &cur, 1);
5256 }
5257 }
5258 ctxt->token = 0;
5259 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005260 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005261 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005262 }
Owen Taylor3473f882001-02-23 17:55:21 +00005263 }
5264 if (avail < 2)
5265 goto done;
5266 cur = in->cur[0];
5267 next = in->cur[1];
5268 cons = ctxt->nbChars;
5269 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5270 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5271 /*
5272 * Handle SCRIPT/STYLE separately
5273 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005274 if (!terminate) {
5275 int idx;
5276 xmlChar val;
5277
Jiri Netolicky446e1262009-08-07 17:05:36 +02005278 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
Daniel Veillard68716a72006-10-16 09:32:17 +00005279 if (idx < 0)
5280 goto done;
5281 val = in->cur[idx + 2];
5282 if (val == 0) /* bad cut of input */
5283 goto done;
5284 }
Owen Taylor3473f882001-02-23 17:55:21 +00005285 htmlParseScript(ctxt);
5286 if ((cur == '<') && (next == '/')) {
5287 ctxt->instate = XML_PARSER_END_TAG;
5288 ctxt->checkIndex = 0;
5289#ifdef DEBUG_PUSH
5290 xmlGenericError(xmlGenericErrorContext,
5291 "HPP: entering END_TAG\n");
5292#endif
5293 break;
5294 }
5295 } else {
5296 /*
5297 * Sometimes DOCTYPE arrives in the middle of the document
5298 */
5299 if ((cur == '<') && (next == '!') &&
5300 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5301 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5302 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5303 (UPP(8) == 'E')) {
5304 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005305 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005306 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005307 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5308 "Misplaced DOCTYPE declaration\n",
5309 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005310 htmlParseDocTypeDecl(ctxt);
5311 } else if ((cur == '<') && (next == '!') &&
5312 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5313 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005314 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005315 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005316 goto done;
5317#ifdef DEBUG_PUSH
5318 xmlGenericError(xmlGenericErrorContext,
5319 "HPP: Parsing Comment\n");
5320#endif
5321 htmlParseComment(ctxt);
5322 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005323 } else if ((cur == '<') && (next == '?')) {
5324 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005325 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005326 goto done;
5327#ifdef DEBUG_PUSH
5328 xmlGenericError(xmlGenericErrorContext,
5329 "HPP: Parsing PI\n");
5330#endif
5331 htmlParsePI(ctxt);
5332 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005333 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5334 goto done;
5335 } else if ((cur == '<') && (next == '/')) {
5336 ctxt->instate = XML_PARSER_END_TAG;
5337 ctxt->checkIndex = 0;
5338#ifdef DEBUG_PUSH
5339 xmlGenericError(xmlGenericErrorContext,
5340 "HPP: entering END_TAG\n");
5341#endif
5342 break;
5343 } else if (cur == '<') {
5344 ctxt->instate = XML_PARSER_START_TAG;
5345 ctxt->checkIndex = 0;
5346#ifdef DEBUG_PUSH
5347 xmlGenericError(xmlGenericErrorContext,
5348 "HPP: entering START_TAG\n");
5349#endif
5350 break;
5351 } else if (cur == '&') {
5352 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005353 (htmlParseLookupChars(ctxt,
5354 BAD_CAST "; >/", 4) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005355 goto done;
5356#ifdef DEBUG_PUSH
5357 xmlGenericError(xmlGenericErrorContext,
5358 "HPP: Parsing Reference\n");
5359#endif
5360 /* TODO: check generation of subtrees if noent !!! */
5361 htmlParseReference(ctxt);
5362 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005363 /*
5364 * check that the text sequence is complete
5365 * before handing out the data to the parser
5366 * to avoid problems with erroneous end of
5367 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005368 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005369 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005370 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005371 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005372 ctxt->checkIndex = 0;
5373#ifdef DEBUG_PUSH
5374 xmlGenericError(xmlGenericErrorContext,
5375 "HPP: Parsing char data\n");
5376#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005377 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005378 }
5379 }
5380 if (cons == ctxt->nbChars) {
5381 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005382 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5383 "detected an error in element content\n",
5384 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005385 }
5386 NEXT;
5387 break;
5388 }
5389
5390 break;
5391 }
5392 case XML_PARSER_END_TAG:
5393 if (avail < 2)
5394 goto done;
5395 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005396 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005397 goto done;
5398 htmlParseEndTag(ctxt);
5399 if (ctxt->nameNr == 0) {
5400 ctxt->instate = XML_PARSER_EPILOG;
5401 } else {
5402 ctxt->instate = XML_PARSER_CONTENT;
5403 }
5404 ctxt->checkIndex = 0;
5405#ifdef DEBUG_PUSH
5406 xmlGenericError(xmlGenericErrorContext,
5407 "HPP: entering CONTENT\n");
5408#endif
5409 break;
5410 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005411 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5412 "HPP: internal error, state == CDATA\n",
5413 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005414 ctxt->instate = XML_PARSER_CONTENT;
5415 ctxt->checkIndex = 0;
5416#ifdef DEBUG_PUSH
5417 xmlGenericError(xmlGenericErrorContext,
5418 "HPP: entering CONTENT\n");
5419#endif
5420 break;
5421 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005422 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5423 "HPP: internal error, state == DTD\n",
5424 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005425 ctxt->instate = XML_PARSER_CONTENT;
5426 ctxt->checkIndex = 0;
5427#ifdef DEBUG_PUSH
5428 xmlGenericError(xmlGenericErrorContext,
5429 "HPP: entering CONTENT\n");
5430#endif
5431 break;
5432 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005433 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5434 "HPP: internal error, state == COMMENT\n",
5435 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005436 ctxt->instate = XML_PARSER_CONTENT;
5437 ctxt->checkIndex = 0;
5438#ifdef DEBUG_PUSH
5439 xmlGenericError(xmlGenericErrorContext,
5440 "HPP: entering CONTENT\n");
5441#endif
5442 break;
5443 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005444 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5445 "HPP: internal error, state == PI\n",
5446 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005447 ctxt->instate = XML_PARSER_CONTENT;
5448 ctxt->checkIndex = 0;
5449#ifdef DEBUG_PUSH
5450 xmlGenericError(xmlGenericErrorContext,
5451 "HPP: entering CONTENT\n");
5452#endif
5453 break;
5454 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005455 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5456 "HPP: internal error, state == ENTITY_DECL\n",
5457 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005458 ctxt->instate = XML_PARSER_CONTENT;
5459 ctxt->checkIndex = 0;
5460#ifdef DEBUG_PUSH
5461 xmlGenericError(xmlGenericErrorContext,
5462 "HPP: entering CONTENT\n");
5463#endif
5464 break;
5465 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005466 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5467 "HPP: internal error, state == ENTITY_VALUE\n",
5468 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005469 ctxt->instate = XML_PARSER_CONTENT;
5470 ctxt->checkIndex = 0;
5471#ifdef DEBUG_PUSH
5472 xmlGenericError(xmlGenericErrorContext,
5473 "HPP: entering DTD\n");
5474#endif
5475 break;
5476 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005477 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5478 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5479 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005480 ctxt->instate = XML_PARSER_START_TAG;
5481 ctxt->checkIndex = 0;
5482#ifdef DEBUG_PUSH
5483 xmlGenericError(xmlGenericErrorContext,
5484 "HPP: entering START_TAG\n");
5485#endif
5486 break;
5487 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005488 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5489 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5490 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005491 ctxt->instate = XML_PARSER_CONTENT;
5492 ctxt->checkIndex = 0;
5493#ifdef DEBUG_PUSH
5494 xmlGenericError(xmlGenericErrorContext,
5495 "HPP: entering CONTENT\n");
5496#endif
5497 break;
5498 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005499 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5500 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5501 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005502 ctxt->instate = XML_PARSER_CONTENT;
5503 ctxt->checkIndex = 0;
5504#ifdef DEBUG_PUSH
5505 xmlGenericError(xmlGenericErrorContext,
5506 "HPP: entering CONTENT\n");
5507#endif
5508 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005509 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005510 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5511 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5512 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005513 ctxt->instate = XML_PARSER_CONTENT;
5514 ctxt->checkIndex = 0;
5515#ifdef DEBUG_PUSH
5516 xmlGenericError(xmlGenericErrorContext,
5517 "HPP: entering CONTENT\n");
5518#endif
5519 break;
5520
Owen Taylor3473f882001-02-23 17:55:21 +00005521 }
5522 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005523done:
Owen Taylor3473f882001-02-23 17:55:21 +00005524 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005525 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005526 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005527 /*
5528 * SAX: end of the document processing.
5529 */
5530 ctxt->instate = XML_PARSER_EOF;
5531 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5532 ctxt->sax->endDocument(ctxt->userData);
5533 }
5534 }
5535 if ((ctxt->myDoc != NULL) &&
5536 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5537 (ctxt->instate == XML_PARSER_EPILOG))) {
5538 xmlDtdPtr dtd;
5539 dtd = xmlGetIntSubset(ctxt->myDoc);
5540 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02005541 ctxt->myDoc->intSubset =
5542 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005543 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5544 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5545 }
5546#ifdef DEBUG_PUSH
5547 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5548#endif
5549 return(ret);
5550}
5551
5552/**
Owen Taylor3473f882001-02-23 17:55:21 +00005553 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005554 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005555 * @chunk: an char array
5556 * @size: the size in byte of the chunk
5557 * @terminate: last chunk indicator
5558 *
5559 * Parse a Chunk of memory
5560 *
5561 * Returns zero if no error, the xmlParserErrors otherwise.
5562 */
5563int
5564htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5565 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005566 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5567 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5568 "htmlParseChunk: context error\n", NULL, NULL);
5569 return(XML_ERR_INTERNAL_ERROR);
5570 }
Owen Taylor3473f882001-02-23 17:55:21 +00005571 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5572 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5573 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5574 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005575 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02005576
5577 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00005578 if (res < 0) {
5579 ctxt->errNo = XML_PARSER_EOF;
5580 ctxt->disableSAX = 1;
5581 return (XML_PARSER_EOF);
5582 }
Owen Taylor3473f882001-02-23 17:55:21 +00005583 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5584 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005585 ctxt->input->end =
5586 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005587#ifdef DEBUG_PUSH
5588 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5589#endif
5590
Daniel Veillard14f752c2003-08-09 11:44:50 +00005591#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005592 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5593 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005594#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005595 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005596 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5597 xmlParserInputBufferPtr in = ctxt->input->buf;
5598 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5599 (in->raw != NULL)) {
5600 int nbchars;
Daniel Veillarde77db162009-08-22 11:32:38 +02005601
Daniel Veillard14f752c2003-08-09 11:44:50 +00005602 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5603 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005604 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5605 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005606 return(XML_ERR_INVALID_ENCODING);
5607 }
5608 }
5609 }
Owen Taylor3473f882001-02-23 17:55:21 +00005610 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005611 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005612 if (terminate) {
5613 if ((ctxt->instate != XML_PARSER_EOF) &&
5614 (ctxt->instate != XML_PARSER_EPILOG) &&
5615 (ctxt->instate != XML_PARSER_MISC)) {
5616 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005617 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02005618 }
Owen Taylor3473f882001-02-23 17:55:21 +00005619 if (ctxt->instate != XML_PARSER_EOF) {
5620 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5621 ctxt->sax->endDocument(ctxt->userData);
5622 }
5623 ctxt->instate = XML_PARSER_EOF;
5624 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005625 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00005626}
5627
5628/************************************************************************
5629 * *
5630 * User entry points *
5631 * *
5632 ************************************************************************/
5633
5634/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005635 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005636 * @sax: a SAX handler
5637 * @user_data: The user data returned on SAX callbacks
5638 * @chunk: a pointer to an array of chars
5639 * @size: number of chars in the array
5640 * @filename: an optional file name or URI
5641 * @enc: an optional encoding
5642 *
5643 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005644 * The value of @filename is used for fetching external entities
5645 * and error/warning reports.
5646 *
5647 * Returns the new parser context or NULL
5648 */
5649htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005650htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00005651 const char *chunk, int size, const char *filename,
5652 xmlCharEncoding enc) {
5653 htmlParserCtxtPtr ctxt;
5654 htmlParserInputPtr inputStream;
5655 xmlParserInputBufferPtr buf;
5656
Daniel Veillardd0463562001-10-13 09:15:48 +00005657 xmlInitParser();
5658
Owen Taylor3473f882001-02-23 17:55:21 +00005659 buf = xmlAllocParserInputBuffer(enc);
5660 if (buf == NULL) return(NULL);
5661
Daniel Veillardf403d292003-10-05 13:51:35 +00005662 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005663 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005664 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005665 return(NULL);
5666 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005667 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5668 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005669 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005670 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005671 xmlFree(ctxt->sax);
5672 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5673 if (ctxt->sax == NULL) {
5674 xmlFree(buf);
5675 xmlFree(ctxt);
5676 return(NULL);
5677 }
5678 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5679 if (user_data != NULL)
5680 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02005681 }
Owen Taylor3473f882001-02-23 17:55:21 +00005682 if (filename == NULL) {
5683 ctxt->directory = NULL;
5684 } else {
5685 ctxt->directory = xmlParserGetDirectory(filename);
5686 }
5687
5688 inputStream = htmlNewInputStream(ctxt);
5689 if (inputStream == NULL) {
5690 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005691 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005692 return(NULL);
5693 }
5694
5695 if (filename == NULL)
5696 inputStream->filename = NULL;
5697 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005698 inputStream->filename = (char *)
5699 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005700 inputStream->buf = buf;
5701 inputStream->base = inputStream->buf->buffer->content;
5702 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillarde77db162009-08-22 11:32:38 +02005703 inputStream->end =
Daniel Veillard5f704af2003-03-05 10:01:43 +00005704 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005705
5706 inputPush(ctxt, inputStream);
5707
5708 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02005709 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005710 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5711 int cur = ctxt->input->cur - ctxt->input->base;
5712
Daniel Veillarde77db162009-08-22 11:32:38 +02005713 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005714
5715 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5716 ctxt->input->cur = ctxt->input->base + cur;
5717 ctxt->input->end =
5718 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005719#ifdef DEBUG_PUSH
5720 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5721#endif
5722 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005723 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005724
5725 return(ctxt);
5726}
William M. Brack21e4ef22005-01-02 09:53:13 +00005727#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005728
5729/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005730 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005731 * @cur: a pointer to an array of xmlChar
5732 * @encoding: a free form C string describing the HTML document encoding, or NULL
5733 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005734 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005735 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005736 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5737 * to handle parse events. If sax is NULL, fallback to the default DOM
5738 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005739 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005740 * Returns the resulting document tree unless SAX is NULL or the document is
5741 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005742 */
5743
5744htmlDocPtr
5745htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5746 htmlDocPtr ret;
5747 htmlParserCtxtPtr ctxt;
5748
Daniel Veillardd0463562001-10-13 09:15:48 +00005749 xmlInitParser();
5750
Owen Taylor3473f882001-02-23 17:55:21 +00005751 if (cur == NULL) return(NULL);
5752
5753
5754 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5755 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02005756 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005757 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005758 ctxt->sax = sax;
5759 ctxt->userData = userData;
5760 }
5761
5762 htmlParseDocument(ctxt);
5763 ret = ctxt->myDoc;
5764 if (sax != NULL) {
5765 ctxt->sax = NULL;
5766 ctxt->userData = NULL;
5767 }
5768 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005769
Owen Taylor3473f882001-02-23 17:55:21 +00005770 return(ret);
5771}
5772
5773/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005774 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005775 * @cur: a pointer to an array of xmlChar
5776 * @encoding: a free form C string describing the HTML document encoding, or NULL
5777 *
5778 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005779 *
Owen Taylor3473f882001-02-23 17:55:21 +00005780 * Returns the resulting document tree
5781 */
5782
5783htmlDocPtr
5784htmlParseDoc(xmlChar *cur, const char *encoding) {
5785 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5786}
5787
5788
5789/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005790 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005791 * @filename: the filename
5792 * @encoding: a free form C string describing the HTML document encoding, or NULL
5793 *
Daniel Veillarde77db162009-08-22 11:32:38 +02005794 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00005795 * Automatic support for ZLIB/Compress compressed document is provided
5796 * by default if found at compile-time.
5797 *
5798 * Returns the new parser context or NULL
5799 */
5800htmlParserCtxtPtr
5801htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5802{
5803 htmlParserCtxtPtr ctxt;
5804 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005805 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005806 /* htmlCharEncoding enc; */
5807 xmlChar *content, *content_line = (xmlChar *) "charset=";
5808
Daniel Veillarda03e3652004-11-02 18:45:30 +00005809 if (filename == NULL)
5810 return(NULL);
5811
Daniel Veillardf403d292003-10-05 13:51:35 +00005812 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005813 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005814 return(NULL);
5815 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005816 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5817 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005818#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005819 if (xmlDefaultSAXHandler.error != NULL) {
5820 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5821 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005822#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005823 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005824 return(NULL);
5825 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005826
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005827 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5828 xmlFree(canonicFilename);
5829 if (inputStream == NULL) {
5830 xmlFreeParserCtxt(ctxt);
5831 return(NULL);
5832 }
Owen Taylor3473f882001-02-23 17:55:21 +00005833
5834 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005835
Owen Taylor3473f882001-02-23 17:55:21 +00005836 /* set encoding */
5837 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005838 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Daniel Veillarde77db162009-08-22 11:32:38 +02005839 if (content) {
Owen Taylor3473f882001-02-23 17:55:21 +00005840 strcpy ((char *)content, (char *)content_line);
5841 strcat ((char *)content, (char *)encoding);
5842 htmlCheckEncoding (ctxt, content);
5843 xmlFree (content);
5844 }
5845 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005846
Owen Taylor3473f882001-02-23 17:55:21 +00005847 return(ctxt);
5848}
5849
5850/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005851 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005852 * @filename: the filename
5853 * @encoding: a free form C string describing the HTML document encoding, or NULL
5854 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005855 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005856 *
5857 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5858 * compressed document is provided by default if found at compile-time.
5859 * It use the given SAX function block to handle the parsing callback.
5860 * If sax is NULL, fallback to the default DOM tree building routines.
5861 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005862 * Returns the resulting document tree unless SAX is NULL or the document is
5863 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005864 */
5865
5866htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005867htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00005868 void *userData) {
5869 htmlDocPtr ret;
5870 htmlParserCtxtPtr ctxt;
5871 htmlSAXHandlerPtr oldsax = NULL;
5872
Daniel Veillardd0463562001-10-13 09:15:48 +00005873 xmlInitParser();
5874
Owen Taylor3473f882001-02-23 17:55:21 +00005875 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5876 if (ctxt == NULL) return(NULL);
5877 if (sax != NULL) {
5878 oldsax = ctxt->sax;
5879 ctxt->sax = sax;
5880 ctxt->userData = userData;
5881 }
5882
5883 htmlParseDocument(ctxt);
5884
5885 ret = ctxt->myDoc;
5886 if (sax != NULL) {
5887 ctxt->sax = oldsax;
5888 ctxt->userData = NULL;
5889 }
5890 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005891
Owen Taylor3473f882001-02-23 17:55:21 +00005892 return(ret);
5893}
5894
5895/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005896 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005897 * @filename: the filename
5898 * @encoding: a free form C string describing the HTML document encoding, or NULL
5899 *
5900 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5901 * compressed document is provided by default if found at compile-time.
5902 *
5903 * Returns the resulting document tree
5904 */
5905
5906htmlDocPtr
5907htmlParseFile(const char *filename, const char *encoding) {
5908 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5909}
5910
5911/**
5912 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02005913 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00005914 *
5915 * Set and return the previous value for handling HTML omitted tags.
5916 *
5917 * Returns the last value for 0 for no handling, 1 for auto insertion.
5918 */
5919
5920int
5921htmlHandleOmittedElem(int val) {
5922 int old = htmlOmittedDefaultValue;
5923
5924 htmlOmittedDefaultValue = val;
5925 return(old);
5926}
5927
Daniel Veillard930dfb62003-02-05 10:17:38 +00005928/**
5929 * htmlElementAllowedHere:
5930 * @parent: HTML parent element
5931 * @elt: HTML element
5932 *
5933 * Checks whether an HTML element may be a direct child of a parent element.
5934 * Note - doesn't check for deprecated elements
5935 *
5936 * Returns 1 if allowed; 0 otherwise.
5937 */
5938int
5939htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5940 const char** p ;
5941
5942 if ( ! elt || ! parent || ! parent->subelts )
5943 return 0 ;
5944
5945 for ( p = parent->subelts; *p; ++p )
5946 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5947 return 1 ;
5948
5949 return 0 ;
5950}
5951/**
5952 * htmlElementStatusHere:
5953 * @parent: HTML parent element
5954 * @elt: HTML element
5955 *
5956 * Checks whether an HTML element may be a direct child of a parent element.
5957 * and if so whether it is valid or deprecated.
5958 *
5959 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5960 */
5961htmlStatus
5962htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5963 if ( ! parent || ! elt )
5964 return HTML_INVALID ;
5965 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5966 return HTML_INVALID ;
5967
5968 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5969}
5970/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005971 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005972 * @elt: HTML element
5973 * @attr: HTML attribute
5974 * @legacy: whether to allow deprecated attributes
5975 *
5976 * Checks whether an attribute is valid for an element
5977 * Has full knowledge of Required and Deprecated attributes
5978 *
5979 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5980 */
5981htmlStatus
5982htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5983 const char** p ;
5984
5985 if ( !elt || ! attr )
5986 return HTML_INVALID ;
5987
5988 if ( elt->attrs_req )
5989 for ( p = elt->attrs_req; *p; ++p)
5990 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5991 return HTML_REQUIRED ;
5992
5993 if ( elt->attrs_opt )
5994 for ( p = elt->attrs_opt; *p; ++p)
5995 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5996 return HTML_VALID ;
5997
5998 if ( legacy && elt->attrs_depr )
5999 for ( p = elt->attrs_depr; *p; ++p)
6000 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6001 return HTML_DEPRECATED ;
6002
6003 return HTML_INVALID ;
6004}
6005/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006006 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00006007 * @node: an htmlNodePtr in a tree
6008 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00006009 * for Element nodes)
6010 *
6011 * Checks whether the tree node is valid. Experimental (the author
6012 * only uses the HTML enhancements in a SAX parser)
6013 *
6014 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6015 * legacy allowed) or htmlElementStatusHere (otherwise).
6016 * for Attribute nodes, a return from htmlAttrAllowed
6017 * for other nodes, HTML_NA (no checks performed)
6018 */
6019htmlStatus
6020htmlNodeStatus(const htmlNodePtr node, int legacy) {
6021 if ( ! node )
6022 return HTML_INVALID ;
6023
6024 switch ( node->type ) {
6025 case XML_ELEMENT_NODE:
6026 return legacy
6027 ? ( htmlElementAllowedHere (
6028 htmlTagLookup(node->parent->name) , node->name
6029 ) ? HTML_VALID : HTML_INVALID )
6030 : htmlElementStatusHere(
6031 htmlTagLookup(node->parent->name) ,
6032 htmlTagLookup(node->name) )
6033 ;
6034 case XML_ATTRIBUTE_NODE:
6035 return htmlAttrAllowed(
6036 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6037 default: return HTML_NA ;
6038 }
6039}
Daniel Veillard9475a352003-09-26 12:47:50 +00006040/************************************************************************
6041 * *
6042 * New set (2.6.0) of simpler and more flexible APIs *
6043 * *
6044 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope
 *
 * The expansion refers to a variable named dict, which must therefore be
 * visible wherever the macro is used; a NULL dict means the string is
 * always freed. A NULL @str is ignored.
 *
 * NOTE(review): this expands to a bare if statement that already ends in
 * a semicolon, so it is only safe as a full statement of its own (not as
 * the unbraced body of an if that has an else).
 */
#define DICT_FREE(str)						\
	if ((str) && ((!dict) || 				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));
6056
6057/**
6058 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00006059 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00006060 *
6061 * Reset a parser context
6062 */
6063void
6064htmlCtxtReset(htmlParserCtxtPtr ctxt)
6065{
6066 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00006067 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02006068
Daniel Veillarda03e3652004-11-02 18:45:30 +00006069 if (ctxt == NULL)
6070 return;
6071
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006072 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00006073 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00006074
6075 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6076 xmlFreeInputStream(input);
6077 }
6078 ctxt->inputNr = 0;
6079 ctxt->input = NULL;
6080
6081 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00006082 if (ctxt->spaceTab != NULL) {
6083 ctxt->spaceTab[0] = -1;
6084 ctxt->space = &ctxt->spaceTab[0];
6085 } else {
6086 ctxt->space = NULL;
6087 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006088
6089
6090 ctxt->nodeNr = 0;
6091 ctxt->node = NULL;
6092
6093 ctxt->nameNr = 0;
6094 ctxt->name = NULL;
6095
6096 DICT_FREE(ctxt->version);
6097 ctxt->version = NULL;
6098 DICT_FREE(ctxt->encoding);
6099 ctxt->encoding = NULL;
6100 DICT_FREE(ctxt->directory);
6101 ctxt->directory = NULL;
6102 DICT_FREE(ctxt->extSubURI);
6103 ctxt->extSubURI = NULL;
6104 DICT_FREE(ctxt->extSubSystem);
6105 ctxt->extSubSystem = NULL;
6106 if (ctxt->myDoc != NULL)
6107 xmlFreeDoc(ctxt->myDoc);
6108 ctxt->myDoc = NULL;
6109
6110 ctxt->standalone = -1;
6111 ctxt->hasExternalSubset = 0;
6112 ctxt->hasPErefs = 0;
6113 ctxt->html = 1;
6114 ctxt->external = 0;
6115 ctxt->instate = XML_PARSER_START;
6116 ctxt->token = 0;
6117
6118 ctxt->wellFormed = 1;
6119 ctxt->nsWellFormed = 1;
6120 ctxt->valid = 1;
6121 ctxt->vctxt.userData = ctxt;
6122 ctxt->vctxt.error = xmlParserValidityError;
6123 ctxt->vctxt.warning = xmlParserValidityWarning;
6124 ctxt->record_info = 0;
6125 ctxt->nbChars = 0;
6126 ctxt->checkIndex = 0;
6127 ctxt->inSubset = 0;
6128 ctxt->errNo = XML_ERR_OK;
6129 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006130 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006131 ctxt->catalogs = NULL;
6132 xmlInitNodeInfoSeq(&ctxt->node_seq);
6133
6134 if (ctxt->attsDefault != NULL) {
6135 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6136 ctxt->attsDefault = NULL;
6137 }
6138 if (ctxt->attsSpecial != NULL) {
6139 xmlHashFree(ctxt->attsSpecial, NULL);
6140 ctxt->attsSpecial = NULL;
6141 }
6142}
6143
6144/**
6145 * htmlCtxtUseOptions:
6146 * @ctxt: an HTML parser context
6147 * @options: a combination of htmlParserOption(s)
6148 *
6149 * Applies the options to the parser context
6150 *
6151 * Returns 0 in case of success, the set of unknown or unimplemented options
6152 * in case of error.
6153 */
6154int
6155htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6156{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006157 if (ctxt == NULL)
6158 return(-1);
6159
Daniel Veillard9475a352003-09-26 12:47:50 +00006160 if (options & HTML_PARSE_NOWARNING) {
6161 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006162 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006163 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006164 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006165 }
6166 if (options & HTML_PARSE_NOERROR) {
6167 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006168 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006169 ctxt->sax->fatalError = NULL;
6170 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006171 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006172 }
6173 if (options & HTML_PARSE_PEDANTIC) {
6174 ctxt->pedantic = 1;
6175 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006176 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006177 } else
6178 ctxt->pedantic = 0;
6179 if (options & XML_PARSE_NOBLANKS) {
6180 ctxt->keepBlanks = 0;
6181 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6182 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006183 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006184 } else
6185 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006186 if (options & HTML_PARSE_RECOVER) {
6187 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006188 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006189 } else
6190 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006191 if (options & HTML_PARSE_COMPACT) {
6192 ctxt->options |= HTML_PARSE_COMPACT;
6193 options -= HTML_PARSE_COMPACT;
6194 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006195 if (options & XML_PARSE_HUGE) {
6196 ctxt->options |= XML_PARSE_HUGE;
6197 options -= XML_PARSE_HUGE;
6198 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006199 ctxt->dictNames = 0;
6200 return (options);
6201}
6202
6203/**
6204 * htmlDoRead:
6205 * @ctxt: an HTML parser context
6206 * @URL: the base URL to use for the document
6207 * @encoding: the document encoding, or NULL
6208 * @options: a combination of htmlParserOption(s)
6209 * @reuse: keep the context for reuse
6210 *
6211 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006212 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006213 * Returns the resulting document tree or NULL
6214 */
6215static htmlDocPtr
6216htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6217 int options, int reuse)
6218{
6219 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006220
Daniel Veillard9475a352003-09-26 12:47:50 +00006221 htmlCtxtUseOptions(ctxt, options);
6222 ctxt->html = 1;
6223 if (encoding != NULL) {
6224 xmlCharEncodingHandlerPtr hdlr;
6225
6226 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006227 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006228 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006229 if (ctxt->input->encoding != NULL)
6230 xmlFree((xmlChar *) ctxt->input->encoding);
6231 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6232 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006233 }
6234 if ((URL != NULL) && (ctxt->input != NULL) &&
6235 (ctxt->input->filename == NULL))
6236 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6237 htmlParseDocument(ctxt);
6238 ret = ctxt->myDoc;
6239 ctxt->myDoc = NULL;
6240 if (!reuse) {
6241 if ((ctxt->dictNames) &&
6242 (ret != NULL) &&
6243 (ret->dict == ctxt->dict))
6244 ctxt->dict = NULL;
6245 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006246 }
6247 return (ret);
6248}
6249
6250/**
6251 * htmlReadDoc:
6252 * @cur: a pointer to a zero terminated string
6253 * @URL: the base URL to use for the document
6254 * @encoding: the document encoding, or NULL
6255 * @options: a combination of htmlParserOption(s)
6256 *
6257 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006258 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006259 * Returns the resulting document tree
6260 */
6261htmlDocPtr
6262htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6263{
6264 htmlParserCtxtPtr ctxt;
6265
6266 if (cur == NULL)
6267 return (NULL);
6268
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006269 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006270 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006271 if (ctxt == NULL)
6272 return (NULL);
6273 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6274}
6275
6276/**
6277 * htmlReadFile:
6278 * @filename: a file or URL
6279 * @encoding: the document encoding, or NULL
6280 * @options: a combination of htmlParserOption(s)
6281 *
6282 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006283 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006284 * Returns the resulting document tree
6285 */
6286htmlDocPtr
6287htmlReadFile(const char *filename, const char *encoding, int options)
6288{
6289 htmlParserCtxtPtr ctxt;
6290
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006291 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006292 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6293 if (ctxt == NULL)
6294 return (NULL);
6295 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6296}
6297
6298/**
6299 * htmlReadMemory:
6300 * @buffer: a pointer to a char array
6301 * @size: the size of the array
6302 * @URL: the base URL to use for the document
6303 * @encoding: the document encoding, or NULL
6304 * @options: a combination of htmlParserOption(s)
6305 *
6306 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006307 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006308 * Returns the resulting document tree
6309 */
6310htmlDocPtr
6311htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6312{
6313 htmlParserCtxtPtr ctxt;
6314
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006315 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006316 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6317 if (ctxt == NULL)
6318 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006319 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006320 if (ctxt->sax != NULL)
6321 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006322 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6323}
6324
6325/**
6326 * htmlReadFd:
6327 * @fd: an open file descriptor
6328 * @URL: the base URL to use for the document
6329 * @encoding: the document encoding, or NULL
6330 * @options: a combination of htmlParserOption(s)
6331 *
6332 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006333 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006334 * Returns the resulting document tree
6335 */
6336htmlDocPtr
6337htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6338{
6339 htmlParserCtxtPtr ctxt;
6340 xmlParserInputBufferPtr input;
6341 xmlParserInputPtr stream;
6342
6343 if (fd < 0)
6344 return (NULL);
6345
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006346 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006347 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6348 if (input == NULL)
6349 return (NULL);
6350 ctxt = xmlNewParserCtxt();
6351 if (ctxt == NULL) {
6352 xmlFreeParserInputBuffer(input);
6353 return (NULL);
6354 }
6355 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6356 if (stream == NULL) {
6357 xmlFreeParserInputBuffer(input);
6358 xmlFreeParserCtxt(ctxt);
6359 return (NULL);
6360 }
6361 inputPush(ctxt, stream);
6362 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6363}
6364
6365/**
6366 * htmlReadIO:
6367 * @ioread: an I/O read function
6368 * @ioclose: an I/O close function
6369 * @ioctx: an I/O handler
6370 * @URL: the base URL to use for the document
6371 * @encoding: the document encoding, or NULL
6372 * @options: a combination of htmlParserOption(s)
6373 *
6374 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006375 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006376 * Returns the resulting document tree
6377 */
6378htmlDocPtr
6379htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6380 void *ioctx, const char *URL, const char *encoding, int options)
6381{
6382 htmlParserCtxtPtr ctxt;
6383 xmlParserInputBufferPtr input;
6384 xmlParserInputPtr stream;
6385
6386 if (ioread == NULL)
6387 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006388 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006389
6390 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6391 XML_CHAR_ENCODING_NONE);
6392 if (input == NULL)
6393 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006394 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006395 if (ctxt == NULL) {
6396 xmlFreeParserInputBuffer(input);
6397 return (NULL);
6398 }
6399 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6400 if (stream == NULL) {
6401 xmlFreeParserInputBuffer(input);
6402 xmlFreeParserCtxt(ctxt);
6403 return (NULL);
6404 }
6405 inputPush(ctxt, stream);
6406 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6407}
6408
6409/**
6410 * htmlCtxtReadDoc:
6411 * @ctxt: an HTML parser context
6412 * @cur: a pointer to a zero terminated string
6413 * @URL: the base URL to use for the document
6414 * @encoding: the document encoding, or NULL
6415 * @options: a combination of htmlParserOption(s)
6416 *
6417 * parse an XML in-memory document and build a tree.
6418 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006419 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006420 * Returns the resulting document tree
6421 */
6422htmlDocPtr
6423htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6424 const char *URL, const char *encoding, int options)
6425{
6426 xmlParserInputPtr stream;
6427
6428 if (cur == NULL)
6429 return (NULL);
6430 if (ctxt == NULL)
6431 return (NULL);
6432
6433 htmlCtxtReset(ctxt);
6434
6435 stream = xmlNewStringInputStream(ctxt, cur);
6436 if (stream == NULL) {
6437 return (NULL);
6438 }
6439 inputPush(ctxt, stream);
6440 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6441}
6442
6443/**
6444 * htmlCtxtReadFile:
6445 * @ctxt: an HTML parser context
6446 * @filename: a file or URL
6447 * @encoding: the document encoding, or NULL
6448 * @options: a combination of htmlParserOption(s)
6449 *
6450 * parse an XML file from the filesystem or the network.
6451 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006452 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006453 * Returns the resulting document tree
6454 */
6455htmlDocPtr
6456htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6457 const char *encoding, int options)
6458{
6459 xmlParserInputPtr stream;
6460
6461 if (filename == NULL)
6462 return (NULL);
6463 if (ctxt == NULL)
6464 return (NULL);
6465
6466 htmlCtxtReset(ctxt);
6467
Daniel Veillard29614c72004-11-26 10:47:26 +00006468 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006469 if (stream == NULL) {
6470 return (NULL);
6471 }
6472 inputPush(ctxt, stream);
6473 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6474}
6475
6476/**
6477 * htmlCtxtReadMemory:
6478 * @ctxt: an HTML parser context
6479 * @buffer: a pointer to a char array
6480 * @size: the size of the array
6481 * @URL: the base URL to use for the document
6482 * @encoding: the document encoding, or NULL
6483 * @options: a combination of htmlParserOption(s)
6484 *
6485 * parse an XML in-memory document and build a tree.
6486 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006487 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006488 * Returns the resulting document tree
6489 */
6490htmlDocPtr
6491htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6492 const char *URL, const char *encoding, int options)
6493{
6494 xmlParserInputBufferPtr input;
6495 xmlParserInputPtr stream;
6496
6497 if (ctxt == NULL)
6498 return (NULL);
6499 if (buffer == NULL)
6500 return (NULL);
6501
6502 htmlCtxtReset(ctxt);
6503
6504 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6505 if (input == NULL) {
6506 return(NULL);
6507 }
6508
6509 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6510 if (stream == NULL) {
6511 xmlFreeParserInputBuffer(input);
6512 return(NULL);
6513 }
6514
6515 inputPush(ctxt, stream);
6516 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6517}
6518
6519/**
6520 * htmlCtxtReadFd:
6521 * @ctxt: an HTML parser context
6522 * @fd: an open file descriptor
6523 * @URL: the base URL to use for the document
6524 * @encoding: the document encoding, or NULL
6525 * @options: a combination of htmlParserOption(s)
6526 *
6527 * parse an XML from a file descriptor and build a tree.
6528 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006529 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006530 * Returns the resulting document tree
6531 */
6532htmlDocPtr
6533htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6534 const char *URL, const char *encoding, int options)
6535{
6536 xmlParserInputBufferPtr input;
6537 xmlParserInputPtr stream;
6538
6539 if (fd < 0)
6540 return (NULL);
6541 if (ctxt == NULL)
6542 return (NULL);
6543
6544 htmlCtxtReset(ctxt);
6545
6546
6547 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6548 if (input == NULL)
6549 return (NULL);
6550 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6551 if (stream == NULL) {
6552 xmlFreeParserInputBuffer(input);
6553 return (NULL);
6554 }
6555 inputPush(ctxt, stream);
6556 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6557}
6558
6559/**
6560 * htmlCtxtReadIO:
6561 * @ctxt: an HTML parser context
6562 * @ioread: an I/O read function
6563 * @ioclose: an I/O close function
6564 * @ioctx: an I/O handler
6565 * @URL: the base URL to use for the document
6566 * @encoding: the document encoding, or NULL
6567 * @options: a combination of htmlParserOption(s)
6568 *
6569 * parse an HTML document from I/O functions and source and build a tree.
6570 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006571 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006572 * Returns the resulting document tree
6573 */
6574htmlDocPtr
6575htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6576 xmlInputCloseCallback ioclose, void *ioctx,
6577 const char *URL,
6578 const char *encoding, int options)
6579{
6580 xmlParserInputBufferPtr input;
6581 xmlParserInputPtr stream;
6582
6583 if (ioread == NULL)
6584 return (NULL);
6585 if (ctxt == NULL)
6586 return (NULL);
6587
6588 htmlCtxtReset(ctxt);
6589
6590 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6591 XML_CHAR_ENCODING_NONE);
6592 if (input == NULL)
6593 return (NULL);
6594 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6595 if (stream == NULL) {
6596 xmlFreeParserInputBuffer(input);
6597 return (NULL);
6598 }
6599 inputPush(ctxt, stream);
6600 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6601}
6602
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006603#define bottom_HTMLparser
6604#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006605#endif /* LIBXML_HTML_ENABLED */