blob: f63851185a3e3f962c7e6b8d2abd7ad74637f1e2 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020062 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000063 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200150 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167 ctxt->html = 3;
168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 if (ctxt->nameNr >= ctxt->nameMax) {
171 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000172 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000173 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 ctxt->nameMax *
175 sizeof(ctxt->nameTab[0]));
176 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000177 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000178 return (0);
179 }
180 }
181 ctxt->nameTab[ctxt->nameNr] = value;
182 ctxt->name = value;
183 return (ctxt->nameNr++);
184}
185/**
186 * htmlnamePop:
187 * @ctxt: an HTML parser context
188 *
189 * Pops the top element name from the name stack
190 *
191 * Returns the name just removed
192 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000193static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194htmlnamePop(htmlParserCtxtPtr ctxt)
195{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000197
Daniel Veillard1c732d22002-11-30 11:22:59 +0000198 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000199 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000200 ctxt->nameNr--;
201 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 if (ctxt->nameNr > 0)
204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205 else
206 ctxt->name = NULL;
207 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000208 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000209 return (ret);
210}
Owen Taylor3473f882001-02-23 17:55:21 +0000211
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Every macro below refers to a variable named ctxt, so they can only be
 * used where a parser context of that name is in scope.
 *
 * Byte-level macros, no decoding is done; use them only to compare
 * against ASCII values:
 *
 *   CUR_PTR    the current input pointer, ctxt->input->cur.
 *   CUR        the byte at the current position, as an int.
 *   CURRENT    same expansion as CUR.
 *   RAW        -1 when ctxt->token is non-zero, the current byte otherwise.
 *   NXT(n)     the byte n positions after the current one.
 *   UPPER      toupper() of the current byte.
 *   UPP(n)     toupper() of NXT(n).
 *   SKIP(n)    advance the input pointer, nbChars and the column by n.
 *              The line counter is not touched, so only use it on
 *              strings without newlines.
 *
 * Character-level macros:
 *
 *   NEXT            xmlNextChar(ctxt).
 *   NEXTL(l)        advance the input pointer by l bytes, counting one
 *                   character; updates line/col and clears ctxt->token.
 *   CUR_CHAR(l)     htmlCurrentChar(ctxt, &l).
 *   CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l).
 *   COPY_BUF(l,b,i,v) store v in b at index i, as a single byte when
 *                   l == 1 and through xmlCopyChar() otherwise; i is
 *                   advanced by the number of bytes written.
 *
 * Input buffer management:
 *
 *   SHRINK       call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *                bytes are behind the cursor and fewer than that ahead.
 *   GROW         unless ctxt->progressive is set, call xmlParserInputGrow()
 *                when fewer than INPUT_CHUNK bytes are left ahead.
 *   SKIP_BLANKS  htmlSkipBlankChars(ctxt).
 *
 * SHRINK, GROW and COPY_BUF expand to bare if statements: use them as
 * standalone statements, not as the unbraced body of another if/else.
 */

/* Byte-level access */

#define CUR_PTR ctxt->input->cur

#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NXT(val) ctxt->input->cur[(val)]

#define UPPER (toupper(*ctxt->input->cur))

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Input buffer management */

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Character-level access, imported from the XML parser */

#define NEXT xmlNextChar(ctxt)

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
293
294/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200295 * htmlFindEncoding:
296 * @the HTML parser context
297 *
298 * Ty to find and encoding in the current data available in the input
299 * buffer this is needed to try to switch to the proper encoding when
300 * one face a character error.
301 * That's an heuristic, since it's operating outside of parsing it could
302 * try to use a meta which had been commented out, that's the reason it
303 * should only be used in case of error, not as a default.
304 *
305 * Returns an encoding string or NULL if not found, the string need to
306 * be freed
307 */
308static xmlChar *
309htmlFindEncoding(xmlParserCtxtPtr ctxt) {
310 const xmlChar *start, *cur, *end;
311
312 if ((ctxt == NULL) || (ctxt->input == NULL) ||
313 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
314 (ctxt->input->buf->encoder != NULL))
315 return(NULL);
316 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
317 return(NULL);
318
319 start = ctxt->input->cur;
320 end = ctxt->input->end;
321 /* we also expect the input buffer to be zero terminated */
322 if (*end != 0)
323 return(NULL);
324
325 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
326 if (cur == NULL)
327 return(NULL);
328 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
329 if (cur == NULL)
330 return(NULL);
331 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
332 if (cur == NULL)
333 return(NULL);
334 cur += 8;
335 start = cur;
336 while (((*cur >= 'A') && (*cur <= 'Z')) ||
337 ((*cur >= 'a') && (*cur <= 'z')) ||
338 ((*cur >= '0') && (*cur <= '9')) ||
339 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
340 cur++;
341 if (cur == start)
342 return(NULL);
343 return(xmlStrndup(start, cur - start));
344}
345
346/**
Owen Taylor3473f882001-02-23 17:55:21 +0000347 * htmlCurrentChar:
348 * @ctxt: the HTML parser context
349 * @len: pointer to the length of the char read
350 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000351 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000352 * bytes in the input buffer. Implement the end of line normalization:
353 * 2.11 End-of-Line Handling
354 * If the encoding is unspecified, in the case we find an ISO-Latin-1
355 * char, then the encoding converter is plugged in automatically.
356 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000357 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000358 */
359
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000360static int
Owen Taylor3473f882001-02-23 17:55:21 +0000361htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
362 if (ctxt->instate == XML_PARSER_EOF)
363 return(0);
364
365 if (ctxt->token != 0) {
366 *len = 0;
367 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200368 }
Owen Taylor3473f882001-02-23 17:55:21 +0000369 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
370 /*
371 * We are supposed to handle UTF8, check it's valid
372 * From rfc2044: encoding of the Unicode values on UTF-8:
373 *
374 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
375 * 0000 0000-0000 007F 0xxxxxxx
376 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200377 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000378 *
379 * Check for the 0x110000 limit too
380 */
381 const unsigned char *cur = ctxt->input->cur;
382 unsigned char c;
383 unsigned int val;
384
385 c = *cur;
386 if (c & 0x80) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200387 if (cur[1] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000388 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200389 cur = ctxt->input->cur;
390 }
Owen Taylor3473f882001-02-23 17:55:21 +0000391 if ((cur[1] & 0xc0) != 0x80)
392 goto encoding_error;
393 if ((c & 0xe0) == 0xe0) {
394
Adiel Mittmann8a103792009-08-25 11:27:13 +0200395 if (cur[2] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000396 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200397 cur = ctxt->input->cur;
398 }
Owen Taylor3473f882001-02-23 17:55:21 +0000399 if ((cur[2] & 0xc0) != 0x80)
400 goto encoding_error;
401 if ((c & 0xf0) == 0xf0) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200402 if (cur[3] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000403 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200404 cur = ctxt->input->cur;
405 }
Owen Taylor3473f882001-02-23 17:55:21 +0000406 if (((c & 0xf8) != 0xf0) ||
407 ((cur[3] & 0xc0) != 0x80))
408 goto encoding_error;
409 /* 4-byte code */
410 *len = 4;
411 val = (cur[0] & 0x7) << 18;
412 val |= (cur[1] & 0x3f) << 12;
413 val |= (cur[2] & 0x3f) << 6;
414 val |= cur[3] & 0x3f;
415 } else {
416 /* 3-byte code */
417 *len = 3;
418 val = (cur[0] & 0xf) << 12;
419 val |= (cur[1] & 0x3f) << 6;
420 val |= cur[2] & 0x3f;
421 }
422 } else {
423 /* 2-byte code */
424 *len = 2;
425 val = (cur[0] & 0x1f) << 6;
426 val |= cur[1] & 0x3f;
427 }
428 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000429 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
430 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200431 }
Owen Taylor3473f882001-02-23 17:55:21 +0000432 return(val);
433 } else {
Daniel Veillard856c6682009-08-24 18:16:56 +0200434 if ((*ctxt->input->cur == 0) &&
435 (ctxt->input->cur < ctxt->input->end)) {
436 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
437 "Char 0x%X out of allowed range\n", 0);
438 *len = 1;
439 return(' ');
440 }
Owen Taylor3473f882001-02-23 17:55:21 +0000441 /* 1-byte code */
442 *len = 1;
443 return((int) *ctxt->input->cur);
444 }
445 }
446 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000447 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000448 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000449 * XML constructs only use < 128 chars
450 */
451 *len = 1;
452 if ((int) *ctxt->input->cur < 0x80)
453 return((int) *ctxt->input->cur);
454
455 /*
456 * Humm this is bad, do an automatic flow conversion
457 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200458 {
459 xmlChar * guess;
460 xmlCharEncodingHandlerPtr handler;
461
462 guess = htmlFindEncoding(ctxt);
463 if (guess == NULL) {
464 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
465 } else {
466 if (ctxt->input->encoding != NULL)
467 xmlFree((xmlChar *) ctxt->input->encoding);
468 ctxt->input->encoding = guess;
469 handler = xmlFindCharEncodingHandler((const char *) guess);
470 if (handler != NULL) {
471 xmlSwitchToEncoding(ctxt, handler);
472 } else {
473 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
474 "Unsupported encoding %s", guess, NULL);
475 }
476 }
477 ctxt->charset = XML_CHAR_ENCODING_UTF8;
478 }
479
Owen Taylor3473f882001-02-23 17:55:21 +0000480 return(xmlCurrentChar(ctxt, len));
481
482encoding_error:
483 /*
484 * If we detect an UTF8 error that probably mean that the
485 * input encoding didn't get properly advertized in the
486 * declaration header. Report the error and switch the encoding
487 * to ISO-Latin-1 (if you don't like this policy, just declare the
488 * encoding !)
489 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000490 {
491 char buffer[150];
492
Daniel Veillard861101d2007-06-12 08:38:57 +0000493 if (ctxt->input->end - ctxt->input->cur >= 4) {
494 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
495 ctxt->input->cur[0], ctxt->input->cur[1],
496 ctxt->input->cur[2], ctxt->input->cur[3]);
497 } else {
498 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
499 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000500 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
501 "Input is not proper UTF-8, indicate encoding !\n",
502 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000503 }
504
Daniel Veillarde77db162009-08-22 11:32:38 +0200505 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000506 *len = 1;
507 return((int) *ctxt->input->cur);
508}
509
510/**
Owen Taylor3473f882001-02-23 17:55:21 +0000511 * htmlSkipBlankChars:
512 * @ctxt: the HTML parser context
513 *
514 * skip all blanks character found at that point in the input streams.
515 *
516 * Returns the number of space chars skipped
517 */
518
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000519static int
Owen Taylor3473f882001-02-23 17:55:21 +0000520htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
521 int res = 0;
522
William M. Brack76e95df2003-10-18 16:20:14 +0000523 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000524 if ((*ctxt->input->cur == 0) &&
525 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
526 xmlPopInput(ctxt);
527 } else {
528 if (*(ctxt->input->cur) == '\n') {
529 ctxt->input->line++; ctxt->input->col = 1;
530 } else ctxt->input->col++;
531 ctxt->input->cur++;
532 ctxt->nbChars++;
533 if (*ctxt->input->cur == 0)
534 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
535 }
536 res++;
537 }
538 return(res);
539}
540
541
542
543/************************************************************************
544 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200545 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000546 * *
547 ************************************************************************/
548
549/*
550 * Start Tag: 1 means the start tag can be ommited
551 * End Tag: 1 means the end tag can be ommited
552 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000553 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000554 * Depr: this element is deprecated
555 * DTD: 1 means that this element is valid only in the Loose DTD
556 * 2 means that this element is valid only in the Frameset DTD
557 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000558 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000559 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000560 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000561
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each list macro expands to a comma separated run of string literals
 * with no trailing comma; the matching NB_ macro is its element count.
 * Lists built from other lists must put a comma between them: two
 * adjacent string literals are concatenated by the compiler, which
 * would silently merge e.g. "small" and "em" into "smallem".
 * PCDATA and MODIFIER are empty, so they are used without a comma.
 */
#define PCDATA
#define NB_PCDATA 0
#define MODIFIER
#define NB_MODIFIER 0

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4

#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow and inline element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
595
596
/* ... and for HTML Attributes */

/*
 * Each list macro expands to a comma separated run of attribute names
 * with no trailing comma; the matching NB_ macro is its element count.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated attribute name lists shared by several elements */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000616
617
618/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000619static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000620 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
621 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000622static const char* const target_attr[] = { "target", NULL } ;
623static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
624static const char* const alt_attr[] = { "alt", NULL } ;
625static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
626static const char* const href_attrs[] = { "href", NULL } ;
627static const char* const clear_attrs[] = { "clear", NULL } ;
628static const char* const inline_p[] = { INLINE, "p", NULL } ;
629
630static const char* const flow_param[] = { FLOW, "param", NULL } ;
631static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000632 "archive", "alt", "name", "height", "width", "align",
633 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000634static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000635 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000636static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000637 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000638static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
639static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
640static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
641static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000642 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000643static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000644 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
645
646
Daniel Veillard065abe82006-07-03 08:55:04 +0000647static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
648static const char* const col_elt[] = { "col", NULL } ;
649static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
650static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
651static const char* const dl_contents[] = { "dt", "dd", NULL } ;
652static const char* const compact_attr[] = { "compact", NULL } ;
653static const char* const label_attr[] = { "label", NULL } ;
654static const char* const fieldset_contents[] = { FLOW, "legend" } ;
655static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
656static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
657static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
658static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
659static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
660static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
661static const char* const head_attrs[] = { I18N, "profile", NULL } ;
662static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
663static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
664static const char* const version_attr[] = { "version", NULL } ;
665static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
666static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
667static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000668static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000669static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
670static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
671static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
672static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
673static const char* const align_attr[] = { "align", NULL } ;
674static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
675static const char* const map_contents[] = { BLOCK, "area", NULL } ;
676static const char* const name_attr[] = { "name", NULL } ;
677static const char* const action_attr[] = { "action", NULL } ;
678static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
679static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
680static const char* const content_attr[] = { "content", NULL } ;
681static const char* const type_attr[] = { "type", NULL } ;
682static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
683static const char* const object_contents[] = { FLOW, "param", NULL } ;
684static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
685static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
686static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
687static const char* const option_elt[] = { "option", NULL } ;
688static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
689static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
690static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
691static const char* const width_attr[] = { "width", NULL } ;
692static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
693static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
694static const char* const language_attr[] = { "language", NULL } ;
695static const char* const select_content[] = { "optgroup", "option", NULL } ;
696static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
697static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200698static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000699static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
700static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
701static const char* const tr_elt[] = { "tr", NULL } ;
702static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
703static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
704static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
705static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
706static const char* const tr_contents[] = { "th", "td", NULL } ;
707static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
708static const char* const li_elt[] = { "li", NULL } ;
709static const char* const ul_depr[] = { "type", "compact", NULL} ;
710static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000711
712#define DECL (const char**)
713
Daniel Veillard22090732001-07-16 00:06:07 +0000714static const htmlElemDesc
715html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000716{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
717 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
718},
719{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
720 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
721},
722{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
723 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
724},
725{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
726 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
727},
728{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
729 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
730},
731{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
732 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
733},
734{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
735 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
736},
737{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
738 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
739},
740{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
741 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
742},
743{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
744 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
745},
746{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
747 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
748},
749{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
750 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
751},
752{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
753 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
754},
755{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
756 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
757},
758{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
759 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
760},
761{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
762 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
763},
764{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
765 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
766},
767{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
768 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
769},
770{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
771 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
772},
773{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
774 EMPTY , NULL , DECL col_attrs , NULL, NULL
775},
776{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
777 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
778},
779{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
780 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
781},
782{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
783 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
784},
785{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
786 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
787},
788{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
789 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
790},
791{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
792 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
793},
794{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000795 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000796},
797{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
798 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
799},
800{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
801 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
802},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000803{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000804 EMPTY, NULL, DECL embed_attrs, NULL, NULL
805},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000806{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
807 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
808},
809{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
810 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
811},
812{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
813 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
814},
815{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
816 EMPTY, NULL, NULL, DECL frame_attrs, NULL
817},
818{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
819 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
820},
821{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
822 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
823},
824{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
825 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
826},
827{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
828 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
829},
830{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
831 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
832},
833{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
834 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
835},
836{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
837 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
838},
839{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
840 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
841},
842{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
843 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
844},
845{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
846 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
847},
848{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
849 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
850},
851{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
852 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
853},
854{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000855 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000856},
857{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
858 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
859},
860{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
861 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
862},
863{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
864 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
865},
866{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
867 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
868},
869{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
870 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
871},
872{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
873 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
874},
875{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
876 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
877},
878{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
879 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
880},
881{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000882 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000883},
884{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
885 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
886},
887{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
888 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
889},
890{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
891 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
892},
893{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
894 DECL html_flow, "div", DECL html_attrs, NULL, NULL
895},
896{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
897 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
898},
899{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
900 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
901},
902{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000903 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000904},
905{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
906 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
907},
908{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
909 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
910},
911{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000912 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000913},
914{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
915 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
916},
917{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
918 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
919},
920{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
921 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
922},
923{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
924 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
925},
926{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
927 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
928},
929{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
930 DECL select_content, NULL, DECL select_attrs, NULL, NULL
931},
932{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
933 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
934},
935{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
936 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
937},
938{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
939 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
940},
941{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
942 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
943},
944{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
945 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
946},
947{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
948 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
949},
950{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
951 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
952},
953{ "table", 0, 0, 0, 0, 0, 0, 0, "",
954 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
955},
956{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
957 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
958},
959{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
960 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
961},
962{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
963 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
964},
965{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
966 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
967},
968{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
969 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
970},
971{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
972 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
973},
974{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
975 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
976},
977{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
978 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
979},
980{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
981 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
982},
983{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
984 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
985},
986{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
987 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
988},
989{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
990 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
991}
Owen Taylor3473f882001-02-23 17:55:21 +0000992};
993
994/*
Owen Taylor3473f882001-02-23 17:55:21 +0000995 * start tags that imply the end of current element
996 */
Daniel Veillard065abe82006-07-03 08:55:04 +0000997static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000998"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
999 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1000 "listing", "xmp", "head", NULL,
1001"head", "p", NULL,
1002"title", "p", NULL,
1003"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +00001004"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001005"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1006 "pre", "listing", "xmp", "head", "li", NULL,
1007"hr", "p", "head", NULL,
1008"h1", "p", "head", NULL,
1009"h2", "p", "head", NULL,
1010"h3", "p", "head", NULL,
1011"h4", "p", "head", NULL,
1012"h5", "p", "head", NULL,
1013"h6", "p", "head", NULL,
1014"dir", "p", "head", NULL,
1015"address", "p", "head", "ul", NULL,
1016"pre", "p", "head", "ul", NULL,
1017"listing", "p", "head", NULL,
1018"xmp", "p", "head", NULL,
1019"blockquote", "p", "head", NULL,
1020"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1021 "xmp", "head", NULL,
1022"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1023 "head", "dd", NULL,
1024"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1025 "head", "dt", NULL,
1026"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1027 "listing", "xmp", NULL,
1028"ol", "p", "head", "ul", NULL,
1029"menu", "p", "head", "ul", NULL,
Daniel Veillard6339c1a2009-08-24 11:59:51 +02001030"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001031"div", "p", "head", NULL,
1032"noscript", "p", "head", NULL,
1033"center", "font", "b", "i", "p", "head", NULL,
1034"a", "a", NULL,
1035"caption", "p", NULL,
1036"colgroup", "caption", "colgroup", "col", "p", NULL,
1037"col", "caption", "col", "p", NULL,
1038"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1039 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001040"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Daniel Veillarde77db162009-08-22 11:32:38 +02001041"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001042"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1043"thead", "caption", "col", "colgroup", NULL,
1044"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1045 "tbody", "p", NULL,
1046"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1047 "tfoot", "tbody", "p", NULL,
1048"optgroup", "option", NULL,
1049"option", "option", NULL,
1050"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1051 "pre", "listing", "xmp", "a", NULL,
1052NULL
1053};
1054
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL
};
1067
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event handlers).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no NULL terminator; iterate using its element count.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest", so never matched */
    "onchange",
    "onselect"
};
1093
/*
 * Priority table used by the HTML parser to cope with broken pages.
 * Each element gets a priority, which lets the parser decide what to do
 * with an unexpected end tag: an end tag may only close elements whose
 * priority is lower than or equal to its own.
 */

typedef struct {
    const char *name;		/* element name, NULL in the sentinel entry */
    int priority;		/* "endtag" priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }		/* Sentinel: default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001121
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001122static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +00001123static int htmlStartCloseIndexinitialized = 0;
1124
1125/************************************************************************
1126 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001127 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001128 * *
1129 ************************************************************************/
1130
1131/**
1132 * htmlInitAutoClose:
1133 *
1134 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1135 * This is not reentrant. Call xmlInitParser() once before processing in
1136 * case of use in multithreaded programs.
1137 */
1138void
1139htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001140 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001141
1142 if (htmlStartCloseIndexinitialized) return;
1143
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001144 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1145 indx = 0;
1146 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001147 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001148 while (htmlStartClose[i] != NULL) i++;
1149 i++;
1150 }
1151 htmlStartCloseIndexinitialized = 1;
1152}
1153
1154/**
1155 * htmlTagLookup:
1156 * @tag: The tag name in lowercase
1157 *
1158 * Lookup the HTML tag in the ElementTable
1159 *
1160 * Returns the related htmlElemDescPtr or NULL if not found.
1161 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001162const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001163htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001164 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001165
1166 for (i = 0; i < (sizeof(html40ElementTable) /
1167 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001168 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001169 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001170 }
1171 return(NULL);
1172}
1173
1174/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001175 * htmlGetEndPriority:
1176 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001177 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001178 * Return value: The "endtag" priority.
1179 **/
1180static int
1181htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001182 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001183
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001184 while ((htmlEndPriority[i].name != NULL) &&
1185 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1186 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001187
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001188 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001189}
1190
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001191
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001192/**
Owen Taylor3473f882001-02-23 17:55:21 +00001193 * htmlCheckAutoClose:
1194 * @newtag: The new tag name
1195 * @oldtag: The old tag name
1196 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001197 * Checks whether the new tag is one of the registered valid tags for
1198 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001199 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1200 *
1201 * Returns 0 if no, 1 if yes.
1202 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001203static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001204htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1205{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001206 int i, indx;
1207 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001208
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001209 if (htmlStartCloseIndexinitialized == 0)
1210 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001211
1212 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001213 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001214 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001215 if (closed == NULL)
1216 return (0);
1217 if (xmlStrEqual(BAD_CAST * closed, newtag))
1218 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001219 }
1220
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001221 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001222 i++;
1223 while (htmlStartClose[i] != NULL) {
1224 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001225 return (1);
1226 }
1227 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001228 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001229 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001230}
1231
1232/**
1233 * htmlAutoCloseOnClose:
1234 * @ctxt: an HTML parser context
1235 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001236 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001237 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001238 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001239 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001240static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001241htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1242{
1243 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001244 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001245
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001246 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001247
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001248 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001249
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001250 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1251 break;
1252 /*
1253 * A missplaced endtag can only close elements with lower
1254 * or equal priority, so if we find an element with higher
1255 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001256 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001257 */
1258 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1259 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001260 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001261 if (i < 0)
1262 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001263
1264 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001265 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001266 if ((info != NULL) && (info->endTag == 3)) {
1267 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1268 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001269 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001270 }
1271 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1272 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001273 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001274 }
1275}
1276
1277/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001278 * htmlAutoCloseOnEnd:
1279 * @ctxt: an HTML parser context
1280 *
1281 * Close all remaining tags at the end of the stream
1282 */
1283static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001284htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1285{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001286 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001287
William M. Brack899e64a2003-09-26 18:03:42 +00001288 if (ctxt->nameNr == 0)
1289 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001290 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001291 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1292 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001293 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001294 }
1295}
1296
1297/**
Owen Taylor3473f882001-02-23 17:55:21 +00001298 * htmlAutoClose:
1299 * @ctxt: an HTML parser context
1300 * @newtag: The new tag name or NULL
1301 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001302 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001303 * The list is kept in htmlStartClose array. This function is
1304 * called when a new tag has been detected and generates the
1305 * appropriates closes if possible/needed.
1306 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001307 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001308 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001309static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001310htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1311{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001312 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001313 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001314 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1315 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001316 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001317 }
1318 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001319 htmlAutoCloseOnEnd(ctxt);
1320 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001321 }
1322 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001323 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1324 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1325 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001326 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1327 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001328 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001329 }
Owen Taylor3473f882001-02-23 17:55:21 +00001330}
1331
1332/**
1333 * htmlAutoCloseTag:
1334 * @doc: the HTML document
1335 * @name: The tag name
1336 * @elem: the HTML element
1337 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001338 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001339 * The list is kept in htmlStartClose array. This function checks
1340 * if the element or one of it's children would autoclose the
1341 * given tag.
1342 *
1343 * Returns 1 if autoclose, 0 otherwise
1344 */
1345int
1346htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1347 htmlNodePtr child;
1348
1349 if (elem == NULL) return(1);
1350 if (xmlStrEqual(name, elem->name)) return(0);
1351 if (htmlCheckAutoClose(elem->name, name)) return(1);
1352 child = elem->children;
1353 while (child != NULL) {
1354 if (htmlAutoCloseTag(doc, name, child)) return(1);
1355 child = child->next;
1356 }
1357 return(0);
1358}
1359
1360/**
1361 * htmlIsAutoClosed:
1362 * @doc: the HTML document
1363 * @elem: the HTML element
1364 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001365 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001366 * The list is kept in htmlStartClose array. This function checks
1367 * if a tag is autoclosed by one of it's child
1368 *
1369 * Returns 1 if autoclosed, 0 otherwise
1370 */
1371int
1372htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1373 htmlNodePtr child;
1374
1375 if (elem == NULL) return(1);
1376 child = elem->children;
1377 while (child != NULL) {
1378 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1379 child = child->next;
1380 }
1381 return(0);
1382}
1383
1384/**
1385 * htmlCheckImplied:
1386 * @ctxt: an HTML parser context
1387 * @newtag: The new tag name
1388 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001389 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001390 * called when a new tag has been detected and generates the
1391 * appropriates implicit tags if missing
1392 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001393static void
Owen Taylor3473f882001-02-23 17:55:21 +00001394htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardb468f742009-08-24 18:45:33 +02001395 int i;
1396
Owen Taylor3473f882001-02-23 17:55:21 +00001397 if (!htmlOmittedDefaultValue)
1398 return;
1399 if (xmlStrEqual(newtag, BAD_CAST"html"))
1400 return;
1401 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001402 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001403 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1404 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1405 }
1406 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1407 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001408 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001409 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1410 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1411 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1412 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1413 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1414 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001415 if (ctxt->html >= 3) {
1416 /* we already saw or generated an <head> before */
1417 return;
1418 }
1419 /*
1420 * dropped OBJECT ... i you put it first BODY will be
1421 * assumed !
1422 */
1423 htmlnamePush(ctxt, BAD_CAST"head");
1424 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1425 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001426 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1427 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1428 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001429 if (ctxt->html >= 10) {
1430 /* we already saw or generated a <body> before */
1431 return;
1432 }
Owen Taylor3473f882001-02-23 17:55:21 +00001433 for (i = 0;i < ctxt->nameNr;i++) {
1434 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1435 return;
1436 }
1437 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1438 return;
1439 }
1440 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001441
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001442 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001443 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1444 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1445 }
1446}
1447
1448/**
1449 * htmlCheckParagraph
1450 * @ctxt: an HTML parser context
1451 *
1452 * Check whether a p element need to be implied before inserting
1453 * characters in the current element.
1454 *
1455 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1456 * in case of error.
1457 */
1458
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001459static int
Owen Taylor3473f882001-02-23 17:55:21 +00001460htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1461 const xmlChar *tag;
1462 int i;
1463
1464 if (ctxt == NULL)
1465 return(-1);
1466 tag = ctxt->name;
1467 if (tag == NULL) {
1468 htmlAutoClose(ctxt, BAD_CAST"p");
1469 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001470 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001471 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1472 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1473 return(1);
1474 }
1475 if (!htmlOmittedDefaultValue)
1476 return(0);
1477 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1478 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001479 htmlAutoClose(ctxt, BAD_CAST"p");
1480 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001481 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001482 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1483 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1484 return(1);
1485 }
1486 }
1487 return(0);
1488}
1489
1490/**
1491 * htmlIsScriptAttribute:
1492 * @name: an attribute name
1493 *
1494 * Check if an attribute is of content type Script
1495 *
1496 * Returns 1 is the attribute is a script 0 otherwise
1497 */
1498int
1499htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001500 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001501
1502 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001503 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001504 /*
1505 * all script attributes start with 'on'
1506 */
1507 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001508 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001509 for (i = 0;
1510 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1511 i++) {
1512 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1513 return(1);
1514 }
1515 return(0);
1516}
1517
1518/************************************************************************
1519 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001520 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001521 * *
1522 ************************************************************************/
1523
1524
Daniel Veillard22090732001-07-16 00:06:07 +00001525static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001526/*
1527 * the 4 absolute ones, plus apostrophe.
1528 */
1529{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1530{ 38, "amp", "ampersand, U+0026 ISOnum" },
1531{ 39, "apos", "single quote" },
1532{ 60, "lt", "less-than sign, U+003C ISOnum" },
1533{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1534
1535/*
1536 * A bunch still in the 128-255 range
1537 * Replacing them depend really on the charset used.
1538 */
1539{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1540{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1541{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1542{ 163, "pound","pound sign, U+00A3 ISOnum" },
1543{ 164, "curren","currency sign, U+00A4 ISOnum" },
1544{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1545{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1546{ 167, "sect", "section sign, U+00A7 ISOnum" },
1547{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1548{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1549{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1550{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1551{ 172, "not", "not sign, U+00AC ISOnum" },
1552{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1553{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1554{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1555{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1556{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1557{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1558{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1559{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1560{ 181, "micro","micro sign, U+00B5 ISOnum" },
1561{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1562{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1563{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1564{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1565{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1566{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1567{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1568{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1569{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1570{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1571{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1572{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1573{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1574{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1575{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1576{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1577{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1578{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1579{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1580{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1581{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1582{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1583{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1584{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1585{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1586{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1587{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1588{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1589{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1590{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1591{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1592{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1593{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1594{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1595{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1596{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1597{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1598{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1599{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1600{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1601{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1602{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1603{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1604{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1605{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1606{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1607{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1608{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1609{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1610{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1611{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1612{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1613{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1614{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1615{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1616{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1617{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1618{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1619{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1620{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1621{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1622{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1623{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1624{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1625{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1626{ 247, "divide","division sign, U+00F7 ISOnum" },
1627{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1628{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1629{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1630{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1631{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1632{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1633{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1634{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1635
1636{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1637{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1638{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1639{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1640{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1641
1642/*
1643 * Anything below should really be kept as entities references
1644 */
1645{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1646
1647{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1648{ 732, "tilde","small tilde, U+02DC ISOdia" },
1649
1650{ 913, "Alpha","greek capital letter alpha, U+0391" },
1651{ 914, "Beta", "greek capital letter beta, U+0392" },
1652{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1653{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1654{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1655{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1656{ 919, "Eta", "greek capital letter eta, U+0397" },
1657{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1658{ 921, "Iota", "greek capital letter iota, U+0399" },
1659{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001660{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001661{ 924, "Mu", "greek capital letter mu, U+039C" },
1662{ 925, "Nu", "greek capital letter nu, U+039D" },
1663{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1664{ 927, "Omicron","greek capital letter omicron, U+039F" },
1665{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1666{ 929, "Rho", "greek capital letter rho, U+03A1" },
1667{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1668{ 932, "Tau", "greek capital letter tau, U+03A4" },
1669{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1670{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1671{ 935, "Chi", "greek capital letter chi, U+03A7" },
1672{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1673{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1674
1675{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1676{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1677{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1678{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1679{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1680{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1681{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1682{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1683{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1684{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1685{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1686{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1687{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1688{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1689{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1690{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1691{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1692{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1693{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1694{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1695{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1696{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1697{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1698{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1699{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1700{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1701{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1702{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1703
1704{ 8194, "ensp", "en space, U+2002 ISOpub" },
1705{ 8195, "emsp", "em space, U+2003 ISOpub" },
1706{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1707{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1708{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1709{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1710{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1711{ 8211, "ndash","en dash, U+2013 ISOpub" },
1712{ 8212, "mdash","em dash, U+2014 ISOpub" },
1713{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1714{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1715{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1716{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1717{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1718{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1719{ 8224, "dagger","dagger, U+2020 ISOpub" },
1720{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1721
1722{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1723{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1724
1725{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1726
1727{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1728{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1729
1730{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1731{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1732
1733{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1734{ 8260, "frasl","fraction slash, U+2044 NEW" },
1735
1736{ 8364, "euro", "euro sign, U+20AC NEW" },
1737
1738{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1739{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1740{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1741{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1742{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1743{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1744{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1745{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1746{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1747{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1748{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1749{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1750{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1751{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1752{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1753{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1754
1755{ 8704, "forall","for all, U+2200 ISOtech" },
1756{ 8706, "part", "partial differential, U+2202 ISOtech" },
1757{ 8707, "exist","there exists, U+2203 ISOtech" },
1758{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1759{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1760{ 8712, "isin", "element of, U+2208 ISOtech" },
1761{ 8713, "notin","not an element of, U+2209 ISOtech" },
1762{ 8715, "ni", "contains as member, U+220B ISOtech" },
1763{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001764{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001765{ 8722, "minus","minus sign, U+2212 ISOtech" },
1766{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1767{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1768{ 8733, "prop", "proportional to, U+221D ISOtech" },
1769{ 8734, "infin","infinity, U+221E ISOtech" },
1770{ 8736, "ang", "angle, U+2220 ISOamso" },
1771{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1772{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1773{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1774{ 8746, "cup", "union = cup, U+222A ISOtech" },
1775{ 8747, "int", "integral, U+222B ISOtech" },
1776{ 8756, "there4","therefore, U+2234 ISOtech" },
1777{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1778{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1779{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1780{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1781{ 8801, "equiv","identical to, U+2261 ISOtech" },
1782{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1783{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1784{ 8834, "sub", "subset of, U+2282 ISOtech" },
1785{ 8835, "sup", "superset of, U+2283 ISOtech" },
1786{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1787{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1788{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1789{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1790{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1791{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1792{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1793{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1794{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1795{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1796{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1797{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1798{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1799{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1800
1801{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1802{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1803{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1804{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1805
1806};
1807
1808/************************************************************************
1809 * *
1810 * Commodity functions to handle entities *
1811 * *
1812 ************************************************************************/
1813
/*
 * Macro used to double the size of the heap buffer named @buffer.
 *
 * It relies on names taken from the expansion site: the companion
 * variable <buffer>_size holding the current size, and ctxt for error
 * reporting. When the reallocation fails the error is reported through
 * htmlErrMemory(), the old buffer is freed and the enclosing function
 * returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (grown != NULL) {						\
	buffer = grown;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1828
1829/**
1830 * htmlEntityLookup:
1831 * @name: the entity name
1832 *
1833 * Lookup the given entity in EntitiesTable
1834 *
1835 * TODO: the linear scan is really ugly, an hash table is really needed.
1836 *
1837 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1838 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001839const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001840htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001841 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001842
1843 for (i = 0;i < (sizeof(html40EntitiesTable)/
1844 sizeof(html40EntitiesTable[0]));i++) {
1845 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001846 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001847 }
1848 }
1849 return(NULL);
1850}
1851
1852/**
1853 * htmlEntityValueLookup:
1854 * @value: the entity's unicode value
1855 *
1856 * Lookup the given entity in EntitiesTable
1857 *
1858 * TODO: the linear scan is really ugly, an hash table is really needed.
1859 *
1860 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1861 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001862const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001863htmlEntityValueLookup(unsigned int value) {
1864 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001865
1866 for (i = 0;i < (sizeof(html40EntitiesTable)/
1867 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001868 if (html40EntitiesTable[i].value >= value) {
1869 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001870 break;
William M. Brack78637da2003-07-31 14:47:38 +00001871 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001872 }
Owen Taylor3473f882001-02-23 17:55:21 +00001873 }
1874 return(NULL);
1875}
1876
1877/**
1878 * UTF8ToHtml:
1879 * @out: a pointer to an array of bytes to store the result
1880 * @outlen: the length of @out
1881 * @in: a pointer to an array of UTF-8 chars
1882 * @inlen: the length of @in
1883 *
1884 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1885 * plus HTML entities block of chars out.
1886 *
1887 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1888 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001889 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001890 * The value of @outlen after return is the number of octets consumed.
1891 */
1892int
1893UTF8ToHtml(unsigned char* out, int *outlen,
1894 const unsigned char* in, int *inlen) {
1895 const unsigned char* processed = in;
1896 const unsigned char* outend;
1897 const unsigned char* outstart = out;
1898 const unsigned char* instart = in;
1899 const unsigned char* inend;
1900 unsigned int c, d;
1901 int trailing;
1902
Daniel Veillardce682bc2004-11-05 17:22:25 +00001903 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001904 if (in == NULL) {
1905 /*
1906 * initialization nothing to do
1907 */
1908 *outlen = 0;
1909 *inlen = 0;
1910 return(0);
1911 }
1912 inend = in + (*inlen);
1913 outend = out + (*outlen);
1914 while (in < inend) {
1915 d = *in++;
1916 if (d < 0x80) { c= d; trailing= 0; }
1917 else if (d < 0xC0) {
1918 /* trailing byte in leading position */
1919 *outlen = out - outstart;
1920 *inlen = processed - instart;
1921 return(-2);
1922 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1923 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1924 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1925 else {
1926 /* no chance for this in Ascii */
1927 *outlen = out - outstart;
1928 *inlen = processed - instart;
1929 return(-2);
1930 }
1931
1932 if (inend - in < trailing) {
1933 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02001934 }
Owen Taylor3473f882001-02-23 17:55:21 +00001935
1936 for ( ; trailing; trailing--) {
1937 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1938 break;
1939 c <<= 6;
1940 c |= d & 0x3F;
1941 }
1942
1943 /* assertion: c is a single UTF-4 value */
1944 if (c < 0x80) {
1945 if (out + 1 >= outend)
1946 break;
1947 *out++ = c;
1948 } else {
1949 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001950 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001951 const char *cp;
1952 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001953
1954 /*
1955 * Try to lookup a predefined HTML entity for it
1956 */
1957
1958 ent = htmlEntityValueLookup(c);
1959 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001960 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1961 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001962 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001963 else
1964 cp = ent->name;
1965 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001966 if (out + 2 + len >= outend)
1967 break;
1968 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001969 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001970 out += len;
1971 *out++ = ';';
1972 }
1973 processed = in;
1974 }
1975 *outlen = out - outstart;
1976 *inlen = processed - instart;
1977 return(0);
1978}
1979
1980/**
1981 * htmlEncodeEntities:
1982 * @out: a pointer to an array of bytes to store the result
1983 * @outlen: the length of @out
1984 * @in: a pointer to an array of UTF-8 chars
1985 * @inlen: the length of @in
1986 * @quoteChar: the quote character to escape (' or ") or zero.
1987 *
1988 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1989 * plus HTML entities block of chars out.
1990 *
1991 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1992 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001993 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001994 * The value of @outlen after return is the number of octets consumed.
1995 */
1996int
1997htmlEncodeEntities(unsigned char* out, int *outlen,
1998 const unsigned char* in, int *inlen, int quoteChar) {
1999 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002000 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00002001 const unsigned char* outstart = out;
2002 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002003 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00002004 unsigned int c, d;
2005 int trailing;
2006
Daniel Veillardce682bc2004-11-05 17:22:25 +00002007 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2008 return(-1);
2009 outend = out + (*outlen);
2010 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00002011 while (in < inend) {
2012 d = *in++;
2013 if (d < 0x80) { c= d; trailing= 0; }
2014 else if (d < 0xC0) {
2015 /* trailing byte in leading position */
2016 *outlen = out - outstart;
2017 *inlen = processed - instart;
2018 return(-2);
2019 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2020 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2021 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2022 else {
2023 /* no chance for this in Ascii */
2024 *outlen = out - outstart;
2025 *inlen = processed - instart;
2026 return(-2);
2027 }
2028
2029 if (inend - in < trailing)
2030 break;
2031
2032 while (trailing--) {
2033 if (((d= *in++) & 0xC0) != 0x80) {
2034 *outlen = out - outstart;
2035 *inlen = processed - instart;
2036 return(-2);
2037 }
2038 c <<= 6;
2039 c |= d & 0x3F;
2040 }
2041
2042 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002043 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2044 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002045 if (out >= outend)
2046 break;
2047 *out++ = c;
2048 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002049 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002050 const char *cp;
2051 char nbuf[16];
2052 int len;
2053
2054 /*
2055 * Try to lookup a predefined HTML entity for it
2056 */
2057 ent = htmlEntityValueLookup(c);
2058 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002059 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002060 cp = nbuf;
2061 }
2062 else
2063 cp = ent->name;
2064 len = strlen(cp);
2065 if (out + 2 + len > outend)
2066 break;
2067 *out++ = '&';
2068 memcpy(out, cp, len);
2069 out += len;
2070 *out++ = ';';
2071 }
2072 processed = in;
2073 }
2074 *outlen = out - outstart;
2075 *inlen = processed - instart;
2076 return(0);
2077}
2078
Owen Taylor3473f882001-02-23 17:55:21 +00002079/************************************************************************
2080 * *
2081 * Commodity functions to handle streams *
2082 * *
2083 ************************************************************************/
2084
2085/**
Owen Taylor3473f882001-02-23 17:55:21 +00002086 * htmlNewInputStream:
2087 * @ctxt: an HTML parser context
2088 *
2089 * Create a new input stream structure
2090 * Returns the new input stream or NULL
2091 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002092static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002093htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2094 htmlParserInputPtr input;
2095
2096 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2097 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002098 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002099 return(NULL);
2100 }
2101 memset(input, 0, sizeof(htmlParserInput));
2102 input->filename = NULL;
2103 input->directory = NULL;
2104 input->base = NULL;
2105 input->cur = NULL;
2106 input->buf = NULL;
2107 input->line = 1;
2108 input->col = 1;
2109 input->buf = NULL;
2110 input->free = NULL;
2111 input->version = NULL;
2112 input->consumed = 0;
2113 input->length = 0;
2114 return(input);
2115}
2116
2117
2118/************************************************************************
2119 * *
2120 * Commodity functions, cleanup needed ? *
2121 * *
2122 ************************************************************************/
/*
 * All tags allowing PCDATA content in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but doing so would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002137
2138/**
2139 * areBlanks:
2140 * @ctxt: an HTML parser context
2141 * @str: a xmlChar *
2142 * @len: the size of @str
2143 *
2144 * Is this a sequence of blank chars that one can ignore ?
2145 *
2146 * Returns 1 if ignorable 0 otherwise.
2147 */
2148
2149static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002150 unsigned int i;
2151 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002152 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002153 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002154
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002155 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002156 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002157
2158 if (CUR == 0) return(1);
2159 if (CUR != '<') return(0);
2160 if (ctxt->name == NULL)
2161 return(1);
2162 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2163 return(1);
2164 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2165 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002166
2167 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2168 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2169 dtd = xmlGetIntSubset(ctxt->myDoc);
2170 if (dtd != NULL && dtd->ExternalID != NULL) {
2171 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2172 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2173 return(1);
2174 }
2175 }
2176
Owen Taylor3473f882001-02-23 17:55:21 +00002177 if (ctxt->node == NULL) return(0);
2178 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002179 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2180 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002181 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002182 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2183 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002184 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002185 for all tags "b" allowing PCDATA */
2186 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2187 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2188 return(0);
2189 }
2190 }
Owen Taylor3473f882001-02-23 17:55:21 +00002191 } else if (xmlNodeIsText(lastChild)) {
2192 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002193 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002194 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002195 for all tags "p" allowing PCDATA */
2196 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2197 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2198 return(0);
2199 }
2200 }
Owen Taylor3473f882001-02-23 17:55:21 +00002201 }
2202 return(1);
2203}
2204
2205/**
Owen Taylor3473f882001-02-23 17:55:21 +00002206 * htmlNewDocNoDtD:
2207 * @URI: URI for the dtd, or NULL
2208 * @ExternalID: the external ID of the DTD, or NULL
2209 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002210 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2211 * are NULL
2212 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002213 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002214 */
2215htmlDocPtr
2216htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2217 xmlDocPtr cur;
2218
2219 /*
2220 * Allocate a new document and fill the fields.
2221 */
2222 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2223 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002224 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002225 return(NULL);
2226 }
2227 memset(cur, 0, sizeof(xmlDoc));
2228
2229 cur->type = XML_HTML_DOCUMENT_NODE;
2230 cur->version = NULL;
2231 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002232 cur->doc = cur;
2233 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002234 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002235 cur->extSubset = NULL;
2236 cur->oldNs = NULL;
2237 cur->encoding = NULL;
2238 cur->standalone = 1;
2239 cur->compression = 0;
2240 cur->ids = NULL;
2241 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002242 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002243 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002244 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002245 if ((ExternalID != NULL) ||
2246 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002247 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002248 return(cur);
2249}
2250
2251/**
2252 * htmlNewDoc:
2253 * @URI: URI for the dtd, or NULL
2254 * @ExternalID: the external ID of the DTD, or NULL
2255 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002256 * Creates a new HTML document
2257 *
Owen Taylor3473f882001-02-23 17:55:21 +00002258 * Returns a new document
2259 */
2260htmlDocPtr
2261htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2262 if ((URI == NULL) && (ExternalID == NULL))
2263 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002264 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2265 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002266
2267 return(htmlNewDocNoDtD(URI, ExternalID));
2268}
2269
2270
2271/************************************************************************
2272 * *
2273 * The parser itself *
2274 * Relates to http://www.w3.org/TR/html40 *
2275 * *
2276 ************************************************************************/
2277
2278/************************************************************************
2279 * *
2280 * The parser itself *
2281 * *
2282 ************************************************************************/
2283
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002284static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002285
Owen Taylor3473f882001-02-23 17:55:21 +00002286/**
2287 * htmlParseHTMLName:
2288 * @ctxt: an HTML parser context
2289 *
2290 * parse an HTML tag or attribute name, note that we convert it to lowercase
2291 * since HTML names are not case-sensitive.
2292 *
2293 * Returns the Tag Name parsed or NULL
2294 */
2295
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002296static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002297htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002298 int i = 0;
2299 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2300
William M. Brackd1757ab2004-10-02 22:07:48 +00002301 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002302 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002303
2304 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002305 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002306 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2307 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002308 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2309 else loc[i] = CUR;
2310 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002311
Owen Taylor3473f882001-02-23 17:55:21 +00002312 NEXT;
2313 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002314
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002315 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002316}
2317
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002318
2319/**
2320 * htmlParseHTMLName_nonInvasive:
2321 * @ctxt: an HTML parser context
2322 *
2323 * parse an HTML tag or attribute name, note that we convert it to lowercase
2324 * since HTML names are not case-sensitive, this doesn't consume the data
2325 * from the stream, it's a look-ahead
2326 *
2327 * Returns the Tag Name parsed or NULL
2328 */
2329
2330static const xmlChar *
2331htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2332 int i = 0;
2333 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2334
2335 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2336 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002337
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002338 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2339 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2340 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2341 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2342 else loc[i] = NXT(1+i);
2343 i++;
2344 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002345
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002346 return(xmlDictLookup(ctxt->dict, loc, i));
2347}
2348
2349
Owen Taylor3473f882001-02-23 17:55:21 +00002350/**
2351 * htmlParseName:
2352 * @ctxt: an HTML parser context
2353 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002354 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002355 *
2356 * Returns the Name parsed or NULL
2357 */
2358
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002359static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002360htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002361 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002362 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002363 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002364
2365 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002366
2367 /*
2368 * Accelerator for simple ASCII names
2369 */
2370 in = ctxt->input->cur;
2371 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2372 ((*in >= 0x41) && (*in <= 0x5A)) ||
2373 (*in == '_') || (*in == ':')) {
2374 in++;
2375 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2376 ((*in >= 0x41) && (*in <= 0x5A)) ||
2377 ((*in >= 0x30) && (*in <= 0x39)) ||
2378 (*in == '_') || (*in == '-') ||
2379 (*in == ':') || (*in == '.'))
2380 in++;
2381 if ((*in > 0) && (*in < 0x80)) {
2382 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002383 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002384 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002385 ctxt->nbChars += count;
2386 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002387 return(ret);
2388 }
2389 }
2390 return(htmlParseNameComplex(ctxt));
2391}
2392
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002393static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002394htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002395 int len = 0, l;
2396 int c;
2397 int count = 0;
2398
2399 /*
2400 * Handler for more complex cases
2401 */
2402 GROW;
2403 c = CUR_CHAR(l);
2404 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2405 (!IS_LETTER(c) && (c != '_') &&
2406 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002407 return(NULL);
2408 }
2409
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002410 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2411 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2412 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002413 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002414 (IS_COMBINING(c)) ||
2415 (IS_EXTENDER(c)))) {
2416 if (count++ > 100) {
2417 count = 0;
2418 GROW;
2419 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002420 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002421 NEXTL(l);
2422 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002423 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002424 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002425}
2426
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002427
Owen Taylor3473f882001-02-23 17:55:21 +00002428/**
2429 * htmlParseHTMLAttribute:
2430 * @ctxt: an HTML parser context
2431 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002432 *
Owen Taylor3473f882001-02-23 17:55:21 +00002433 * parse an HTML attribute value till the stop (quote), if
2434 * stop is 0 then it stops at the first space
2435 *
2436 * Returns the attribute parsed or NULL
2437 */
2438
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002439static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002440htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2441 xmlChar *buffer = NULL;
2442 int buffer_size = 0;
2443 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002444 const xmlChar *name = NULL;
2445 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002446 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002447
2448 /*
2449 * allocate a translation buffer.
2450 */
2451 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002452 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002453 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002454 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002455 return(NULL);
2456 }
2457 out = buffer;
2458
2459 /*
2460 * Ok loop until we reach one of the ending chars
2461 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002462 while ((CUR != 0) && (CUR != stop)) {
2463 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002464 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002465 if (CUR == '&') {
2466 if (NXT(1) == '#') {
2467 unsigned int c;
2468 int bits;
2469
2470 c = htmlParseCharRef(ctxt);
2471 if (c < 0x80)
2472 { *out++ = c; bits= -6; }
2473 else if (c < 0x800)
2474 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2475 else if (c < 0x10000)
2476 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002477 else
Owen Taylor3473f882001-02-23 17:55:21 +00002478 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002479
Owen Taylor3473f882001-02-23 17:55:21 +00002480 for ( ; bits >= 0; bits-= 6) {
2481 *out++ = ((c >> bits) & 0x3F) | 0x80;
2482 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002483
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002484 if (out - buffer > buffer_size - 100) {
2485 int indx = out - buffer;
2486
2487 growBuffer(buffer);
2488 out = &buffer[indx];
2489 }
Owen Taylor3473f882001-02-23 17:55:21 +00002490 } else {
2491 ent = htmlParseEntityRef(ctxt, &name);
2492 if (name == NULL) {
2493 *out++ = '&';
2494 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002495 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002496
2497 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002498 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002499 }
2500 } else if (ent == NULL) {
2501 *out++ = '&';
2502 cur = name;
2503 while (*cur != 0) {
2504 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002505 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002506
2507 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002508 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002509 }
2510 *out++ = *cur++;
2511 }
Owen Taylor3473f882001-02-23 17:55:21 +00002512 } else {
2513 unsigned int c;
2514 int bits;
2515
2516 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002517 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002518
2519 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002520 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002521 }
Daniel Veillard48519092006-10-17 15:56:35 +00002522 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002523 if (c < 0x80)
2524 { *out++ = c; bits= -6; }
2525 else if (c < 0x800)
2526 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2527 else if (c < 0x10000)
2528 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002529 else
Owen Taylor3473f882001-02-23 17:55:21 +00002530 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002531
Owen Taylor3473f882001-02-23 17:55:21 +00002532 for ( ; bits >= 0; bits-= 6) {
2533 *out++ = ((c >> bits) & 0x3F) | 0x80;
2534 }
Owen Taylor3473f882001-02-23 17:55:21 +00002535 }
2536 }
2537 } else {
2538 unsigned int c;
2539 int bits, l;
2540
2541 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002542 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002543
2544 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002545 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002546 }
2547 c = CUR_CHAR(l);
2548 if (c < 0x80)
2549 { *out++ = c; bits= -6; }
2550 else if (c < 0x800)
2551 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2552 else if (c < 0x10000)
2553 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002554 else
Owen Taylor3473f882001-02-23 17:55:21 +00002555 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002556
Owen Taylor3473f882001-02-23 17:55:21 +00002557 for ( ; bits >= 0; bits-= 6) {
2558 *out++ = ((c >> bits) & 0x3F) | 0x80;
2559 }
2560 NEXT;
2561 }
2562 }
Daniel Veillard13cee4e2009-09-05 14:52:55 +02002563 *out = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002564 return(buffer);
2565}
2566
2567/**
Owen Taylor3473f882001-02-23 17:55:21 +00002568 * htmlParseEntityRef:
2569 * @ctxt: an HTML parser context
2570 * @str: location to store the entity name
2571 *
2572 * parse an HTML ENTITY references
2573 *
2574 * [68] EntityRef ::= '&' Name ';'
2575 *
2576 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2577 * if non-NULL *str will have to be freed by the caller.
2578 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002579const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002580htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2581 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002582 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002583
2584 if (str != NULL) *str = NULL;
2585 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002586
2587 if (CUR == '&') {
2588 NEXT;
2589 name = htmlParseName(ctxt);
2590 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002591 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2592 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002593 } else {
2594 GROW;
2595 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002596 if (str != NULL)
2597 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002598
2599 /*
2600 * Lookup the entity in the table.
2601 */
2602 ent = htmlEntityLookup(name);
2603 if (ent != NULL) /* OK that's ugly !!! */
2604 NEXT;
2605 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002606 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2607 "htmlParseEntityRef: expecting ';'\n",
2608 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002609 if (str != NULL)
2610 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002611 }
2612 }
2613 }
2614 return(ent);
2615}
2616
2617/**
2618 * htmlParseAttValue:
2619 * @ctxt: an HTML parser context
2620 *
2621 * parse a value for an attribute
2622 * Note: the parser won't do substitution of entities here, this
2623 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002624 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002625 *
2626 * Returns the AttValue parsed or NULL.
2627 */
2628
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002629static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002630htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2631 xmlChar *ret = NULL;
2632
2633 if (CUR == '"') {
2634 NEXT;
2635 ret = htmlParseHTMLAttribute(ctxt, '"');
2636 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002637 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2638 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002639 } else
2640 NEXT;
2641 } else if (CUR == '\'') {
2642 NEXT;
2643 ret = htmlParseHTMLAttribute(ctxt, '\'');
2644 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002645 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2646 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002647 } else
2648 NEXT;
2649 } else {
2650 /*
2651 * That's an HTMLism, the attribute value may not be quoted
2652 */
2653 ret = htmlParseHTMLAttribute(ctxt, 0);
2654 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002655 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2656 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002657 }
2658 }
2659 return(ret);
2660}
2661
2662/**
2663 * htmlParseSystemLiteral:
2664 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002665 *
Owen Taylor3473f882001-02-23 17:55:21 +00002666 * parse an HTML Literal
2667 *
2668 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2669 *
2670 * Returns the SystemLiteral parsed or NULL
2671 */
2672
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002673static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002674htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2675 const xmlChar *q;
2676 xmlChar *ret = NULL;
2677
2678 if (CUR == '"') {
2679 NEXT;
2680 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002681 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002682 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002683 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002684 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2685 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002686 } else {
2687 ret = xmlStrndup(q, CUR_PTR - q);
2688 NEXT;
2689 }
2690 } else if (CUR == '\'') {
2691 NEXT;
2692 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002693 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002694 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002695 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002696 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2697 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002698 } else {
2699 ret = xmlStrndup(q, CUR_PTR - q);
2700 NEXT;
2701 }
2702 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002703 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2704 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002705 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002706
Owen Taylor3473f882001-02-23 17:55:21 +00002707 return(ret);
2708}
2709
2710/**
2711 * htmlParsePubidLiteral:
2712 * @ctxt: an HTML parser context
2713 *
2714 * parse an HTML public literal
2715 *
2716 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2717 *
2718 * Returns the PubidLiteral parsed or NULL.
2719 */
2720
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002721static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002722htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2723 const xmlChar *q;
2724 xmlChar *ret = NULL;
2725 /*
2726 * Name ::= (Letter | '_') (NameChar)*
2727 */
2728 if (CUR == '"') {
2729 NEXT;
2730 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002731 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002732 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002733 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2734 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002735 } else {
2736 ret = xmlStrndup(q, CUR_PTR - q);
2737 NEXT;
2738 }
2739 } else if (CUR == '\'') {
2740 NEXT;
2741 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002742 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002743 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002744 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002745 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2746 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002747 } else {
2748 ret = xmlStrndup(q, CUR_PTR - q);
2749 NEXT;
2750 }
2751 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002752 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2753 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002754 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002755
Owen Taylor3473f882001-02-23 17:55:21 +00002756 return(ret);
2757}
2758
2759/**
2760 * htmlParseScript:
2761 * @ctxt: an HTML parser context
2762 *
2763 * parse the content of an HTML SCRIPT or STYLE element
2764 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2765 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2766 * http://www.w3.org/TR/html4/types.html#type-script
2767 * http://www.w3.org/TR/html4/types.html#h-6.15
2768 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2769 *
2770 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2771 * element and the value of intrinsic event attributes. User agents must
2772 * not evaluate script data as HTML markup but instead must pass it on as
2773 * data to a script engine.
2774 * NOTES:
2775 * - The content is passed like CDATA
2776 * - the attributes for style and scripting "onXXX" are also described
2777 * as CDATA but SGML allows entities references in attributes so their
2778 * processing is identical as other attributes
2779 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002780static void
Owen Taylor3473f882001-02-23 17:55:21 +00002781htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002782 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002783 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002784 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002785
2786 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002787 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002788 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002789 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002790 /*
2791 * One should break here, the specification is clear:
2792 * Authors should therefore escape "</" within the content.
2793 * Escape mechanisms are specific to each scripting or
2794 * style sheet language.
2795 *
2796 * In recovery mode, only break if end tag match the
2797 * current tag, effectively ignoring all tags inside the
2798 * script/style block and treating the entire block as
2799 * CDATA.
2800 */
2801 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002802 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2803 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002804 {
2805 break; /* while */
2806 } else {
2807 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002808 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002809 ctxt->name, NULL);
2810 }
2811 } else {
2812 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002813 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002814 {
2815 break; /* while */
2816 }
2817 }
Owen Taylor3473f882001-02-23 17:55:21 +00002818 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002819 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002820 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2821 if (ctxt->sax->cdataBlock!= NULL) {
2822 /*
2823 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2824 */
2825 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002826 } else if (ctxt->sax->characters != NULL) {
2827 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002828 }
2829 nbchar = 0;
2830 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002831 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002832 NEXTL(l);
2833 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002834 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002835
Daniel Veillard68716a72006-10-16 09:32:17 +00002836 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002837 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2838 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002839 NEXT;
2840 }
2841
2842 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2843 if (ctxt->sax->cdataBlock!= NULL) {
2844 /*
2845 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2846 */
2847 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002848 } else if (ctxt->sax->characters != NULL) {
2849 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002850 }
2851 }
2852}
2853
2854
2855/**
2856 * htmlParseCharData:
2857 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002858 *
2859 * parse a CharData section.
2860 * if we are within a CDATA section ']]>' marks an end of section.
2861 *
2862 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2863 */
2864
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002865static void
2866htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002867 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2868 int nbchar = 0;
2869 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002870 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002871
2872 SHRINK;
2873 cur = CUR_CHAR(l);
2874 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002875 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002876 (cur != 0)) {
2877 if (!(IS_CHAR(cur))) {
2878 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2879 "Invalid char in CDATA 0x%X\n", cur);
2880 } else {
2881 COPY_BUF(l,buf,nbchar,cur);
2882 }
Owen Taylor3473f882001-02-23 17:55:21 +00002883 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2884 /*
2885 * Ok the segment is to be consumed as chars.
2886 */
2887 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2888 if (areBlanks(ctxt, buf, nbchar)) {
2889 if (ctxt->sax->ignorableWhitespace != NULL)
2890 ctxt->sax->ignorableWhitespace(ctxt->userData,
2891 buf, nbchar);
2892 } else {
2893 htmlCheckParagraph(ctxt);
2894 if (ctxt->sax->characters != NULL)
2895 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2896 }
2897 }
2898 nbchar = 0;
2899 }
2900 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002901 chunk++;
2902 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2903 chunk = 0;
2904 SHRINK;
2905 GROW;
2906 }
Owen Taylor3473f882001-02-23 17:55:21 +00002907 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002908 if (cur == 0) {
2909 SHRINK;
2910 GROW;
2911 cur = CUR_CHAR(l);
2912 }
Owen Taylor3473f882001-02-23 17:55:21 +00002913 }
2914 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002915 buf[nbchar] = 0;
2916
Owen Taylor3473f882001-02-23 17:55:21 +00002917 /*
2918 * Ok the segment is to be consumed as chars.
2919 */
2920 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2921 if (areBlanks(ctxt, buf, nbchar)) {
2922 if (ctxt->sax->ignorableWhitespace != NULL)
2923 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2924 } else {
2925 htmlCheckParagraph(ctxt);
2926 if (ctxt->sax->characters != NULL)
2927 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2928 }
2929 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002930 } else {
2931 /*
2932 * Loop detection
2933 */
2934 if (cur == 0)
2935 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002936 }
2937}
2938
2939/**
2940 * htmlParseExternalID:
2941 * @ctxt: an HTML parser context
2942 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002943 *
2944 * Parse an External ID or a Public ID
2945 *
Owen Taylor3473f882001-02-23 17:55:21 +00002946 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2947 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2948 *
2949 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2950 *
2951 * Returns the function returns SystemLiteral and in the second
2952 * case publicID receives PubidLiteral, is strict is off
2953 * it is possible to return NULL and have publicID set.
2954 */
2955
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002956static xmlChar *
2957htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002958 xmlChar *URI = NULL;
2959
2960 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2961 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2962 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2963 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002964 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002965 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2966 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002967 }
2968 SKIP_BLANKS;
2969 URI = htmlParseSystemLiteral(ctxt);
2970 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002971 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2972 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002973 }
2974 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2975 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2976 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2977 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002978 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002979 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2980 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002981 }
2982 SKIP_BLANKS;
2983 *publicID = htmlParsePubidLiteral(ctxt);
2984 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002985 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2986 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2987 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002988 }
2989 SKIP_BLANKS;
2990 if ((CUR == '"') || (CUR == '\'')) {
2991 URI = htmlParseSystemLiteral(ctxt);
2992 }
2993 }
2994 return(URI);
2995}
2996
2997/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002998 * xmlParsePI:
2999 * @ctxt: an XML parser context
3000 *
3001 * parse an XML Processing Instruction.
3002 *
3003 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3004 */
3005static void
3006htmlParsePI(htmlParserCtxtPtr ctxt) {
3007 xmlChar *buf = NULL;
3008 int len = 0;
3009 int size = HTML_PARSER_BUFFER_SIZE;
3010 int cur, l;
3011 const xmlChar *target;
3012 xmlParserInputState state;
3013 int count = 0;
3014
3015 if ((RAW == '<') && (NXT(1) == '?')) {
3016 state = ctxt->instate;
3017 ctxt->instate = XML_PARSER_PI;
3018 /*
3019 * this is a Processing Instruction.
3020 */
3021 SKIP(2);
3022 SHRINK;
3023
3024 /*
3025 * Parse the target name and check for special support like
3026 * namespace.
3027 */
3028 target = htmlParseName(ctxt);
3029 if (target != NULL) {
3030 if (RAW == '>') {
3031 SKIP(1);
3032
3033 /*
3034 * SAX: PI detected.
3035 */
3036 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3037 (ctxt->sax->processingInstruction != NULL))
3038 ctxt->sax->processingInstruction(ctxt->userData,
3039 target, NULL);
3040 ctxt->instate = state;
3041 return;
3042 }
3043 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3044 if (buf == NULL) {
3045 htmlErrMemory(ctxt, NULL);
3046 ctxt->instate = state;
3047 return;
3048 }
3049 cur = CUR;
3050 if (!IS_BLANK(cur)) {
3051 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3052 "ParsePI: PI %s space expected\n", target, NULL);
3053 }
3054 SKIP_BLANKS;
3055 cur = CUR_CHAR(l);
3056 while (IS_CHAR(cur) && (cur != '>')) {
3057 if (len + 5 >= size) {
3058 xmlChar *tmp;
3059
3060 size *= 2;
3061 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3062 if (tmp == NULL) {
3063 htmlErrMemory(ctxt, NULL);
3064 xmlFree(buf);
3065 ctxt->instate = state;
3066 return;
3067 }
3068 buf = tmp;
3069 }
3070 count++;
3071 if (count > 50) {
3072 GROW;
3073 count = 0;
3074 }
3075 COPY_BUF(l,buf,len,cur);
3076 NEXTL(l);
3077 cur = CUR_CHAR(l);
3078 if (cur == 0) {
3079 SHRINK;
3080 GROW;
3081 cur = CUR_CHAR(l);
3082 }
3083 }
3084 buf[len] = 0;
3085 if (cur != '>') {
3086 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3087 "ParsePI: PI %s never end ...\n", target, NULL);
3088 } else {
3089 SKIP(1);
3090
3091 /*
3092 * SAX: PI detected.
3093 */
3094 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3095 (ctxt->sax->processingInstruction != NULL))
3096 ctxt->sax->processingInstruction(ctxt->userData,
3097 target, buf);
3098 }
3099 xmlFree(buf);
3100 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003101 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003102 "PI is not started correctly", NULL, NULL);
3103 }
3104 ctxt->instate = state;
3105 }
3106}
3107
3108/**
Owen Taylor3473f882001-02-23 17:55:21 +00003109 * htmlParseComment:
3110 * @ctxt: an HTML parser context
3111 *
3112 * Parse an XML (SGML) comment <!-- .... -->
3113 *
3114 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3115 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003116static void
Owen Taylor3473f882001-02-23 17:55:21 +00003117htmlParseComment(htmlParserCtxtPtr ctxt) {
3118 xmlChar *buf = NULL;
3119 int len;
3120 int size = HTML_PARSER_BUFFER_SIZE;
3121 int q, ql;
3122 int r, rl;
3123 int cur, l;
3124 xmlParserInputState state;
3125
3126 /*
3127 * Check that there is a comment right here.
3128 */
3129 if ((RAW != '<') || (NXT(1) != '!') ||
3130 (NXT(2) != '-') || (NXT(3) != '-')) return;
3131
3132 state = ctxt->instate;
3133 ctxt->instate = XML_PARSER_COMMENT;
3134 SHRINK;
3135 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003136 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003137 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003138 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003139 ctxt->instate = state;
3140 return;
3141 }
3142 q = CUR_CHAR(ql);
3143 NEXTL(ql);
3144 r = CUR_CHAR(rl);
3145 NEXTL(rl);
3146 cur = CUR_CHAR(l);
3147 len = 0;
3148 while (IS_CHAR(cur) &&
3149 ((cur != '>') ||
3150 (r != '-') || (q != '-'))) {
3151 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003152 xmlChar *tmp;
3153
Owen Taylor3473f882001-02-23 17:55:21 +00003154 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003155 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3156 if (tmp == NULL) {
3157 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003158 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003159 ctxt->instate = state;
3160 return;
3161 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003162 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003163 }
3164 COPY_BUF(ql,buf,len,q);
3165 q = r;
3166 ql = rl;
3167 r = cur;
3168 rl = l;
3169 NEXTL(l);
3170 cur = CUR_CHAR(l);
3171 if (cur == 0) {
3172 SHRINK;
3173 GROW;
3174 cur = CUR_CHAR(l);
3175 }
3176 }
3177 buf[len] = 0;
3178 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003179 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3180 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003181 xmlFree(buf);
3182 } else {
3183 NEXT;
3184 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3185 (!ctxt->disableSAX))
3186 ctxt->sax->comment(ctxt->userData, buf);
3187 xmlFree(buf);
3188 }
3189 ctxt->instate = state;
3190}
3191
3192/**
3193 * htmlParseCharRef:
3194 * @ctxt: an HTML parser context
3195 *
3196 * parse Reference declarations
3197 *
3198 * [66] CharRef ::= '&#' [0-9]+ ';' |
3199 * '&#x' [0-9a-fA-F]+ ';'
3200 *
3201 * Returns the value parsed (as an int)
3202 */
3203int
3204htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3205 int val = 0;
3206
Daniel Veillarda03e3652004-11-02 18:45:30 +00003207 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3208 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3209 "htmlParseCharRef: context error\n",
3210 NULL, NULL);
3211 return(0);
3212 }
Owen Taylor3473f882001-02-23 17:55:21 +00003213 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003214 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003215 SKIP(3);
3216 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003217 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003218 val = val * 16 + (CUR - '0');
3219 else if ((CUR >= 'a') && (CUR <= 'f'))
3220 val = val * 16 + (CUR - 'a') + 10;
3221 else if ((CUR >= 'A') && (CUR <= 'F'))
3222 val = val * 16 + (CUR - 'A') + 10;
3223 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003224 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003225 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003226 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003227 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003228 }
3229 NEXT;
3230 }
3231 if (CUR == ';')
3232 NEXT;
3233 } else if ((CUR == '&') && (NXT(1) == '#')) {
3234 SKIP(2);
3235 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003236 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003237 val = val * 10 + (CUR - '0');
3238 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003239 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003240 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003241 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003242 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003243 }
3244 NEXT;
3245 }
3246 if (CUR == ';')
3247 NEXT;
3248 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003249 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3250 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003251 }
3252 /*
3253 * Check the value IS_CHAR ...
3254 */
3255 if (IS_CHAR(val)) {
3256 return(val);
3257 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003258 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3259 "htmlParseCharRef: invalid xmlChar value %d\n",
3260 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003261 }
3262 return(0);
3263}
3264
3265
3266/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003267 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003268 * @ctxt: an HTML parser context
3269 *
3270 * parse a DOCTYPE declaration
3271 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003272 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003273 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3274 */
3275
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003276static void
Owen Taylor3473f882001-02-23 17:55:21 +00003277htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003278 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003279 xmlChar *ExternalID = NULL;
3280 xmlChar *URI = NULL;
3281
3282 /*
3283 * We know that '<!DOCTYPE' has been detected.
3284 */
3285 SKIP(9);
3286
3287 SKIP_BLANKS;
3288
3289 /*
3290 * Parse the DOCTYPE name.
3291 */
3292 name = htmlParseName(ctxt);
3293 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003294 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3295 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3296 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003297 }
3298 /*
3299 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3300 */
3301
3302 SKIP_BLANKS;
3303
3304 /*
3305 * Check for SystemID and ExternalID
3306 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003307 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003308 SKIP_BLANKS;
3309
3310 /*
3311 * We should be at the end of the DOCTYPE declaration.
3312 */
3313 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003314 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3315 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003316 /* We shouldn't try to resynchronize ... */
3317 }
3318 NEXT;
3319
3320 /*
3321 * Create or update the document accordingly to the DOCTYPE
3322 */
3323 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3324 (!ctxt->disableSAX))
3325 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3326
3327 /*
3328 * Cleanup, since we don't use all those identifiers
3329 */
3330 if (URI != NULL) xmlFree(URI);
3331 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003332}
3333
3334/**
3335 * htmlParseAttribute:
3336 * @ctxt: an HTML parser context
3337 * @value: a xmlChar ** used to store the value of the attribute
3338 *
3339 * parse an attribute
3340 *
3341 * [41] Attribute ::= Name Eq AttValue
3342 *
3343 * [25] Eq ::= S? '=' S?
3344 *
3345 * With namespace:
3346 *
3347 * [NS 11] Attribute ::= QName Eq AttValue
3348 *
3349 * Also the case QName == xmlns:??? is handled independently as a namespace
3350 * definition.
3351 *
3352 * Returns the attribute name, and the value in *value.
3353 */
3354
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003355static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003356htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003357 const xmlChar *name;
3358 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003359
3360 *value = NULL;
3361 name = htmlParseHTMLName(ctxt);
3362 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003363 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3364 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003365 return(NULL);
3366 }
3367
3368 /*
3369 * read the value
3370 */
3371 SKIP_BLANKS;
3372 if (CUR == '=') {
3373 NEXT;
3374 SKIP_BLANKS;
3375 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003376 } else if (htmlIsBooleanAttr(name)) {
3377 /*
3378 * assume a minimized attribute
3379 */
3380 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003381 }
3382
3383 *value = val;
3384 return(name);
3385}
3386
3387/**
3388 * htmlCheckEncoding:
3389 * @ctxt: an HTML parser context
3390 * @attvalue: the attribute value
3391 *
3392 * Checks an http-equiv attribute from a Meta tag to detect
3393 * the encoding
3394 * If a new encoding is detected the parser is switched to decode
3395 * it and pass UTF8
3396 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003397static void
Owen Taylor3473f882001-02-23 17:55:21 +00003398htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3399 const xmlChar *encoding;
3400
3401 if ((ctxt == NULL) || (attvalue == NULL))
3402 return;
3403
Daniel Veillarde77db162009-08-22 11:32:38 +02003404 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003405 if (ctxt->input->encoding != NULL)
3406 return;
3407
3408 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3409 if (encoding != NULL) {
3410 encoding += 8;
3411 } else {
3412 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3413 if (encoding != NULL)
3414 encoding += 9;
3415 }
3416 if (encoding != NULL) {
3417 xmlCharEncoding enc;
3418 xmlCharEncodingHandlerPtr handler;
3419
3420 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3421
3422 if (ctxt->input->encoding != NULL)
3423 xmlFree((xmlChar *) ctxt->input->encoding);
3424 ctxt->input->encoding = xmlStrdup(encoding);
3425
3426 enc = xmlParseCharEncoding((const char *) encoding);
3427 /*
3428 * registered set of known encodings
3429 */
3430 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003431 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003432 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3433 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3434 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3435 (ctxt->input->buf != NULL) &&
3436 (ctxt->input->buf->encoder == NULL)) {
3437 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3438 "htmlCheckEncoding: wrong encoding meta\n",
3439 NULL, NULL);
3440 } else {
3441 xmlSwitchEncoding(ctxt, enc);
3442 }
Owen Taylor3473f882001-02-23 17:55:21 +00003443 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3444 } else {
3445 /*
3446 * fallback for unknown encodings
3447 */
3448 handler = xmlFindCharEncodingHandler((const char *) encoding);
3449 if (handler != NULL) {
3450 xmlSwitchToEncoding(ctxt, handler);
3451 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3452 } else {
3453 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3454 }
3455 }
3456
3457 if ((ctxt->input->buf != NULL) &&
3458 (ctxt->input->buf->encoder != NULL) &&
3459 (ctxt->input->buf->raw != NULL) &&
3460 (ctxt->input->buf->buffer != NULL)) {
3461 int nbchars;
3462 int processed;
3463
3464 /*
3465 * convert as much as possible to the parser reading buffer.
3466 */
3467 processed = ctxt->input->cur - ctxt->input->base;
3468 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3469 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3470 ctxt->input->buf->buffer,
3471 ctxt->input->buf->raw);
3472 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003473 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3474 "htmlCheckEncoding: encoder error\n",
3475 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003476 }
3477 ctxt->input->base =
3478 ctxt->input->cur = ctxt->input->buf->buffer->content;
3479 }
3480 }
3481}
3482
3483/**
3484 * htmlCheckMeta:
3485 * @ctxt: an HTML parser context
3486 * @atts: the attributes values
3487 *
3488 * Checks an attributes from a Meta tag
3489 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003490static void
Owen Taylor3473f882001-02-23 17:55:21 +00003491htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3492 int i;
3493 const xmlChar *att, *value;
3494 int http = 0;
3495 const xmlChar *content = NULL;
3496
3497 if ((ctxt == NULL) || (atts == NULL))
3498 return;
3499
3500 i = 0;
3501 att = atts[i++];
3502 while (att != NULL) {
3503 value = atts[i++];
3504 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3505 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3506 http = 1;
3507 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3508 content = value;
3509 att = atts[i++];
3510 }
3511 if ((http) && (content != NULL))
3512 htmlCheckEncoding(ctxt, content);
3513
3514}
3515
3516/**
3517 * htmlParseStartTag:
3518 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003519 *
Owen Taylor3473f882001-02-23 17:55:21 +00003520 * parse a start of tag either for rule element or
3521 * EmptyElement. In both case we don't parse the tag closing chars.
3522 *
3523 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3524 *
3525 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3526 *
3527 * With namespace:
3528 *
3529 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3530 *
3531 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3532 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003533 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003534 */
3535
Daniel Veillard597f1c12005-07-03 23:00:18 +00003536static int
Owen Taylor3473f882001-02-23 17:55:21 +00003537htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003538 const xmlChar *name;
3539 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003540 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003541 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003542 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003543 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003544 int meta = 0;
3545 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003546 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003547
Daniel Veillarde77db162009-08-22 11:32:38 +02003548 if (ctxt->instate == XML_PARSER_EOF)
3549 return(-1);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003550 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3551 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3552 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003553 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003554 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003555 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003556 NEXT;
3557
Daniel Veillard30e76072006-03-09 14:13:55 +00003558 atts = ctxt->atts;
3559 maxatts = ctxt->maxatts;
3560
Owen Taylor3473f882001-02-23 17:55:21 +00003561 GROW;
3562 name = htmlParseHTMLName(ctxt);
3563 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003564 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3565 "htmlParseStartTag: invalid element name\n",
3566 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003567 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003568 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3569 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003570 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003571 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003572 }
3573 if (xmlStrEqual(name, BAD_CAST"meta"))
3574 meta = 1;
3575
3576 /*
3577 * Check for auto-closure of HTML elements.
3578 */
3579 htmlAutoClose(ctxt, name);
3580
3581 /*
3582 * Check for implied HTML elements.
3583 */
3584 htmlCheckImplied(ctxt, name);
3585
3586 /*
3587 * Avoid html at any level > 0, head at any level != 1
3588 * or any attempt to recurse body
3589 */
3590 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003591 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3592 "htmlParseStartTag: misplaced <html> tag\n",
3593 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003594 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003595 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003596 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003597 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003598 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003599 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3600 "htmlParseStartTag: misplaced <head> tag\n",
3601 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003602 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003603 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003604 }
3605 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003606 int indx;
3607 for (indx = 0;indx < ctxt->nameNr;indx++) {
3608 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003609 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3610 "htmlParseStartTag: misplaced <body> tag\n",
3611 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003612 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003613 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003614 }
3615 }
3616 }
3617
3618 /*
3619 * Now parse the attributes, it ends up with the ending
3620 *
3621 * (S Attribute)* S?
3622 */
3623 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003624 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003625 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003626 ((CUR != '/') || (NXT(1) != '>'))) {
3627 long cons = ctxt->nbChars;
3628
3629 GROW;
3630 attname = htmlParseAttribute(ctxt, &attvalue);
3631 if (attname != NULL) {
3632
3633 /*
3634 * Well formedness requires at most one declaration of an attribute
3635 */
3636 for (i = 0; i < nbatts;i += 2) {
3637 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003638 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3639 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003640 if (attvalue != NULL)
3641 xmlFree(attvalue);
3642 goto failed;
3643 }
3644 }
3645
3646 /*
3647 * Add the pair to atts
3648 */
3649 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003650 maxatts = 22; /* allow for 10 attrs by default */
3651 atts = (const xmlChar **)
3652 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003653 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003654 htmlErrMemory(ctxt, NULL);
3655 if (attvalue != NULL)
3656 xmlFree(attvalue);
3657 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003658 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003659 ctxt->atts = atts;
3660 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003661 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003662 const xmlChar **n;
3663
Owen Taylor3473f882001-02-23 17:55:21 +00003664 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003665 n = (const xmlChar **) xmlRealloc((void *) atts,
3666 maxatts * sizeof(const xmlChar *));
3667 if (n == NULL) {
3668 htmlErrMemory(ctxt, NULL);
3669 if (attvalue != NULL)
3670 xmlFree(attvalue);
3671 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003672 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003673 atts = n;
3674 ctxt->atts = atts;
3675 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003676 }
3677 atts[nbatts++] = attname;
3678 atts[nbatts++] = attvalue;
3679 atts[nbatts] = NULL;
3680 atts[nbatts + 1] = NULL;
3681 }
3682 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003683 if (attvalue != NULL)
3684 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003685 /* Dump the bogus attribute string up to the next blank or
3686 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003687 while ((IS_CHAR_CH(CUR)) &&
3688 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003689 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003690 NEXT;
3691 }
3692
3693failed:
3694 SKIP_BLANKS;
3695 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003696 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3697 "htmlParseStartTag: problem parsing attributes\n",
3698 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003699 break;
3700 }
3701 }
3702
3703 /*
3704 * Handle specific association to the META tag
3705 */
William M. Bracke978ae22007-03-21 06:16:02 +00003706 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003707 htmlCheckMeta(ctxt, atts);
3708
3709 /*
3710 * SAX: Start of Element !
3711 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003712 if (!discardtag) {
3713 htmlnamePush(ctxt, name);
3714 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3715 if (nbatts != 0)
3716 ctxt->sax->startElement(ctxt->userData, name, atts);
3717 else
3718 ctxt->sax->startElement(ctxt->userData, name, NULL);
3719 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003720 }
Owen Taylor3473f882001-02-23 17:55:21 +00003721
3722 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003723 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003724 if (atts[i] != NULL)
3725 xmlFree((xmlChar *) atts[i]);
3726 }
Owen Taylor3473f882001-02-23 17:55:21 +00003727 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003728
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003729 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003730}
3731
3732/**
3733 * htmlParseEndTag:
3734 * @ctxt: an HTML parser context
3735 *
3736 * parse an end of tag
3737 *
3738 * [42] ETag ::= '</' Name S? '>'
3739 *
3740 * With namespace
3741 *
3742 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003743 *
3744 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003745 */
3746
Daniel Veillardf420ac52001-07-04 16:04:09 +00003747static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003748htmlParseEndTag(htmlParserCtxtPtr ctxt)
3749{
3750 const xmlChar *name;
3751 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003752 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003753
3754 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003755 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3756 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003757 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003758 }
3759 SKIP(2);
3760
3761 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003762 if (name == NULL)
3763 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003764 /*
3765 * We should definitely be at the ending "S? '>'" part
3766 */
3767 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003768 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003769 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3770 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003771 if (ctxt->recovery) {
3772 /*
3773 * We're not at the ending > !!
3774 * Error, unless in recover mode where we search forwards
3775 * until we find a >
3776 */
3777 while (CUR != '\0' && CUR != '>') NEXT;
3778 NEXT;
3779 }
Owen Taylor3473f882001-02-23 17:55:21 +00003780 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003781 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003782
3783 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003784 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3785 * out now.
3786 */
3787 if ((ctxt->depth > 0) &&
3788 (xmlStrEqual(name, BAD_CAST "html") ||
3789 xmlStrEqual(name, BAD_CAST "body") ||
3790 xmlStrEqual(name, BAD_CAST "head"))) {
3791 ctxt->depth--;
3792 return (0);
3793 }
3794
3795 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003796 * If the name read is not one of the element in the parsing stack
3797 * then return, it's just an error.
3798 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003799 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3800 if (xmlStrEqual(name, ctxt->nameTab[i]))
3801 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003802 }
3803 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003804 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3805 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003806 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003807 }
3808
3809
3810 /*
3811 * Check for auto-closure of HTML elements.
3812 */
3813
3814 htmlAutoCloseOnClose(ctxt, name);
3815
3816 /*
3817 * Well formedness constraints, opening and closing must match.
3818 * With the exception that the autoclose may have popped stuff out
3819 * of the stack.
3820 */
3821 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003822 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003823 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3824 "Opening and ending tag mismatch: %s and %s\n",
3825 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003826 }
3827 }
3828
3829 /*
3830 * SAX: End of Tag
3831 */
3832 oldname = ctxt->name;
3833 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003834 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3835 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003836 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003837 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003838 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003839 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003840 }
3841
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003842 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003843}
3844
3845
3846/**
3847 * htmlParseReference:
3848 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003849 *
Owen Taylor3473f882001-02-23 17:55:21 +00003850 * parse and handle entity references in content,
3851 * this will end-up in a call to character() since this is either a
3852 * CharRef, or a predefined entity.
3853 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003854static void
Owen Taylor3473f882001-02-23 17:55:21 +00003855htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003856 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003857 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003858 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003859 if (CUR != '&') return;
3860
3861 if (NXT(1) == '#') {
3862 unsigned int c;
3863 int bits, i = 0;
3864
3865 c = htmlParseCharRef(ctxt);
3866 if (c == 0)
3867 return;
3868
3869 if (c < 0x80) { out[i++]= c; bits= -6; }
3870 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3871 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3872 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003873
Owen Taylor3473f882001-02-23 17:55:21 +00003874 for ( ; bits >= 0; bits-= 6) {
3875 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3876 }
3877 out[i] = 0;
3878
3879 htmlCheckParagraph(ctxt);
3880 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3881 ctxt->sax->characters(ctxt->userData, out, i);
3882 } else {
3883 ent = htmlParseEntityRef(ctxt, &name);
3884 if (name == NULL) {
3885 htmlCheckParagraph(ctxt);
3886 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3887 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3888 return;
3889 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003890 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003891 htmlCheckParagraph(ctxt);
3892 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3893 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3894 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3895 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3896 }
3897 } else {
3898 unsigned int c;
3899 int bits, i = 0;
3900
3901 c = ent->value;
3902 if (c < 0x80)
3903 { out[i++]= c; bits= -6; }
3904 else if (c < 0x800)
3905 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3906 else if (c < 0x10000)
3907 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003908 else
Owen Taylor3473f882001-02-23 17:55:21 +00003909 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003910
Owen Taylor3473f882001-02-23 17:55:21 +00003911 for ( ; bits >= 0; bits-= 6) {
3912 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3913 }
3914 out[i] = 0;
3915
3916 htmlCheckParagraph(ctxt);
3917 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3918 ctxt->sax->characters(ctxt->userData, out, i);
3919 }
Owen Taylor3473f882001-02-23 17:55:21 +00003920 }
3921}
3922
3923/**
3924 * htmlParseContent:
3925 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003926 *
3927 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003928 */
3929
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003930static void
Owen Taylor3473f882001-02-23 17:55:21 +00003931htmlParseContent(htmlParserCtxtPtr ctxt) {
3932 xmlChar *currentNode;
3933 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003934 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003935
3936 currentNode = xmlStrdup(ctxt->name);
3937 depth = ctxt->nameNr;
3938 while (1) {
3939 long cons = ctxt->nbChars;
3940
3941 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02003942
3943 if (ctxt->instate == XML_PARSER_EOF)
3944 break;
3945
Owen Taylor3473f882001-02-23 17:55:21 +00003946 /*
3947 * Our tag or one of it's parent or children is ending.
3948 */
3949 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003950 if (htmlParseEndTag(ctxt) &&
3951 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3952 if (currentNode != NULL)
3953 xmlFree(currentNode);
3954 return;
3955 }
3956 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003957 }
3958
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003959 else if ((CUR == '<') &&
3960 ((IS_ASCII_LETTER(NXT(1))) ||
3961 (NXT(1) == '_') || (NXT(1) == ':'))) {
3962 name = htmlParseHTMLName_nonInvasive(ctxt);
3963 if (name == NULL) {
3964 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3965 "htmlParseStartTag: invalid element name\n",
3966 NULL, NULL);
3967 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003968 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003969 NEXT;
3970
3971 if (currentNode != NULL)
3972 xmlFree(currentNode);
3973 return;
3974 }
3975
3976 if (ctxt->name != NULL) {
3977 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3978 htmlAutoClose(ctxt, name);
3979 continue;
3980 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003981 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003982 }
3983
Owen Taylor3473f882001-02-23 17:55:21 +00003984 /*
3985 * Has this node been popped out during parsing of
3986 * the next element
3987 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003988 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3989 (!xmlStrEqual(currentNode, ctxt->name)))
3990 {
Owen Taylor3473f882001-02-23 17:55:21 +00003991 if (currentNode != NULL) xmlFree(currentNode);
3992 return;
3993 }
3994
Daniel Veillardf9533d12001-03-03 10:04:57 +00003995 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3996 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003997 /*
3998 * Handle SCRIPT/STYLE separately
3999 */
4000 htmlParseScript(ctxt);
4001 } else {
4002 /*
4003 * Sometimes DOCTYPE arrives in the middle of the document
4004 */
4005 if ((CUR == '<') && (NXT(1) == '!') &&
4006 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4007 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4008 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4009 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004010 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4011 "Misplaced DOCTYPE declaration\n",
4012 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004013 htmlParseDocTypeDecl(ctxt);
4014 }
4015
4016 /*
4017 * First case : a comment
4018 */
4019 if ((CUR == '<') && (NXT(1) == '!') &&
4020 (NXT(2) == '-') && (NXT(3) == '-')) {
4021 htmlParseComment(ctxt);
4022 }
4023
4024 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004025 * Second case : a Processing Instruction.
4026 */
4027 else if ((CUR == '<') && (NXT(1) == '?')) {
4028 htmlParsePI(ctxt);
4029 }
4030
4031 /*
4032 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004033 */
4034 else if (CUR == '<') {
4035 htmlParseElement(ctxt);
4036 }
4037
4038 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004039 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004040 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004041 */
4042 else if (CUR == '&') {
4043 htmlParseReference(ctxt);
4044 }
4045
4046 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004047 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004048 */
4049 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004050 htmlAutoCloseOnEnd(ctxt);
4051 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004052 }
4053
4054 /*
4055 * Last case, text. Note that References are handled directly.
4056 */
4057 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004058 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004059 }
4060
4061 if (cons == ctxt->nbChars) {
4062 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004063 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4064 "detected an error in element content\n",
4065 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004066 }
4067 break;
4068 }
4069 }
4070 GROW;
4071 }
4072 if (currentNode != NULL) xmlFree(currentNode);
4073}
4074
4075/**
Daniel Veillard499cc922006-01-18 17:22:35 +00004076 * htmlParseContent:
4077 * @ctxt: an HTML parser context
4078 *
4079 * Parse a content: comment, sub-element, reference or text.
4080 */
4081
4082void
4083__htmlParseContent(void *ctxt) {
4084 if (ctxt != NULL)
4085 htmlParseContent((htmlParserCtxtPtr) ctxt);
4086}
4087
4088/**
Owen Taylor3473f882001-02-23 17:55:21 +00004089 * htmlParseElement:
4090 * @ctxt: an HTML parser context
4091 *
4092 * parse an HTML element, this is highly recursive
4093 *
4094 * [39] element ::= EmptyElemTag | STag content ETag
4095 *
4096 * [41] Attribute ::= Name Eq AttValue
4097 */
4098
4099void
4100htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004101 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004102 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004103 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004104 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004105 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004106 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004107 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004108
Daniel Veillarda03e3652004-11-02 18:45:30 +00004109 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4110 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004111 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004112 return;
4113 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004114
4115 if (ctxt->instate == XML_PARSER_EOF)
4116 return;
4117
Owen Taylor3473f882001-02-23 17:55:21 +00004118 /* Capture start position */
4119 if (ctxt->record_info) {
4120 node_info.begin_pos = ctxt->input->consumed +
4121 (CUR_PTR - ctxt->input->base);
4122 node_info.begin_line = ctxt->input->line;
4123 }
4124
Daniel Veillard597f1c12005-07-03 23:00:18 +00004125 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004126 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004127 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004128 if (CUR == '>')
4129 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004130 return;
4131 }
Owen Taylor3473f882001-02-23 17:55:21 +00004132
4133 /*
4134 * Lookup the info for that element.
4135 */
4136 info = htmlTagLookup(name);
4137 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004138 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4139 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004140 }
4141
4142 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004143 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004144 */
4145 if ((CUR == '/') && (NXT(1) == '>')) {
4146 SKIP(2);
4147 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4148 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004149 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004150 return;
4151 }
4152
4153 if (CUR == '>') {
4154 NEXT;
4155 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004156 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4157 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004158
4159 /*
4160 * end of parsing of this node.
4161 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004162 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004163 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004164 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004165 }
Owen Taylor3473f882001-02-23 17:55:21 +00004166
4167 /*
4168 * Capture end position and add node
4169 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004170 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004171 node_info.end_pos = ctxt->input->consumed +
4172 (CUR_PTR - ctxt->input->base);
4173 node_info.end_line = ctxt->input->line;
4174 node_info.node = ctxt->node;
4175 xmlParserAddNodeInfo(ctxt, &node_info);
4176 }
4177 return;
4178 }
4179
4180 /*
4181 * Check for an Empty Element from DTD definition
4182 */
4183 if ((info != NULL) && (info->empty)) {
4184 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4185 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004186 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004187 return;
4188 }
4189
4190 /*
4191 * Parse the content of the element:
4192 */
4193 currentNode = xmlStrdup(ctxt->name);
4194 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004195 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004196 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004197 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004198 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004199 if (ctxt->nameNr < depth) break;
4200 }
Owen Taylor3473f882001-02-23 17:55:21 +00004201
Owen Taylor3473f882001-02-23 17:55:21 +00004202 /*
4203 * Capture end position and add node
4204 */
4205 if ( currentNode != NULL && ctxt->record_info ) {
4206 node_info.end_pos = ctxt->input->consumed +
4207 (CUR_PTR - ctxt->input->base);
4208 node_info.end_line = ctxt->input->line;
4209 node_info.node = ctxt->node;
4210 xmlParserAddNodeInfo(ctxt, &node_info);
4211 }
William M. Brack76e95df2003-10-18 16:20:14 +00004212 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004213 htmlAutoCloseOnEnd(ctxt);
4214 }
4215
Owen Taylor3473f882001-02-23 17:55:21 +00004216 if (currentNode != NULL)
4217 xmlFree(currentNode);
4218}
4219
4220/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004221 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004222 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004223 *
Owen Taylor3473f882001-02-23 17:55:21 +00004224 * parse an HTML document (and build a tree if using the standard SAX
4225 * interface).
4226 *
4227 * Returns 0, -1 in case of error. the parser context is augmented
4228 * as a result of the parsing.
4229 */
4230
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004231int
Owen Taylor3473f882001-02-23 17:55:21 +00004232htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004233 xmlChar start[4];
4234 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004235 xmlDtdPtr dtd;
4236
Daniel Veillardd0463562001-10-13 09:15:48 +00004237 xmlInitParser();
4238
Owen Taylor3473f882001-02-23 17:55:21 +00004239 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004240
Daniel Veillarda03e3652004-11-02 18:45:30 +00004241 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4242 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4243 "htmlParseDocument: context error\n", NULL, NULL);
4244 return(XML_ERR_INTERNAL_ERROR);
4245 }
4246 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004247 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004248 GROW;
4249 /*
4250 * SAX: beginning of the document processing.
4251 */
4252 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4253 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4254
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004255 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4256 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4257 /*
4258 * Get the 4 first bytes and decode the charset
4259 * if enc != XML_CHAR_ENCODING_NONE
4260 * plug some encoding conversion routines.
4261 */
4262 start[0] = RAW;
4263 start[1] = NXT(1);
4264 start[2] = NXT(2);
4265 start[3] = NXT(3);
4266 enc = xmlDetectCharEncoding(&start[0], 4);
4267 if (enc != XML_CHAR_ENCODING_NONE) {
4268 xmlSwitchEncoding(ctxt, enc);
4269 }
4270 }
4271
Owen Taylor3473f882001-02-23 17:55:21 +00004272 /*
4273 * Wipe out everything which is before the first '<'
4274 */
4275 SKIP_BLANKS;
4276 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004277 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004278 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004279 }
4280
4281 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4282 ctxt->sax->startDocument(ctxt->userData);
4283
4284
4285 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004286 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004287 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004288 while (((CUR == '<') && (NXT(1) == '!') &&
4289 (NXT(2) == '-') && (NXT(3) == '-')) ||
4290 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004291 htmlParseComment(ctxt);
4292 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004293 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004294 }
Owen Taylor3473f882001-02-23 17:55:21 +00004295
4296
4297 /*
4298 * Then possibly doc type declaration(s) and more Misc
4299 * (doctypedecl Misc*)?
4300 */
4301 if ((CUR == '<') && (NXT(1) == '!') &&
4302 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4303 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4304 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4305 (UPP(8) == 'E')) {
4306 htmlParseDocTypeDecl(ctxt);
4307 }
4308 SKIP_BLANKS;
4309
4310 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004311 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004312 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004313 while (((CUR == '<') && (NXT(1) == '!') &&
4314 (NXT(2) == '-') && (NXT(3) == '-')) ||
4315 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004316 htmlParseComment(ctxt);
4317 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004318 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004319 }
Owen Taylor3473f882001-02-23 17:55:21 +00004320
4321 /*
4322 * Time to start parsing the tree itself
4323 */
4324 htmlParseContent(ctxt);
4325
4326 /*
4327 * autoclose
4328 */
4329 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004330 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004331
4332
4333 /*
4334 * SAX: end of the document processing.
4335 */
4336 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4337 ctxt->sax->endDocument(ctxt->userData);
4338
4339 if (ctxt->myDoc != NULL) {
4340 dtd = xmlGetIntSubset(ctxt->myDoc);
4341 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004342 ctxt->myDoc->intSubset =
4343 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004344 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4345 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4346 }
4347 if (! ctxt->wellFormed) return(-1);
4348 return(0);
4349}
4350
4351
4352/************************************************************************
4353 * *
4354 * Parser contexts handling *
4355 * *
4356 ************************************************************************/
4357
4358/**
William M. Brackedb65a72004-02-06 07:36:04 +00004359 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004360 * @ctxt: an HTML parser context
4361 *
4362 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004363 *
4364 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004365 */
4366
Daniel Veillardf403d292003-10-05 13:51:35 +00004367static int
Owen Taylor3473f882001-02-23 17:55:21 +00004368htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4369{
4370 htmlSAXHandler *sax;
4371
Daniel Veillardf403d292003-10-05 13:51:35 +00004372 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004373 memset(ctxt, 0, sizeof(htmlParserCtxt));
4374
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004375 ctxt->dict = xmlDictCreate();
4376 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004377 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4378 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004379 }
Owen Taylor3473f882001-02-23 17:55:21 +00004380 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4381 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004382 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4383 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004384 }
4385 else
4386 memset(sax, 0, sizeof(htmlSAXHandler));
4387
4388 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004389 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004390 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4391 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004392 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004393 ctxt->inputNr = 0;
4394 ctxt->inputMax = 0;
4395 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004396 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004397 }
4398 ctxt->inputNr = 0;
4399 ctxt->inputMax = 5;
4400 ctxt->input = NULL;
4401 ctxt->version = NULL;
4402 ctxt->encoding = NULL;
4403 ctxt->standalone = -1;
4404 ctxt->instate = XML_PARSER_START;
4405
4406 /* Allocate the Node stack */
4407 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4408 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004409 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004410 ctxt->nodeNr = 0;
4411 ctxt->nodeMax = 0;
4412 ctxt->node = NULL;
4413 ctxt->inputNr = 0;
4414 ctxt->inputMax = 0;
4415 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004416 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004417 }
4418 ctxt->nodeNr = 0;
4419 ctxt->nodeMax = 10;
4420 ctxt->node = NULL;
4421
4422 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004423 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004424 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004425 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004426 ctxt->nameNr = 0;
4427 ctxt->nameMax = 10;
4428 ctxt->name = NULL;
4429 ctxt->nodeNr = 0;
4430 ctxt->nodeMax = 0;
4431 ctxt->node = NULL;
4432 ctxt->inputNr = 0;
4433 ctxt->inputMax = 0;
4434 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004435 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004436 }
4437 ctxt->nameNr = 0;
4438 ctxt->nameMax = 10;
4439 ctxt->name = NULL;
4440
Daniel Veillard092643b2003-09-25 14:29:29 +00004441 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004442 else {
4443 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004444 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004445 }
4446 ctxt->userData = ctxt;
4447 ctxt->myDoc = NULL;
4448 ctxt->wellFormed = 1;
4449 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004450 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004451 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004452 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004453 ctxt->vctxt.userData = ctxt;
4454 ctxt->vctxt.error = xmlParserValidityError;
4455 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004456 ctxt->record_info = 0;
4457 ctxt->validate = 0;
4458 ctxt->nbChars = 0;
4459 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004460 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004461 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004462 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004463}
4464
4465/**
4466 * htmlFreeParserCtxt:
4467 * @ctxt: an HTML parser context
4468 *
4469 * Free all the memory used by a parser context. However the parsed
4470 * document in ctxt->myDoc is not freed.
4471 */
4472
4473void
4474htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4475{
4476 xmlFreeParserCtxt(ctxt);
4477}
4478
4479/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004480 * htmlNewParserCtxt:
4481 *
4482 * Allocate and initialize a new parser context.
4483 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004484 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004485 */
4486
Daniel Veillard34c647c2006-09-21 06:53:59 +00004487htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004488htmlNewParserCtxt(void)
4489{
4490 xmlParserCtxtPtr ctxt;
4491
4492 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4493 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004494 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004495 return(NULL);
4496 }
4497 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004498 if (htmlInitParserCtxt(ctxt) < 0) {
4499 htmlFreeParserCtxt(ctxt);
4500 return(NULL);
4501 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004502 return(ctxt);
4503}
4504
4505/**
4506 * htmlCreateMemoryParserCtxt:
4507 * @buffer: a pointer to a char array
4508 * @size: the size of the array
4509 *
4510 * Create a parser context for an HTML in-memory document.
4511 *
4512 * Returns the new parser context or NULL
4513 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004514htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004515htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4516 xmlParserCtxtPtr ctxt;
4517 xmlParserInputPtr input;
4518 xmlParserInputBufferPtr buf;
4519
4520 if (buffer == NULL)
4521 return(NULL);
4522 if (size <= 0)
4523 return(NULL);
4524
4525 ctxt = htmlNewParserCtxt();
4526 if (ctxt == NULL)
4527 return(NULL);
4528
4529 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4530 if (buf == NULL) return(NULL);
4531
4532 input = xmlNewInputStream(ctxt);
4533 if (input == NULL) {
4534 xmlFreeParserCtxt(ctxt);
4535 return(NULL);
4536 }
4537
4538 input->filename = NULL;
4539 input->buf = buf;
4540 input->base = input->buf->buffer->content;
4541 input->cur = input->buf->buffer->content;
4542 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4543
4544 inputPush(ctxt, input);
4545 return(ctxt);
4546}
4547
4548/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004549 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004550 * @cur: a pointer to an array of xmlChar
4551 * @encoding: a free form C string describing the HTML document encoding, or NULL
4552 *
4553 * Create a parser context for an HTML document.
4554 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004555 * TODO: check the need to add encoding handling there
4556 *
Owen Taylor3473f882001-02-23 17:55:21 +00004557 * Returns the new parser context or NULL
4558 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004559static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004560htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004561 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004562 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004563
Daniel Veillard1d995272002-07-22 16:43:32 +00004564 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004565 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004566 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004567 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004568 if (ctxt == NULL)
4569 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004570
4571 if (encoding != NULL) {
4572 xmlCharEncoding enc;
4573 xmlCharEncodingHandlerPtr handler;
4574
4575 if (ctxt->input->encoding != NULL)
4576 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004577 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004578
4579 enc = xmlParseCharEncoding(encoding);
4580 /*
4581 * registered set of known encodings
4582 */
4583 if (enc != XML_CHAR_ENCODING_ERROR) {
4584 xmlSwitchEncoding(ctxt, enc);
4585 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004586 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02004587 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00004588 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004589 }
4590 } else {
4591 /*
4592 * fallback for unknown encodings
4593 */
4594 handler = xmlFindCharEncodingHandler((const char *) encoding);
4595 if (handler != NULL) {
4596 xmlSwitchToEncoding(ctxt, handler);
4597 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004598 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4599 "Unsupported encoding %s\n",
4600 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004601 }
4602 }
4603 }
4604 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004605}
4606
Daniel Veillard73b013f2003-09-30 12:36:01 +00004607#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004608/************************************************************************
4609 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02004610 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00004611 * *
4612 ************************************************************************/
4613
4614/**
4615 * htmlParseLookupSequence:
4616 * @ctxt: an HTML parser context
4617 * @first: the first char to lookup
4618 * @next: the next char to lookup or zero
4619 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004620 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004621 *
4622 * Try to find if a sequence (first, next, third) or just (first next) or
4623 * (first) is available in the input stream.
4624 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4625 * to avoid rescanning sequences of bytes, it DOES change the state of the
4626 * parser, do not use liberally.
4627 * This is basically similar to xmlParseLookupSequence()
4628 *
4629 * Returns the index to the current parsing point if the full sequence
4630 * is available, -1 otherwise.
4631 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004632static int
Owen Taylor3473f882001-02-23 17:55:21 +00004633htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02004634 xmlChar next, xmlChar third, int iscomment,
Daniel Veillardeeb99322009-08-25 14:42:16 +02004635 int ignoreattrval)
4636{
Owen Taylor3473f882001-02-23 17:55:21 +00004637 int base, len;
4638 htmlParserInputPtr in;
4639 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004640 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02004641 int invalue = 0;
4642 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00004643
4644 in = ctxt->input;
Daniel Veillardeeb99322009-08-25 14:42:16 +02004645 if (in == NULL)
4646 return (-1);
4647
Owen Taylor3473f882001-02-23 17:55:21 +00004648 base = in->cur - in->base;
Daniel Veillardeeb99322009-08-25 14:42:16 +02004649 if (base < 0)
4650 return (-1);
4651
Owen Taylor3473f882001-02-23 17:55:21 +00004652 if (ctxt->checkIndex > base)
4653 base = ctxt->checkIndex;
Daniel Veillardeeb99322009-08-25 14:42:16 +02004654
Owen Taylor3473f882001-02-23 17:55:21 +00004655 if (in->buf == NULL) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02004656 buf = in->base;
4657 len = in->length;
Owen Taylor3473f882001-02-23 17:55:21 +00004658 } else {
Daniel Veillardeeb99322009-08-25 14:42:16 +02004659 buf = in->buf->buffer->content;
4660 len = in->buf->buffer->use;
Owen Taylor3473f882001-02-23 17:55:21 +00004661 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02004662
Owen Taylor3473f882001-02-23 17:55:21 +00004663 /* take into account the sequence length */
Daniel Veillardeeb99322009-08-25 14:42:16 +02004664 if (third)
4665 len -= 2;
4666 else if (next)
4667 len--;
4668 for (; base < len; base++) {
4669 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
4670 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4671 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4672 incomment = 1;
4673 /* do not increment past <! - some people use <!--> */
4674 base += 2;
4675 }
4676 }
4677 if (ignoreattrval) {
4678 if (buf[base] == '"' || buf[base] == '\'') {
4679 if (invalue) {
4680 if (buf[base] == valdellim) {
4681 invalue = 0;
4682 continue;
4683 }
4684 } else {
4685 valdellim = buf[base];
4686 invalue = 1;
4687 continue;
4688 }
4689 } else if (invalue) {
4690 continue;
4691 }
4692 }
4693 if (incomment) {
4694 if (base + 3 > len)
4695 return (-1);
4696 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4697 (buf[base + 2] == '>')) {
4698 incomment = 0;
4699 base += 2;
4700 }
4701 continue;
4702 }
Owen Taylor3473f882001-02-23 17:55:21 +00004703 if (buf[base] == first) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02004704 if (third != 0) {
4705 if ((buf[base + 1] != next) || (buf[base + 2] != third))
4706 continue;
4707 } else if (next != 0) {
4708 if (buf[base + 1] != next)
4709 continue;
4710 }
4711 ctxt->checkIndex = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004712#ifdef DEBUG_PUSH
Daniel Veillardeeb99322009-08-25 14:42:16 +02004713 if (next == 0)
4714 xmlGenericError(xmlGenericErrorContext,
4715 "HPP: lookup '%c' found at %d\n",
4716 first, base);
4717 else if (third == 0)
4718 xmlGenericError(xmlGenericErrorContext,
4719 "HPP: lookup '%c%c' found at %d\n",
4720 first, next, base);
4721 else
4722 xmlGenericError(xmlGenericErrorContext,
4723 "HPP: lookup '%c%c%c' found at %d\n",
4724 first, next, third, base);
Owen Taylor3473f882001-02-23 17:55:21 +00004725#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02004726 return (base - (in->cur - in->base));
4727 }
Owen Taylor3473f882001-02-23 17:55:21 +00004728 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02004729 if ((!incomment) && (!invalue))
4730 ctxt->checkIndex = base;
Owen Taylor3473f882001-02-23 17:55:21 +00004731#ifdef DEBUG_PUSH
4732 if (next == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02004733 xmlGenericError(xmlGenericErrorContext,
4734 "HPP: lookup '%c' failed\n", first);
Owen Taylor3473f882001-02-23 17:55:21 +00004735 else if (third == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02004736 xmlGenericError(xmlGenericErrorContext,
4737 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02004738 else
Daniel Veillardeeb99322009-08-25 14:42:16 +02004739 xmlGenericError(xmlGenericErrorContext,
4740 "HPP: lookup '%c%c%c' failed\n", first, next,
4741 third);
Owen Taylor3473f882001-02-23 17:55:21 +00004742#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02004743 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004744}
4745
4746/**
Markus Kull56a03032009-08-24 19:00:23 +02004747 * htmlParseLookupChars:
4748 * @ctxt: an HTML parser context
4749 * @stop: Array of chars, which stop the lookup.
4750 * @stopLen: Length of stop-Array
4751 *
4752 * Try to find if any char of the stop-Array is available in the input
4753 * stream.
4754 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4755 * to avoid rescanning sequences of bytes, it DOES change the state of the
4756 * parser, do not use liberally.
4757 *
4758 * Returns the index to the current parsing point if a stopChar
4759 * is available, -1 otherwise.
4760 */
4761static int
4762htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
4763 int stopLen)
4764{
4765 int base, len;
4766 htmlParserInputPtr in;
4767 const xmlChar *buf;
4768 int incomment = 0;
4769 int i;
4770
4771 in = ctxt->input;
4772 if (in == NULL)
4773 return (-1);
4774
4775 base = in->cur - in->base;
4776 if (base < 0)
4777 return (-1);
4778
4779 if (ctxt->checkIndex > base)
4780 base = ctxt->checkIndex;
4781
4782 if (in->buf == NULL) {
4783 buf = in->base;
4784 len = in->length;
4785 } else {
4786 buf = in->buf->buffer->content;
4787 len = in->buf->buffer->use;
4788 }
4789
4790 for (; base < len; base++) {
4791 if (!incomment && (base + 4 < len)) {
4792 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4793 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4794 incomment = 1;
4795 /* do not increment past <! - some people use <!--> */
4796 base += 2;
4797 }
4798 }
4799 if (incomment) {
4800 if (base + 3 > len)
4801 return (-1);
4802 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4803 (buf[base + 2] == '>')) {
4804 incomment = 0;
4805 base += 2;
4806 }
4807 continue;
4808 }
4809 for (i = 0; i < stopLen; ++i) {
4810 if (buf[base] == stop[i]) {
4811 ctxt->checkIndex = 0;
4812 return (base - (in->cur - in->base));
4813 }
4814 }
4815 }
4816 ctxt->checkIndex = base;
4817 return (-1);
4818}
4819
4820/**
Owen Taylor3473f882001-02-23 17:55:21 +00004821 * htmlParseTryOrFinish:
4822 * @ctxt: an HTML parser context
4823 * @terminate: last chunk indicator
4824 *
4825 * Try to progress on parsing
4826 *
4827 * Returns zero if no parsing was possible
4828 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004829static int
Owen Taylor3473f882001-02-23 17:55:21 +00004830htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4831 int ret = 0;
4832 htmlParserInputPtr in;
4833 int avail = 0;
4834 xmlChar cur, next;
4835
4836#ifdef DEBUG_PUSH
4837 switch (ctxt->instate) {
4838 case XML_PARSER_EOF:
4839 xmlGenericError(xmlGenericErrorContext,
4840 "HPP: try EOF\n"); break;
4841 case XML_PARSER_START:
4842 xmlGenericError(xmlGenericErrorContext,
4843 "HPP: try START\n"); break;
4844 case XML_PARSER_MISC:
4845 xmlGenericError(xmlGenericErrorContext,
4846 "HPP: try MISC\n");break;
4847 case XML_PARSER_COMMENT:
4848 xmlGenericError(xmlGenericErrorContext,
4849 "HPP: try COMMENT\n");break;
4850 case XML_PARSER_PROLOG:
4851 xmlGenericError(xmlGenericErrorContext,
4852 "HPP: try PROLOG\n");break;
4853 case XML_PARSER_START_TAG:
4854 xmlGenericError(xmlGenericErrorContext,
4855 "HPP: try START_TAG\n");break;
4856 case XML_PARSER_CONTENT:
4857 xmlGenericError(xmlGenericErrorContext,
4858 "HPP: try CONTENT\n");break;
4859 case XML_PARSER_CDATA_SECTION:
4860 xmlGenericError(xmlGenericErrorContext,
4861 "HPP: try CDATA_SECTION\n");break;
4862 case XML_PARSER_END_TAG:
4863 xmlGenericError(xmlGenericErrorContext,
4864 "HPP: try END_TAG\n");break;
4865 case XML_PARSER_ENTITY_DECL:
4866 xmlGenericError(xmlGenericErrorContext,
4867 "HPP: try ENTITY_DECL\n");break;
4868 case XML_PARSER_ENTITY_VALUE:
4869 xmlGenericError(xmlGenericErrorContext,
4870 "HPP: try ENTITY_VALUE\n");break;
4871 case XML_PARSER_ATTRIBUTE_VALUE:
4872 xmlGenericError(xmlGenericErrorContext,
4873 "HPP: try ATTRIBUTE_VALUE\n");break;
4874 case XML_PARSER_DTD:
4875 xmlGenericError(xmlGenericErrorContext,
4876 "HPP: try DTD\n");break;
4877 case XML_PARSER_EPILOG:
4878 xmlGenericError(xmlGenericErrorContext,
4879 "HPP: try EPILOG\n");break;
4880 case XML_PARSER_PI:
4881 xmlGenericError(xmlGenericErrorContext,
4882 "HPP: try PI\n");break;
4883 case XML_PARSER_SYSTEM_LITERAL:
4884 xmlGenericError(xmlGenericErrorContext,
4885 "HPP: try SYSTEM_LITERAL\n");break;
4886 }
4887#endif
4888
4889 while (1) {
4890
4891 in = ctxt->input;
4892 if (in == NULL) break;
4893 if (in->buf == NULL)
4894 avail = in->length - (in->cur - in->base);
4895 else
4896 avail = in->buf->buffer->use - (in->cur - in->base);
4897 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004898 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004899 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004900 /*
4901 * SAX: end of the document processing.
4902 */
4903 ctxt->instate = XML_PARSER_EOF;
4904 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4905 ctxt->sax->endDocument(ctxt->userData);
4906 }
4907 }
4908 if (avail < 1)
4909 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004910 cur = in->cur[0];
4911 if (cur == 0) {
4912 SKIP(1);
4913 continue;
4914 }
4915
Owen Taylor3473f882001-02-23 17:55:21 +00004916 switch (ctxt->instate) {
4917 case XML_PARSER_EOF:
4918 /*
4919 * Document parsing is done !
4920 */
4921 goto done;
4922 case XML_PARSER_START:
4923 /*
4924 * Very first chars read from the document flow.
4925 */
4926 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004927 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004928 SKIP_BLANKS;
4929 if (in->buf == NULL)
4930 avail = in->length - (in->cur - in->base);
4931 else
4932 avail = in->buf->buffer->use - (in->cur - in->base);
4933 }
4934 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4935 ctxt->sax->setDocumentLocator(ctxt->userData,
4936 &xmlDefaultSAXLocator);
4937 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4938 (!ctxt->disableSAX))
4939 ctxt->sax->startDocument(ctxt->userData);
4940
4941 cur = in->cur[0];
4942 next = in->cur[1];
4943 if ((cur == '<') && (next == '!') &&
4944 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4945 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4946 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4947 (UPP(8) == 'E')) {
4948 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004949 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004950 goto done;
4951#ifdef DEBUG_PUSH
4952 xmlGenericError(xmlGenericErrorContext,
4953 "HPP: Parsing internal subset\n");
4954#endif
4955 htmlParseDocTypeDecl(ctxt);
4956 ctxt->instate = XML_PARSER_PROLOG;
4957#ifdef DEBUG_PUSH
4958 xmlGenericError(xmlGenericErrorContext,
4959 "HPP: entering PROLOG\n");
4960#endif
4961 } else {
4962 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004963#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004964 xmlGenericError(xmlGenericErrorContext,
4965 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004966#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004967 }
Owen Taylor3473f882001-02-23 17:55:21 +00004968 break;
4969 case XML_PARSER_MISC:
4970 SKIP_BLANKS;
4971 if (in->buf == NULL)
4972 avail = in->length - (in->cur - in->base);
4973 else
4974 avail = in->buf->buffer->use - (in->cur - in->base);
4975 if (avail < 2)
4976 goto done;
4977 cur = in->cur[0];
4978 next = in->cur[1];
4979 if ((cur == '<') && (next == '!') &&
4980 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4981 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004982 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004983 goto done;
4984#ifdef DEBUG_PUSH
4985 xmlGenericError(xmlGenericErrorContext,
4986 "HPP: Parsing Comment\n");
4987#endif
4988 htmlParseComment(ctxt);
4989 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004990 } else if ((cur == '<') && (next == '?')) {
4991 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004992 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004993 goto done;
4994#ifdef DEBUG_PUSH
4995 xmlGenericError(xmlGenericErrorContext,
4996 "HPP: Parsing PI\n");
4997#endif
4998 htmlParsePI(ctxt);
4999 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005000 } else if ((cur == '<') && (next == '!') &&
5001 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5002 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5003 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5004 (UPP(8) == 'E')) {
5005 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005006 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005007 goto done;
5008#ifdef DEBUG_PUSH
5009 xmlGenericError(xmlGenericErrorContext,
5010 "HPP: Parsing internal subset\n");
5011#endif
5012 htmlParseDocTypeDecl(ctxt);
5013 ctxt->instate = XML_PARSER_PROLOG;
5014#ifdef DEBUG_PUSH
5015 xmlGenericError(xmlGenericErrorContext,
5016 "HPP: entering PROLOG\n");
5017#endif
5018 } else if ((cur == '<') && (next == '!') &&
5019 (avail < 9)) {
5020 goto done;
5021 } else {
5022 ctxt->instate = XML_PARSER_START_TAG;
5023#ifdef DEBUG_PUSH
5024 xmlGenericError(xmlGenericErrorContext,
5025 "HPP: entering START_TAG\n");
5026#endif
5027 }
5028 break;
5029 case XML_PARSER_PROLOG:
5030 SKIP_BLANKS;
5031 if (in->buf == NULL)
5032 avail = in->length - (in->cur - in->base);
5033 else
5034 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02005035 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00005036 goto done;
5037 cur = in->cur[0];
5038 next = in->cur[1];
5039 if ((cur == '<') && (next == '!') &&
5040 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5041 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005042 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005043 goto done;
5044#ifdef DEBUG_PUSH
5045 xmlGenericError(xmlGenericErrorContext,
5046 "HPP: Parsing Comment\n");
5047#endif
5048 htmlParseComment(ctxt);
5049 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005050 } else if ((cur == '<') && (next == '?')) {
5051 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005052 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005053 goto done;
5054#ifdef DEBUG_PUSH
5055 xmlGenericError(xmlGenericErrorContext,
5056 "HPP: Parsing PI\n");
5057#endif
5058 htmlParsePI(ctxt);
5059 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005060 } else if ((cur == '<') && (next == '!') &&
5061 (avail < 4)) {
5062 goto done;
5063 } else {
5064 ctxt->instate = XML_PARSER_START_TAG;
5065#ifdef DEBUG_PUSH
5066 xmlGenericError(xmlGenericErrorContext,
5067 "HPP: entering START_TAG\n");
5068#endif
5069 }
5070 break;
5071 case XML_PARSER_EPILOG:
5072 if (in->buf == NULL)
5073 avail = in->length - (in->cur - in->base);
5074 else
5075 avail = in->buf->buffer->use - (in->cur - in->base);
5076 if (avail < 1)
5077 goto done;
5078 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005079 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005080 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005081 goto done;
5082 }
5083 if (avail < 2)
5084 goto done;
5085 next = in->cur[1];
5086 if ((cur == '<') && (next == '!') &&
5087 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5088 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005089 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005090 goto done;
5091#ifdef DEBUG_PUSH
5092 xmlGenericError(xmlGenericErrorContext,
5093 "HPP: Parsing Comment\n");
5094#endif
5095 htmlParseComment(ctxt);
5096 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005097 } else if ((cur == '<') && (next == '?')) {
5098 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005099 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005100 goto done;
5101#ifdef DEBUG_PUSH
5102 xmlGenericError(xmlGenericErrorContext,
5103 "HPP: Parsing PI\n");
5104#endif
5105 htmlParsePI(ctxt);
5106 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005107 } else if ((cur == '<') && (next == '!') &&
5108 (avail < 4)) {
5109 goto done;
5110 } else {
5111 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005112 ctxt->wellFormed = 0;
5113 ctxt->instate = XML_PARSER_EOF;
5114#ifdef DEBUG_PUSH
5115 xmlGenericError(xmlGenericErrorContext,
5116 "HPP: entering EOF\n");
5117#endif
5118 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5119 ctxt->sax->endDocument(ctxt->userData);
5120 goto done;
5121 }
5122 break;
5123 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005124 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005125 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005126 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005127
5128 if (avail < 2)
5129 goto done;
5130 cur = in->cur[0];
5131 if (cur != '<') {
5132 ctxt->instate = XML_PARSER_CONTENT;
5133#ifdef DEBUG_PUSH
5134 xmlGenericError(xmlGenericErrorContext,
5135 "HPP: entering CONTENT\n");
5136#endif
5137 break;
5138 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005139 if (in->cur[1] == '/') {
5140 ctxt->instate = XML_PARSER_END_TAG;
5141 ctxt->checkIndex = 0;
5142#ifdef DEBUG_PUSH
5143 xmlGenericError(xmlGenericErrorContext,
5144 "HPP: entering END_TAG\n");
5145#endif
5146 break;
5147 }
Owen Taylor3473f882001-02-23 17:55:21 +00005148 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005149 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005150 goto done;
5151
Daniel Veillard597f1c12005-07-03 23:00:18 +00005152 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005153 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005154 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005155 (name == NULL)) {
5156 if (CUR == '>')
5157 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005158 break;
5159 }
Owen Taylor3473f882001-02-23 17:55:21 +00005160
5161 /*
5162 * Lookup the info for that element.
5163 */
5164 info = htmlTagLookup(name);
5165 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005166 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5167 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005168 }
5169
5170 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005171 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005172 */
5173 if ((CUR == '/') && (NXT(1) == '>')) {
5174 SKIP(2);
5175 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5176 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005177 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005178 ctxt->instate = XML_PARSER_CONTENT;
5179#ifdef DEBUG_PUSH
5180 xmlGenericError(xmlGenericErrorContext,
5181 "HPP: entering CONTENT\n");
5182#endif
5183 break;
5184 }
5185
5186 if (CUR == '>') {
5187 NEXT;
5188 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005189 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5190 "Couldn't find end of Start Tag %s\n",
5191 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005192
5193 /*
5194 * end of parsing of this node.
5195 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005196 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005197 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005198 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005199 }
Owen Taylor3473f882001-02-23 17:55:21 +00005200
5201 ctxt->instate = XML_PARSER_CONTENT;
5202#ifdef DEBUG_PUSH
5203 xmlGenericError(xmlGenericErrorContext,
5204 "HPP: entering CONTENT\n");
5205#endif
5206 break;
5207 }
5208
5209 /*
5210 * Check for an Empty Element from DTD definition
5211 */
5212 if ((info != NULL) && (info->empty)) {
5213 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5214 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005215 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005216 }
5217 ctxt->instate = XML_PARSER_CONTENT;
5218#ifdef DEBUG_PUSH
5219 xmlGenericError(xmlGenericErrorContext,
5220 "HPP: entering CONTENT\n");
5221#endif
5222 break;
5223 }
5224 case XML_PARSER_CONTENT: {
5225 long cons;
5226 /*
5227 * Handle preparsed entities and charRef
5228 */
5229 if (ctxt->token != 0) {
5230 xmlChar chr[2] = { 0 , 0 } ;
5231
5232 chr[0] = (xmlChar) ctxt->token;
5233 htmlCheckParagraph(ctxt);
5234 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5235 ctxt->sax->characters(ctxt->userData, chr, 1);
5236 ctxt->token = 0;
5237 ctxt->checkIndex = 0;
5238 }
5239 if ((avail == 1) && (terminate)) {
5240 cur = in->cur[0];
5241 if ((cur != '<') && (cur != '&')) {
5242 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005243 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005244 if (ctxt->sax->ignorableWhitespace != NULL)
5245 ctxt->sax->ignorableWhitespace(
5246 ctxt->userData, &cur, 1);
5247 } else {
5248 htmlCheckParagraph(ctxt);
5249 if (ctxt->sax->characters != NULL)
5250 ctxt->sax->characters(
5251 ctxt->userData, &cur, 1);
5252 }
5253 }
5254 ctxt->token = 0;
5255 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005256 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005257 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005258 }
Owen Taylor3473f882001-02-23 17:55:21 +00005259 }
5260 if (avail < 2)
5261 goto done;
5262 cur = in->cur[0];
5263 next = in->cur[1];
5264 cons = ctxt->nbChars;
5265 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5266 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5267 /*
5268 * Handle SCRIPT/STYLE separately
5269 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005270 if (!terminate) {
5271 int idx;
5272 xmlChar val;
5273
Jiri Netolicky446e1262009-08-07 17:05:36 +02005274 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
Daniel Veillard68716a72006-10-16 09:32:17 +00005275 if (idx < 0)
5276 goto done;
5277 val = in->cur[idx + 2];
5278 if (val == 0) /* bad cut of input */
5279 goto done;
5280 }
Owen Taylor3473f882001-02-23 17:55:21 +00005281 htmlParseScript(ctxt);
5282 if ((cur == '<') && (next == '/')) {
5283 ctxt->instate = XML_PARSER_END_TAG;
5284 ctxt->checkIndex = 0;
5285#ifdef DEBUG_PUSH
5286 xmlGenericError(xmlGenericErrorContext,
5287 "HPP: entering END_TAG\n");
5288#endif
5289 break;
5290 }
5291 } else {
5292 /*
5293 * Sometimes DOCTYPE arrives in the middle of the document
5294 */
5295 if ((cur == '<') && (next == '!') &&
5296 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5297 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5298 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5299 (UPP(8) == 'E')) {
5300 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005301 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005302 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005303 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5304 "Misplaced DOCTYPE declaration\n",
5305 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005306 htmlParseDocTypeDecl(ctxt);
5307 } else if ((cur == '<') && (next == '!') &&
5308 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5309 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005310 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005311 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005312 goto done;
5313#ifdef DEBUG_PUSH
5314 xmlGenericError(xmlGenericErrorContext,
5315 "HPP: Parsing Comment\n");
5316#endif
5317 htmlParseComment(ctxt);
5318 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005319 } else if ((cur == '<') && (next == '?')) {
5320 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005321 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005322 goto done;
5323#ifdef DEBUG_PUSH
5324 xmlGenericError(xmlGenericErrorContext,
5325 "HPP: Parsing PI\n");
5326#endif
5327 htmlParsePI(ctxt);
5328 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005329 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5330 goto done;
5331 } else if ((cur == '<') && (next == '/')) {
5332 ctxt->instate = XML_PARSER_END_TAG;
5333 ctxt->checkIndex = 0;
5334#ifdef DEBUG_PUSH
5335 xmlGenericError(xmlGenericErrorContext,
5336 "HPP: entering END_TAG\n");
5337#endif
5338 break;
5339 } else if (cur == '<') {
5340 ctxt->instate = XML_PARSER_START_TAG;
5341 ctxt->checkIndex = 0;
5342#ifdef DEBUG_PUSH
5343 xmlGenericError(xmlGenericErrorContext,
5344 "HPP: entering START_TAG\n");
5345#endif
5346 break;
5347 } else if (cur == '&') {
5348 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005349 (htmlParseLookupChars(ctxt,
5350 BAD_CAST "; >/", 4) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005351 goto done;
5352#ifdef DEBUG_PUSH
5353 xmlGenericError(xmlGenericErrorContext,
5354 "HPP: Parsing Reference\n");
5355#endif
5356 /* TODO: check generation of subtrees if noent !!! */
5357 htmlParseReference(ctxt);
5358 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005359 /*
5360 * check that the text sequence is complete
5361 * before handing out the data to the parser
5362 * to avoid problems with erroneous end of
5363 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005364 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005365 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005366 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005367 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005368 ctxt->checkIndex = 0;
5369#ifdef DEBUG_PUSH
5370 xmlGenericError(xmlGenericErrorContext,
5371 "HPP: Parsing char data\n");
5372#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005373 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005374 }
5375 }
5376 if (cons == ctxt->nbChars) {
5377 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005378 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5379 "detected an error in element content\n",
5380 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005381 }
5382 NEXT;
5383 break;
5384 }
5385
5386 break;
5387 }
5388 case XML_PARSER_END_TAG:
5389 if (avail < 2)
5390 goto done;
5391 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005392 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005393 goto done;
5394 htmlParseEndTag(ctxt);
5395 if (ctxt->nameNr == 0) {
5396 ctxt->instate = XML_PARSER_EPILOG;
5397 } else {
5398 ctxt->instate = XML_PARSER_CONTENT;
5399 }
5400 ctxt->checkIndex = 0;
5401#ifdef DEBUG_PUSH
5402 xmlGenericError(xmlGenericErrorContext,
5403 "HPP: entering CONTENT\n");
5404#endif
5405 break;
5406 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005407 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5408 "HPP: internal error, state == CDATA\n",
5409 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005410 ctxt->instate = XML_PARSER_CONTENT;
5411 ctxt->checkIndex = 0;
5412#ifdef DEBUG_PUSH
5413 xmlGenericError(xmlGenericErrorContext,
5414 "HPP: entering CONTENT\n");
5415#endif
5416 break;
5417 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005418 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5419 "HPP: internal error, state == DTD\n",
5420 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005421 ctxt->instate = XML_PARSER_CONTENT;
5422 ctxt->checkIndex = 0;
5423#ifdef DEBUG_PUSH
5424 xmlGenericError(xmlGenericErrorContext,
5425 "HPP: entering CONTENT\n");
5426#endif
5427 break;
5428 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005429 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5430 "HPP: internal error, state == COMMENT\n",
5431 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005432 ctxt->instate = XML_PARSER_CONTENT;
5433 ctxt->checkIndex = 0;
5434#ifdef DEBUG_PUSH
5435 xmlGenericError(xmlGenericErrorContext,
5436 "HPP: entering CONTENT\n");
5437#endif
5438 break;
5439 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005440 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5441 "HPP: internal error, state == PI\n",
5442 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005443 ctxt->instate = XML_PARSER_CONTENT;
5444 ctxt->checkIndex = 0;
5445#ifdef DEBUG_PUSH
5446 xmlGenericError(xmlGenericErrorContext,
5447 "HPP: entering CONTENT\n");
5448#endif
5449 break;
5450 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005451 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5452 "HPP: internal error, state == ENTITY_DECL\n",
5453 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005454 ctxt->instate = XML_PARSER_CONTENT;
5455 ctxt->checkIndex = 0;
5456#ifdef DEBUG_PUSH
5457 xmlGenericError(xmlGenericErrorContext,
5458 "HPP: entering CONTENT\n");
5459#endif
5460 break;
5461 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005462 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5463 "HPP: internal error, state == ENTITY_VALUE\n",
5464 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005465 ctxt->instate = XML_PARSER_CONTENT;
5466 ctxt->checkIndex = 0;
5467#ifdef DEBUG_PUSH
5468 xmlGenericError(xmlGenericErrorContext,
5469 "HPP: entering DTD\n");
5470#endif
5471 break;
5472 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005473 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5474 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5475 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005476 ctxt->instate = XML_PARSER_START_TAG;
5477 ctxt->checkIndex = 0;
5478#ifdef DEBUG_PUSH
5479 xmlGenericError(xmlGenericErrorContext,
5480 "HPP: entering START_TAG\n");
5481#endif
5482 break;
5483 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005484 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5485 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5486 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005487 ctxt->instate = XML_PARSER_CONTENT;
5488 ctxt->checkIndex = 0;
5489#ifdef DEBUG_PUSH
5490 xmlGenericError(xmlGenericErrorContext,
5491 "HPP: entering CONTENT\n");
5492#endif
5493 break;
5494 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005495 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5496 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5497 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005498 ctxt->instate = XML_PARSER_CONTENT;
5499 ctxt->checkIndex = 0;
5500#ifdef DEBUG_PUSH
5501 xmlGenericError(xmlGenericErrorContext,
5502 "HPP: entering CONTENT\n");
5503#endif
5504 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005505 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005506 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5507 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5508 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005509 ctxt->instate = XML_PARSER_CONTENT;
5510 ctxt->checkIndex = 0;
5511#ifdef DEBUG_PUSH
5512 xmlGenericError(xmlGenericErrorContext,
5513 "HPP: entering CONTENT\n");
5514#endif
5515 break;
5516
Owen Taylor3473f882001-02-23 17:55:21 +00005517 }
5518 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005519done:
Owen Taylor3473f882001-02-23 17:55:21 +00005520 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005521 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005522 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005523 /*
5524 * SAX: end of the document processing.
5525 */
5526 ctxt->instate = XML_PARSER_EOF;
5527 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5528 ctxt->sax->endDocument(ctxt->userData);
5529 }
5530 }
5531 if ((ctxt->myDoc != NULL) &&
5532 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5533 (ctxt->instate == XML_PARSER_EPILOG))) {
5534 xmlDtdPtr dtd;
5535 dtd = xmlGetIntSubset(ctxt->myDoc);
5536 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02005537 ctxt->myDoc->intSubset =
5538 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005539 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5540 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5541 }
5542#ifdef DEBUG_PUSH
5543 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5544#endif
5545 return(ret);
5546}
5547
5548/**
Owen Taylor3473f882001-02-23 17:55:21 +00005549 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005550 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005551 * @chunk: an char array
5552 * @size: the size in byte of the chunk
5553 * @terminate: last chunk indicator
5554 *
5555 * Parse a Chunk of memory
5556 *
5557 * Returns zero if no error, the xmlParserErrors otherwise.
5558 */
5559int
5560htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5561 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005562 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5563 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5564 "htmlParseChunk: context error\n", NULL, NULL);
5565 return(XML_ERR_INTERNAL_ERROR);
5566 }
Owen Taylor3473f882001-02-23 17:55:21 +00005567 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5568 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5569 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5570 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005571 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02005572
5573 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00005574 if (res < 0) {
5575 ctxt->errNo = XML_PARSER_EOF;
5576 ctxt->disableSAX = 1;
5577 return (XML_PARSER_EOF);
5578 }
Owen Taylor3473f882001-02-23 17:55:21 +00005579 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5580 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005581 ctxt->input->end =
5582 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005583#ifdef DEBUG_PUSH
5584 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5585#endif
5586
Daniel Veillard14f752c2003-08-09 11:44:50 +00005587#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005588 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5589 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005590#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005591 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005592 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5593 xmlParserInputBufferPtr in = ctxt->input->buf;
5594 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5595 (in->raw != NULL)) {
5596 int nbchars;
Daniel Veillarde77db162009-08-22 11:32:38 +02005597
Daniel Veillard14f752c2003-08-09 11:44:50 +00005598 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5599 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005600 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5601 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005602 return(XML_ERR_INVALID_ENCODING);
5603 }
5604 }
5605 }
Owen Taylor3473f882001-02-23 17:55:21 +00005606 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005607 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005608 if (terminate) {
5609 if ((ctxt->instate != XML_PARSER_EOF) &&
5610 (ctxt->instate != XML_PARSER_EPILOG) &&
5611 (ctxt->instate != XML_PARSER_MISC)) {
5612 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005613 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02005614 }
Owen Taylor3473f882001-02-23 17:55:21 +00005615 if (ctxt->instate != XML_PARSER_EOF) {
5616 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5617 ctxt->sax->endDocument(ctxt->userData);
5618 }
5619 ctxt->instate = XML_PARSER_EOF;
5620 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005621 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00005622}
5623
5624/************************************************************************
5625 * *
5626 * User entry points *
5627 * *
5628 ************************************************************************/
5629
5630/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005631 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005632 * @sax: a SAX handler
5633 * @user_data: The user data returned on SAX callbacks
5634 * @chunk: a pointer to an array of chars
5635 * @size: number of chars in the array
5636 * @filename: an optional file name or URI
5637 * @enc: an optional encoding
5638 *
5639 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005640 * The value of @filename is used for fetching external entities
5641 * and error/warning reports.
5642 *
5643 * Returns the new parser context or NULL
5644 */
5645htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005646htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00005647 const char *chunk, int size, const char *filename,
5648 xmlCharEncoding enc) {
5649 htmlParserCtxtPtr ctxt;
5650 htmlParserInputPtr inputStream;
5651 xmlParserInputBufferPtr buf;
5652
Daniel Veillardd0463562001-10-13 09:15:48 +00005653 xmlInitParser();
5654
Owen Taylor3473f882001-02-23 17:55:21 +00005655 buf = xmlAllocParserInputBuffer(enc);
5656 if (buf == NULL) return(NULL);
5657
Daniel Veillardf403d292003-10-05 13:51:35 +00005658 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005659 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005660 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005661 return(NULL);
5662 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005663 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5664 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005665 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005666 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005667 xmlFree(ctxt->sax);
5668 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5669 if (ctxt->sax == NULL) {
5670 xmlFree(buf);
5671 xmlFree(ctxt);
5672 return(NULL);
5673 }
5674 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5675 if (user_data != NULL)
5676 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02005677 }
Owen Taylor3473f882001-02-23 17:55:21 +00005678 if (filename == NULL) {
5679 ctxt->directory = NULL;
5680 } else {
5681 ctxt->directory = xmlParserGetDirectory(filename);
5682 }
5683
5684 inputStream = htmlNewInputStream(ctxt);
5685 if (inputStream == NULL) {
5686 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005687 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005688 return(NULL);
5689 }
5690
5691 if (filename == NULL)
5692 inputStream->filename = NULL;
5693 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005694 inputStream->filename = (char *)
5695 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005696 inputStream->buf = buf;
5697 inputStream->base = inputStream->buf->buffer->content;
5698 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillarde77db162009-08-22 11:32:38 +02005699 inputStream->end =
Daniel Veillard5f704af2003-03-05 10:01:43 +00005700 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005701
5702 inputPush(ctxt, inputStream);
5703
5704 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02005705 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005706 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5707 int cur = ctxt->input->cur - ctxt->input->base;
5708
Daniel Veillarde77db162009-08-22 11:32:38 +02005709 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005710
5711 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5712 ctxt->input->cur = ctxt->input->base + cur;
5713 ctxt->input->end =
5714 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005715#ifdef DEBUG_PUSH
5716 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5717#endif
5718 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005719 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005720
5721 return(ctxt);
5722}
William M. Brack21e4ef22005-01-02 09:53:13 +00005723#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005724
5725/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005726 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005727 * @cur: a pointer to an array of xmlChar
5728 * @encoding: a free form C string describing the HTML document encoding, or NULL
5729 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005730 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005731 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005732 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5733 * to handle parse events. If sax is NULL, fallback to the default DOM
5734 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005735 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005736 * Returns the resulting document tree unless SAX is NULL or the document is
5737 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005738 */
5739
5740htmlDocPtr
5741htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5742 htmlDocPtr ret;
5743 htmlParserCtxtPtr ctxt;
5744
Daniel Veillardd0463562001-10-13 09:15:48 +00005745 xmlInitParser();
5746
Owen Taylor3473f882001-02-23 17:55:21 +00005747 if (cur == NULL) return(NULL);
5748
5749
5750 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5751 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02005752 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005753 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005754 ctxt->sax = sax;
5755 ctxt->userData = userData;
5756 }
5757
5758 htmlParseDocument(ctxt);
5759 ret = ctxt->myDoc;
5760 if (sax != NULL) {
5761 ctxt->sax = NULL;
5762 ctxt->userData = NULL;
5763 }
5764 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005765
Owen Taylor3473f882001-02-23 17:55:21 +00005766 return(ret);
5767}
5768
5769/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005770 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005771 * @cur: a pointer to an array of xmlChar
5772 * @encoding: a free form C string describing the HTML document encoding, or NULL
5773 *
5774 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005775 *
Owen Taylor3473f882001-02-23 17:55:21 +00005776 * Returns the resulting document tree
5777 */
5778
5779htmlDocPtr
5780htmlParseDoc(xmlChar *cur, const char *encoding) {
5781 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5782}
5783
5784
5785/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005786 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005787 * @filename: the filename
5788 * @encoding: a free form C string describing the HTML document encoding, or NULL
5789 *
Daniel Veillarde77db162009-08-22 11:32:38 +02005790 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00005791 * Automatic support for ZLIB/Compress compressed document is provided
5792 * by default if found at compile-time.
5793 *
5794 * Returns the new parser context or NULL
5795 */
5796htmlParserCtxtPtr
5797htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5798{
5799 htmlParserCtxtPtr ctxt;
5800 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005801 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005802 /* htmlCharEncoding enc; */
5803 xmlChar *content, *content_line = (xmlChar *) "charset=";
5804
Daniel Veillarda03e3652004-11-02 18:45:30 +00005805 if (filename == NULL)
5806 return(NULL);
5807
Daniel Veillardf403d292003-10-05 13:51:35 +00005808 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005809 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005810 return(NULL);
5811 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005812 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5813 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005814#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005815 if (xmlDefaultSAXHandler.error != NULL) {
5816 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5817 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005818#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005819 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005820 return(NULL);
5821 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005822
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005823 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5824 xmlFree(canonicFilename);
5825 if (inputStream == NULL) {
5826 xmlFreeParserCtxt(ctxt);
5827 return(NULL);
5828 }
Owen Taylor3473f882001-02-23 17:55:21 +00005829
5830 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005831
Owen Taylor3473f882001-02-23 17:55:21 +00005832 /* set encoding */
5833 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005834 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Daniel Veillarde77db162009-08-22 11:32:38 +02005835 if (content) {
Owen Taylor3473f882001-02-23 17:55:21 +00005836 strcpy ((char *)content, (char *)content_line);
5837 strcat ((char *)content, (char *)encoding);
5838 htmlCheckEncoding (ctxt, content);
5839 xmlFree (content);
5840 }
5841 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005842
Owen Taylor3473f882001-02-23 17:55:21 +00005843 return(ctxt);
5844}
5845
5846/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005847 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005848 * @filename: the filename
5849 * @encoding: a free form C string describing the HTML document encoding, or NULL
5850 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005851 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005852 *
5853 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5854 * compressed document is provided by default if found at compile-time.
5855 * It use the given SAX function block to handle the parsing callback.
5856 * If sax is NULL, fallback to the default DOM tree building routines.
5857 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005858 * Returns the resulting document tree unless SAX is NULL or the document is
5859 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005860 */
5861
5862htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005863htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00005864 void *userData) {
5865 htmlDocPtr ret;
5866 htmlParserCtxtPtr ctxt;
5867 htmlSAXHandlerPtr oldsax = NULL;
5868
Daniel Veillardd0463562001-10-13 09:15:48 +00005869 xmlInitParser();
5870
Owen Taylor3473f882001-02-23 17:55:21 +00005871 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5872 if (ctxt == NULL) return(NULL);
5873 if (sax != NULL) {
5874 oldsax = ctxt->sax;
5875 ctxt->sax = sax;
5876 ctxt->userData = userData;
5877 }
5878
5879 htmlParseDocument(ctxt);
5880
5881 ret = ctxt->myDoc;
5882 if (sax != NULL) {
5883 ctxt->sax = oldsax;
5884 ctxt->userData = NULL;
5885 }
5886 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005887
Owen Taylor3473f882001-02-23 17:55:21 +00005888 return(ret);
5889}
5890
5891/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005892 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005893 * @filename: the filename
5894 * @encoding: a free form C string describing the HTML document encoding, or NULL
5895 *
5896 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5897 * compressed document is provided by default if found at compile-time.
5898 *
5899 * Returns the resulting document tree
5900 */
5901
5902htmlDocPtr
5903htmlParseFile(const char *filename, const char *encoding) {
5904 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5905}
5906
5907/**
5908 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02005909 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00005910 *
5911 * Set and return the previous value for handling HTML omitted tags.
5912 *
5913 * Returns the last value for 0 for no handling, 1 for auto insertion.
5914 */
5915
5916int
5917htmlHandleOmittedElem(int val) {
5918 int old = htmlOmittedDefaultValue;
5919
5920 htmlOmittedDefaultValue = val;
5921 return(old);
5922}
5923
Daniel Veillard930dfb62003-02-05 10:17:38 +00005924/**
5925 * htmlElementAllowedHere:
5926 * @parent: HTML parent element
5927 * @elt: HTML element
5928 *
5929 * Checks whether an HTML element may be a direct child of a parent element.
5930 * Note - doesn't check for deprecated elements
5931 *
5932 * Returns 1 if allowed; 0 otherwise.
5933 */
5934int
5935htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5936 const char** p ;
5937
5938 if ( ! elt || ! parent || ! parent->subelts )
5939 return 0 ;
5940
5941 for ( p = parent->subelts; *p; ++p )
5942 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5943 return 1 ;
5944
5945 return 0 ;
5946}
5947/**
5948 * htmlElementStatusHere:
5949 * @parent: HTML parent element
5950 * @elt: HTML element
5951 *
5952 * Checks whether an HTML element may be a direct child of a parent element.
5953 * and if so whether it is valid or deprecated.
5954 *
5955 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5956 */
5957htmlStatus
5958htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5959 if ( ! parent || ! elt )
5960 return HTML_INVALID ;
5961 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5962 return HTML_INVALID ;
5963
5964 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5965}
5966/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005967 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005968 * @elt: HTML element
5969 * @attr: HTML attribute
5970 * @legacy: whether to allow deprecated attributes
5971 *
5972 * Checks whether an attribute is valid for an element
5973 * Has full knowledge of Required and Deprecated attributes
5974 *
5975 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5976 */
5977htmlStatus
5978htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5979 const char** p ;
5980
5981 if ( !elt || ! attr )
5982 return HTML_INVALID ;
5983
5984 if ( elt->attrs_req )
5985 for ( p = elt->attrs_req; *p; ++p)
5986 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5987 return HTML_REQUIRED ;
5988
5989 if ( elt->attrs_opt )
5990 for ( p = elt->attrs_opt; *p; ++p)
5991 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5992 return HTML_VALID ;
5993
5994 if ( legacy && elt->attrs_depr )
5995 for ( p = elt->attrs_depr; *p; ++p)
5996 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5997 return HTML_DEPRECATED ;
5998
5999 return HTML_INVALID ;
6000}
6001/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006002 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00006003 * @node: an htmlNodePtr in a tree
6004 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00006005 * for Element nodes)
6006 *
6007 * Checks whether the tree node is valid. Experimental (the author
6008 * only uses the HTML enhancements in a SAX parser)
6009 *
6010 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6011 * legacy allowed) or htmlElementStatusHere (otherwise).
6012 * for Attribute nodes, a return from htmlAttrAllowed
6013 * for other nodes, HTML_NA (no checks performed)
6014 */
6015htmlStatus
6016htmlNodeStatus(const htmlNodePtr node, int legacy) {
6017 if ( ! node )
6018 return HTML_INVALID ;
6019
6020 switch ( node->type ) {
6021 case XML_ELEMENT_NODE:
6022 return legacy
6023 ? ( htmlElementAllowedHere (
6024 htmlTagLookup(node->parent->name) , node->name
6025 ) ? HTML_VALID : HTML_INVALID )
6026 : htmlElementStatusHere(
6027 htmlTagLookup(node->parent->name) ,
6028 htmlTagLookup(node->name) )
6029 ;
6030 case XML_ATTRIBUTE_NODE:
6031 return htmlAttrAllowed(
6032 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6033 default: return HTML_NA ;
6034 }
6035}
Daniel Veillard9475a352003-09-26 12:47:50 +00006036/************************************************************************
6037 * *
6038 * New set (2.6.0) of simpler and more flexible APIs *
6039 * *
6040 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is expanded.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and cannot capture an else following the call site.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6052
6053/**
6054 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00006055 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00006056 *
6057 * Reset a parser context
6058 */
6059void
6060htmlCtxtReset(htmlParserCtxtPtr ctxt)
6061{
6062 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00006063 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02006064
Daniel Veillarda03e3652004-11-02 18:45:30 +00006065 if (ctxt == NULL)
6066 return;
6067
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006068 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00006069 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00006070
6071 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6072 xmlFreeInputStream(input);
6073 }
6074 ctxt->inputNr = 0;
6075 ctxt->input = NULL;
6076
6077 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00006078 if (ctxt->spaceTab != NULL) {
6079 ctxt->spaceTab[0] = -1;
6080 ctxt->space = &ctxt->spaceTab[0];
6081 } else {
6082 ctxt->space = NULL;
6083 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006084
6085
6086 ctxt->nodeNr = 0;
6087 ctxt->node = NULL;
6088
6089 ctxt->nameNr = 0;
6090 ctxt->name = NULL;
6091
6092 DICT_FREE(ctxt->version);
6093 ctxt->version = NULL;
6094 DICT_FREE(ctxt->encoding);
6095 ctxt->encoding = NULL;
6096 DICT_FREE(ctxt->directory);
6097 ctxt->directory = NULL;
6098 DICT_FREE(ctxt->extSubURI);
6099 ctxt->extSubURI = NULL;
6100 DICT_FREE(ctxt->extSubSystem);
6101 ctxt->extSubSystem = NULL;
6102 if (ctxt->myDoc != NULL)
6103 xmlFreeDoc(ctxt->myDoc);
6104 ctxt->myDoc = NULL;
6105
6106 ctxt->standalone = -1;
6107 ctxt->hasExternalSubset = 0;
6108 ctxt->hasPErefs = 0;
6109 ctxt->html = 1;
6110 ctxt->external = 0;
6111 ctxt->instate = XML_PARSER_START;
6112 ctxt->token = 0;
6113
6114 ctxt->wellFormed = 1;
6115 ctxt->nsWellFormed = 1;
6116 ctxt->valid = 1;
6117 ctxt->vctxt.userData = ctxt;
6118 ctxt->vctxt.error = xmlParserValidityError;
6119 ctxt->vctxt.warning = xmlParserValidityWarning;
6120 ctxt->record_info = 0;
6121 ctxt->nbChars = 0;
6122 ctxt->checkIndex = 0;
6123 ctxt->inSubset = 0;
6124 ctxt->errNo = XML_ERR_OK;
6125 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006126 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006127 ctxt->catalogs = NULL;
6128 xmlInitNodeInfoSeq(&ctxt->node_seq);
6129
6130 if (ctxt->attsDefault != NULL) {
6131 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6132 ctxt->attsDefault = NULL;
6133 }
6134 if (ctxt->attsSpecial != NULL) {
6135 xmlHashFree(ctxt->attsSpecial, NULL);
6136 ctxt->attsSpecial = NULL;
6137 }
6138}
6139
6140/**
6141 * htmlCtxtUseOptions:
6142 * @ctxt: an HTML parser context
6143 * @options: a combination of htmlParserOption(s)
6144 *
6145 * Applies the options to the parser context
6146 *
6147 * Returns 0 in case of success, the set of unknown or unimplemented options
6148 * in case of error.
6149 */
6150int
6151htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6152{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006153 if (ctxt == NULL)
6154 return(-1);
6155
Daniel Veillard9475a352003-09-26 12:47:50 +00006156 if (options & HTML_PARSE_NOWARNING) {
6157 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006158 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006159 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006160 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006161 }
6162 if (options & HTML_PARSE_NOERROR) {
6163 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006164 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006165 ctxt->sax->fatalError = NULL;
6166 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006167 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006168 }
6169 if (options & HTML_PARSE_PEDANTIC) {
6170 ctxt->pedantic = 1;
6171 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006172 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006173 } else
6174 ctxt->pedantic = 0;
6175 if (options & XML_PARSE_NOBLANKS) {
6176 ctxt->keepBlanks = 0;
6177 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6178 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006179 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006180 } else
6181 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006182 if (options & HTML_PARSE_RECOVER) {
6183 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006184 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006185 } else
6186 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006187 if (options & HTML_PARSE_COMPACT) {
6188 ctxt->options |= HTML_PARSE_COMPACT;
6189 options -= HTML_PARSE_COMPACT;
6190 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006191 if (options & XML_PARSE_HUGE) {
6192 ctxt->options |= XML_PARSE_HUGE;
6193 options -= XML_PARSE_HUGE;
6194 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006195 ctxt->dictNames = 0;
6196 return (options);
6197}
6198
6199/**
6200 * htmlDoRead:
6201 * @ctxt: an HTML parser context
6202 * @URL: the base URL to use for the document
6203 * @encoding: the document encoding, or NULL
6204 * @options: a combination of htmlParserOption(s)
6205 * @reuse: keep the context for reuse
6206 *
6207 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006208 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006209 * Returns the resulting document tree or NULL
6210 */
6211static htmlDocPtr
6212htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6213 int options, int reuse)
6214{
6215 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006216
Daniel Veillard9475a352003-09-26 12:47:50 +00006217 htmlCtxtUseOptions(ctxt, options);
6218 ctxt->html = 1;
6219 if (encoding != NULL) {
6220 xmlCharEncodingHandlerPtr hdlr;
6221
6222 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006223 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006224 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006225 if (ctxt->input->encoding != NULL)
6226 xmlFree((xmlChar *) ctxt->input->encoding);
6227 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6228 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006229 }
6230 if ((URL != NULL) && (ctxt->input != NULL) &&
6231 (ctxt->input->filename == NULL))
6232 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6233 htmlParseDocument(ctxt);
6234 ret = ctxt->myDoc;
6235 ctxt->myDoc = NULL;
6236 if (!reuse) {
6237 if ((ctxt->dictNames) &&
6238 (ret != NULL) &&
6239 (ret->dict == ctxt->dict))
6240 ctxt->dict = NULL;
6241 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006242 }
6243 return (ret);
6244}
6245
6246/**
6247 * htmlReadDoc:
6248 * @cur: a pointer to a zero terminated string
6249 * @URL: the base URL to use for the document
6250 * @encoding: the document encoding, or NULL
6251 * @options: a combination of htmlParserOption(s)
6252 *
6253 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006254 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006255 * Returns the resulting document tree
6256 */
6257htmlDocPtr
6258htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6259{
6260 htmlParserCtxtPtr ctxt;
6261
6262 if (cur == NULL)
6263 return (NULL);
6264
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006265 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006266 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006267 if (ctxt == NULL)
6268 return (NULL);
6269 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6270}
6271
6272/**
6273 * htmlReadFile:
6274 * @filename: a file or URL
6275 * @encoding: the document encoding, or NULL
6276 * @options: a combination of htmlParserOption(s)
6277 *
6278 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006279 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006280 * Returns the resulting document tree
6281 */
6282htmlDocPtr
6283htmlReadFile(const char *filename, const char *encoding, int options)
6284{
6285 htmlParserCtxtPtr ctxt;
6286
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006287 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006288 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6289 if (ctxt == NULL)
6290 return (NULL);
6291 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6292}
6293
6294/**
6295 * htmlReadMemory:
6296 * @buffer: a pointer to a char array
6297 * @size: the size of the array
6298 * @URL: the base URL to use for the document
6299 * @encoding: the document encoding, or NULL
6300 * @options: a combination of htmlParserOption(s)
6301 *
6302 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006303 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006304 * Returns the resulting document tree
6305 */
6306htmlDocPtr
6307htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6308{
6309 htmlParserCtxtPtr ctxt;
6310
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006311 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006312 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6313 if (ctxt == NULL)
6314 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006315 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006316 if (ctxt->sax != NULL)
6317 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006318 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6319}
6320
6321/**
6322 * htmlReadFd:
6323 * @fd: an open file descriptor
6324 * @URL: the base URL to use for the document
6325 * @encoding: the document encoding, or NULL
6326 * @options: a combination of htmlParserOption(s)
6327 *
6328 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006329 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006330 * Returns the resulting document tree
6331 */
6332htmlDocPtr
6333htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6334{
6335 htmlParserCtxtPtr ctxt;
6336 xmlParserInputBufferPtr input;
6337 xmlParserInputPtr stream;
6338
6339 if (fd < 0)
6340 return (NULL);
6341
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006342 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006343 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6344 if (input == NULL)
6345 return (NULL);
6346 ctxt = xmlNewParserCtxt();
6347 if (ctxt == NULL) {
6348 xmlFreeParserInputBuffer(input);
6349 return (NULL);
6350 }
6351 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6352 if (stream == NULL) {
6353 xmlFreeParserInputBuffer(input);
6354 xmlFreeParserCtxt(ctxt);
6355 return (NULL);
6356 }
6357 inputPush(ctxt, stream);
6358 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6359}
6360
6361/**
6362 * htmlReadIO:
6363 * @ioread: an I/O read function
6364 * @ioclose: an I/O close function
6365 * @ioctx: an I/O handler
6366 * @URL: the base URL to use for the document
6367 * @encoding: the document encoding, or NULL
6368 * @options: a combination of htmlParserOption(s)
6369 *
6370 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006371 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006372 * Returns the resulting document tree
6373 */
6374htmlDocPtr
6375htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6376 void *ioctx, const char *URL, const char *encoding, int options)
6377{
6378 htmlParserCtxtPtr ctxt;
6379 xmlParserInputBufferPtr input;
6380 xmlParserInputPtr stream;
6381
6382 if (ioread == NULL)
6383 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006384 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006385
6386 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6387 XML_CHAR_ENCODING_NONE);
6388 if (input == NULL)
6389 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006390 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006391 if (ctxt == NULL) {
6392 xmlFreeParserInputBuffer(input);
6393 return (NULL);
6394 }
6395 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6396 if (stream == NULL) {
6397 xmlFreeParserInputBuffer(input);
6398 xmlFreeParserCtxt(ctxt);
6399 return (NULL);
6400 }
6401 inputPush(ctxt, stream);
6402 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6403}
6404
6405/**
6406 * htmlCtxtReadDoc:
6407 * @ctxt: an HTML parser context
6408 * @cur: a pointer to a zero terminated string
6409 * @URL: the base URL to use for the document
6410 * @encoding: the document encoding, or NULL
6411 * @options: a combination of htmlParserOption(s)
6412 *
6413 * parse an XML in-memory document and build a tree.
6414 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006415 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006416 * Returns the resulting document tree
6417 */
6418htmlDocPtr
6419htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6420 const char *URL, const char *encoding, int options)
6421{
6422 xmlParserInputPtr stream;
6423
6424 if (cur == NULL)
6425 return (NULL);
6426 if (ctxt == NULL)
6427 return (NULL);
6428
6429 htmlCtxtReset(ctxt);
6430
6431 stream = xmlNewStringInputStream(ctxt, cur);
6432 if (stream == NULL) {
6433 return (NULL);
6434 }
6435 inputPush(ctxt, stream);
6436 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6437}
6438
6439/**
6440 * htmlCtxtReadFile:
6441 * @ctxt: an HTML parser context
6442 * @filename: a file or URL
6443 * @encoding: the document encoding, or NULL
6444 * @options: a combination of htmlParserOption(s)
6445 *
6446 * parse an XML file from the filesystem or the network.
6447 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006448 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006449 * Returns the resulting document tree
6450 */
6451htmlDocPtr
6452htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6453 const char *encoding, int options)
6454{
6455 xmlParserInputPtr stream;
6456
6457 if (filename == NULL)
6458 return (NULL);
6459 if (ctxt == NULL)
6460 return (NULL);
6461
6462 htmlCtxtReset(ctxt);
6463
Daniel Veillard29614c72004-11-26 10:47:26 +00006464 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006465 if (stream == NULL) {
6466 return (NULL);
6467 }
6468 inputPush(ctxt, stream);
6469 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6470}
6471
6472/**
6473 * htmlCtxtReadMemory:
6474 * @ctxt: an HTML parser context
6475 * @buffer: a pointer to a char array
6476 * @size: the size of the array
6477 * @URL: the base URL to use for the document
6478 * @encoding: the document encoding, or NULL
6479 * @options: a combination of htmlParserOption(s)
6480 *
6481 * parse an XML in-memory document and build a tree.
6482 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006483 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006484 * Returns the resulting document tree
6485 */
6486htmlDocPtr
6487htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6488 const char *URL, const char *encoding, int options)
6489{
6490 xmlParserInputBufferPtr input;
6491 xmlParserInputPtr stream;
6492
6493 if (ctxt == NULL)
6494 return (NULL);
6495 if (buffer == NULL)
6496 return (NULL);
6497
6498 htmlCtxtReset(ctxt);
6499
6500 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6501 if (input == NULL) {
6502 return(NULL);
6503 }
6504
6505 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6506 if (stream == NULL) {
6507 xmlFreeParserInputBuffer(input);
6508 return(NULL);
6509 }
6510
6511 inputPush(ctxt, stream);
6512 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6513}
6514
6515/**
6516 * htmlCtxtReadFd:
6517 * @ctxt: an HTML parser context
6518 * @fd: an open file descriptor
6519 * @URL: the base URL to use for the document
6520 * @encoding: the document encoding, or NULL
6521 * @options: a combination of htmlParserOption(s)
6522 *
6523 * parse an XML from a file descriptor and build a tree.
6524 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006525 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006526 * Returns the resulting document tree
6527 */
6528htmlDocPtr
6529htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6530 const char *URL, const char *encoding, int options)
6531{
6532 xmlParserInputBufferPtr input;
6533 xmlParserInputPtr stream;
6534
6535 if (fd < 0)
6536 return (NULL);
6537 if (ctxt == NULL)
6538 return (NULL);
6539
6540 htmlCtxtReset(ctxt);
6541
6542
6543 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6544 if (input == NULL)
6545 return (NULL);
6546 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6547 if (stream == NULL) {
6548 xmlFreeParserInputBuffer(input);
6549 return (NULL);
6550 }
6551 inputPush(ctxt, stream);
6552 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6553}
6554
6555/**
6556 * htmlCtxtReadIO:
6557 * @ctxt: an HTML parser context
6558 * @ioread: an I/O read function
6559 * @ioclose: an I/O close function
6560 * @ioctx: an I/O handler
6561 * @URL: the base URL to use for the document
6562 * @encoding: the document encoding, or NULL
6563 * @options: a combination of htmlParserOption(s)
6564 *
6565 * parse an HTML document from I/O functions and source and build a tree.
6566 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006567 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006568 * Returns the resulting document tree
6569 */
6570htmlDocPtr
6571htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6572 xmlInputCloseCallback ioclose, void *ioctx,
6573 const char *URL,
6574 const char *encoding, int options)
6575{
6576 xmlParserInputBufferPtr input;
6577 xmlParserInputPtr stream;
6578
6579 if (ioread == NULL)
6580 return (NULL);
6581 if (ctxt == NULL)
6582 return (NULL);
6583
6584 htmlCtxtReset(ctxt);
6585
6586 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6587 XML_CHAR_ENCODING_NONE);
6588 if (input == NULL)
6589 return (NULL);
6590 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6591 if (stream == NULL) {
6592 xmlFreeParserInputBuffer(input);
6593 return (NULL);
6594 }
6595 inputPush(ctxt, stream);
6596 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6597}
6598
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006599#define bottom_HTMLparser
6600#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006601#endif /* LIBXML_HTML_ENABLED */