blob: b2c9c525314f95dfe18c2af4c6986831c87eef84 [file] [log] [blame]
/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020062 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000063 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200150 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000195 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000198 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000204 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values, otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substrings.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */
236
/*
 * All the macros below expect a parser context variable named "ctxt" to
 * be in scope at the point of use.
 */

/* Current byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance by val bytes, keeping the char and column counters in sync. */
#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), \
     ctxt->input->col += (val))

/* Byte located val positions after the current one. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT() but converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer itself, still usable as an lvalue. */
#define CUR_PTR (ctxt->input->cur)

/*
 * SHRINK and GROW are wrapped in do/while(0) so that they behave as one
 * statement: as bare "if" statements, "if (x) GROW; else ..." would
 * attach the else branch to the macro's own test.
 */
#define SHRINK do { \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input); \
  } while (0)

#define GROW do { \
    if ((ctxt->progressive == 0) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* Current byte, or -1 while a token is pending in the context. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Skip one character made of l bytes: updates line/col from the byte
 * being left, clears any pending token and counts a single char.
 */
#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++; \
  } while (0)
Daniel Veillarde77db162009-08-22 11:32:38 +0200276
Owen Taylor3473f882001-02-23 17:55:21 +0000277/************
278 \
279 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
280 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
281 ************/
282
/* Decode the current character; l receives its length in bytes. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append character v, encoded on l bytes, to buffer b at index i and
 * advance i. Arguments are parenthesized so that compound expressions
 * keep their meaning, and the body is a single statement.
 * Note that i is modified and that l, b, i and v may be evaluated more
 * than once.
 */
#define COPY_BUF(l,b,i,v) do { \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v); \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v)); \
  } while (0)
289
290/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200291 * htmlFindEncoding:
292 * @the HTML parser context
293 *
294 * Ty to find and encoding in the current data available in the input
295 * buffer this is needed to try to switch to the proper encoding when
296 * one face a character error.
297 * That's an heuristic, since it's operating outside of parsing it could
298 * try to use a meta which had been commented out, that's the reason it
299 * should only be used in case of error, not as a default.
300 *
301 * Returns an encoding string or NULL if not found, the string need to
302 * be freed
303 */
304static xmlChar *
305htmlFindEncoding(xmlParserCtxtPtr ctxt) {
306 const xmlChar *start, *cur, *end;
307
308 if ((ctxt == NULL) || (ctxt->input == NULL) ||
309 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
310 (ctxt->input->buf->encoder != NULL))
311 return(NULL);
312 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
313 return(NULL);
314
315 start = ctxt->input->cur;
316 end = ctxt->input->end;
317 /* we also expect the input buffer to be zero terminated */
318 if (*end != 0)
319 return(NULL);
320
321 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
322 if (cur == NULL)
323 return(NULL);
324 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
325 if (cur == NULL)
326 return(NULL);
327 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
328 if (cur == NULL)
329 return(NULL);
330 cur += 8;
331 start = cur;
332 while (((*cur >= 'A') && (*cur <= 'Z')) ||
333 ((*cur >= 'a') && (*cur <= 'z')) ||
334 ((*cur >= '0') && (*cur <= '9')) ||
335 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
336 cur++;
337 if (cur == start)
338 return(NULL);
339 return(xmlStrndup(start, cur - start));
340}
341
342/**
Owen Taylor3473f882001-02-23 17:55:21 +0000343 * htmlCurrentChar:
344 * @ctxt: the HTML parser context
345 * @len: pointer to the length of the char read
346 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000347 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000348 * bytes in the input buffer. Implement the end of line normalization:
349 * 2.11 End-of-Line Handling
350 * If the encoding is unspecified, in the case we find an ISO-Latin-1
351 * char, then the encoding converter is plugged in automatically.
352 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000353 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000354 */
355
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000356static int
Owen Taylor3473f882001-02-23 17:55:21 +0000357htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
358 if (ctxt->instate == XML_PARSER_EOF)
359 return(0);
360
361 if (ctxt->token != 0) {
362 *len = 0;
363 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200364 }
Owen Taylor3473f882001-02-23 17:55:21 +0000365 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
366 /*
367 * We are supposed to handle UTF8, check it's valid
368 * From rfc2044: encoding of the Unicode values on UTF-8:
369 *
370 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
371 * 0000 0000-0000 007F 0xxxxxxx
372 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200373 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000374 *
375 * Check for the 0x110000 limit too
376 */
377 const unsigned char *cur = ctxt->input->cur;
378 unsigned char c;
379 unsigned int val;
380
381 c = *cur;
382 if (c & 0x80) {
383 if (cur[1] == 0)
384 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
385 if ((cur[1] & 0xc0) != 0x80)
386 goto encoding_error;
387 if ((c & 0xe0) == 0xe0) {
388
389 if (cur[2] == 0)
390 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
391 if ((cur[2] & 0xc0) != 0x80)
392 goto encoding_error;
393 if ((c & 0xf0) == 0xf0) {
394 if (cur[3] == 0)
395 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
396 if (((c & 0xf8) != 0xf0) ||
397 ((cur[3] & 0xc0) != 0x80))
398 goto encoding_error;
399 /* 4-byte code */
400 *len = 4;
401 val = (cur[0] & 0x7) << 18;
402 val |= (cur[1] & 0x3f) << 12;
403 val |= (cur[2] & 0x3f) << 6;
404 val |= cur[3] & 0x3f;
405 } else {
406 /* 3-byte code */
407 *len = 3;
408 val = (cur[0] & 0xf) << 12;
409 val |= (cur[1] & 0x3f) << 6;
410 val |= cur[2] & 0x3f;
411 }
412 } else {
413 /* 2-byte code */
414 *len = 2;
415 val = (cur[0] & 0x1f) << 6;
416 val |= cur[1] & 0x3f;
417 }
418 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000419 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
420 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200421 }
Owen Taylor3473f882001-02-23 17:55:21 +0000422 return(val);
423 } else {
424 /* 1-byte code */
425 *len = 1;
426 return((int) *ctxt->input->cur);
427 }
428 }
429 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000430 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000431 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000432 * XML constructs only use < 128 chars
433 */
434 *len = 1;
435 if ((int) *ctxt->input->cur < 0x80)
436 return((int) *ctxt->input->cur);
437
438 /*
439 * Humm this is bad, do an automatic flow conversion
440 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200441 {
442 xmlChar * guess;
443 xmlCharEncodingHandlerPtr handler;
444
445 guess = htmlFindEncoding(ctxt);
446 if (guess == NULL) {
447 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
448 } else {
449 if (ctxt->input->encoding != NULL)
450 xmlFree((xmlChar *) ctxt->input->encoding);
451 ctxt->input->encoding = guess;
452 handler = xmlFindCharEncodingHandler((const char *) guess);
453 if (handler != NULL) {
454 xmlSwitchToEncoding(ctxt, handler);
455 } else {
456 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
457 "Unsupported encoding %s", guess, NULL);
458 }
459 }
460 ctxt->charset = XML_CHAR_ENCODING_UTF8;
461 }
462
Owen Taylor3473f882001-02-23 17:55:21 +0000463 return(xmlCurrentChar(ctxt, len));
464
465encoding_error:
466 /*
467 * If we detect an UTF8 error that probably mean that the
468 * input encoding didn't get properly advertized in the
469 * declaration header. Report the error and switch the encoding
470 * to ISO-Latin-1 (if you don't like this policy, just declare the
471 * encoding !)
472 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000473 {
474 char buffer[150];
475
Daniel Veillard861101d2007-06-12 08:38:57 +0000476 if (ctxt->input->end - ctxt->input->cur >= 4) {
477 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
478 ctxt->input->cur[0], ctxt->input->cur[1],
479 ctxt->input->cur[2], ctxt->input->cur[3]);
480 } else {
481 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
482 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000483 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
484 "Input is not proper UTF-8, indicate encoding !\n",
485 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000486 }
487
Daniel Veillarde77db162009-08-22 11:32:38 +0200488 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000489 *len = 1;
490 return((int) *ctxt->input->cur);
491}
492
493/**
Owen Taylor3473f882001-02-23 17:55:21 +0000494 * htmlSkipBlankChars:
495 * @ctxt: the HTML parser context
496 *
497 * skip all blanks character found at that point in the input streams.
498 *
499 * Returns the number of space chars skipped
500 */
501
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000502static int
Owen Taylor3473f882001-02-23 17:55:21 +0000503htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
504 int res = 0;
505
William M. Brack76e95df2003-10-18 16:20:14 +0000506 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000507 if ((*ctxt->input->cur == 0) &&
508 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
509 xmlPopInput(ctxt);
510 } else {
511 if (*(ctxt->input->cur) == '\n') {
512 ctxt->input->line++; ctxt->input->col = 1;
513 } else ctxt->input->col++;
514 ctxt->input->cur++;
515 ctxt->nbChars++;
516 if (*ctxt->input->cur == 0)
517 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
518 }
519 res++;
520 }
521 return(res);
522}
523
524
525
526/************************************************************************
527 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200528 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000529 * *
530 ************************************************************************/
531
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000544
545/* Definitions and a couple of vars for HTML Elements */
546
/*
 * Element classes, each one a comma separated list of element names
 * meant to be pasted inside an array initializer, together with an NB_xxx
 * macro giving the number of names in the list.
 *
 * Every group pasted into a bigger list must be followed by a comma,
 * otherwise the string literals on both sides of the junction get
 * concatenated into a single bogus name. PCDATA and MODIFIER expand to
 * nothing and therefore contribute no entry.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL
570
571
Daniel Veillard065abe82006-07-03 08:55:04 +0000572static const char* const html_flow[] = { FLOW, NULL } ;
573static const char* const html_inline[] = { INLINE, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000574
575/* placeholders: elts with content but no subelements */
Daniel Veillard065abe82006-07-03 08:55:04 +0000576static const char* const html_pcdata[] = { NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000577#define html_cdata html_pcdata
578
579
580/* ... and for HTML Attributes */
581
/*
 * Attribute classes, each one a comma separated list of attribute names
 * meant to be pasted inside an array initializer, together with an NB_xxx
 * macro giving the number of names in the list.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
Daniel Veillard930dfb62003-02-05 10:17:38 +0000594
Daniel Veillard065abe82006-07-03 08:55:04 +0000595static const char* const html_attrs[] = { ATTRS, NULL } ;
596static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
597static const char* const core_attrs[] = { COREATTRS, NULL } ;
598static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000599
600
601/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000602static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000603 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
604 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000605static const char* const target_attr[] = { "target", NULL } ;
606static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
607static const char* const alt_attr[] = { "alt", NULL } ;
608static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
609static const char* const href_attrs[] = { "href", NULL } ;
610static const char* const clear_attrs[] = { "clear", NULL } ;
611static const char* const inline_p[] = { INLINE, "p", NULL } ;
612
613static const char* const flow_param[] = { FLOW, "param", NULL } ;
614static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000615 "archive", "alt", "name", "height", "width", "align",
616 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000617static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000618 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000619static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000620 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000621static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
622static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
623static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
624static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000625 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000626static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000627 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
628
629
Daniel Veillard065abe82006-07-03 08:55:04 +0000630static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
631static const char* const col_elt[] = { "col", NULL } ;
632static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
633static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
634static const char* const dl_contents[] = { "dt", "dd", NULL } ;
635static const char* const compact_attr[] = { "compact", NULL } ;
636static const char* const label_attr[] = { "label", NULL } ;
637static const char* const fieldset_contents[] = { FLOW, "legend" } ;
638static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
639static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
640static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
641static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
642static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
643static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
644static const char* const head_attrs[] = { I18N, "profile", NULL } ;
645static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
646static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
647static const char* const version_attr[] = { "version", NULL } ;
648static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
649static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
650static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000651static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000652static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
653static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
654static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
655static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
656static const char* const align_attr[] = { "align", NULL } ;
657static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
658static const char* const map_contents[] = { BLOCK, "area", NULL } ;
659static const char* const name_attr[] = { "name", NULL } ;
660static const char* const action_attr[] = { "action", NULL } ;
661static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
662static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
663static const char* const content_attr[] = { "content", NULL } ;
664static const char* const type_attr[] = { "type", NULL } ;
665static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
666static const char* const object_contents[] = { FLOW, "param", NULL } ;
667static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
668static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
669static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
670static const char* const option_elt[] = { "option", NULL } ;
671static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
672static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
673static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
674static const char* const width_attr[] = { "width", NULL } ;
675static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
676static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
677static const char* const language_attr[] = { "language", NULL } ;
678static const char* const select_content[] = { "optgroup", "option", NULL } ;
679static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
680static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200681static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000682static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
683static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
684static const char* const tr_elt[] = { "tr", NULL } ;
685static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
686static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
687static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
688static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
689static const char* const tr_contents[] = { "th", "td", NULL } ;
690static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
691static const char* const li_elt[] = { "li", NULL } ;
692static const char* const ul_depr[] = { "type", "compact", NULL} ;
693static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000694
695#define DECL (const char**)
696
Daniel Veillard22090732001-07-16 00:06:07 +0000697static const htmlElemDesc
698html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000699{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
700 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
701},
702{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
703 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
704},
705{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
706 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
707},
708{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
709 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
710},
711{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
712 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
713},
714{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
715 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
716},
717{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
718 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
719},
720{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
721 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
722},
723{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
724 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
725},
726{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
727 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
728},
729{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
730 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
731},
732{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
733 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
734},
735{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
736 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
737},
738{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
739 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
740},
741{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
742 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
743},
744{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
745 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
746},
747{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
748 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
749},
750{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
751 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
752},
753{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
754 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
755},
756{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
757 EMPTY , NULL , DECL col_attrs , NULL, NULL
758},
759{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
760 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
761},
762{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
763 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
764},
765{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
766 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
767},
768{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
769 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
770},
771{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
772 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
773},
774{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
775 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
776},
777{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000778 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000779},
780{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
781 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
782},
783{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
784 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
785},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000786{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000787 EMPTY, NULL, DECL embed_attrs, NULL, NULL
788},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000789{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
790 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
791},
792{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
793 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
794},
795{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
796 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
797},
798{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
799 EMPTY, NULL, NULL, DECL frame_attrs, NULL
800},
801{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
802 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
803},
804{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
805 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
806},
807{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
808 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
809},
810{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
811 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
812},
813{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
814 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
815},
816{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
817 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
818},
819{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
820 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
821},
822{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
823 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
824},
825{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
826 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
827},
828{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
829 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
830},
831{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
832 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
833},
834{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
835 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
836},
837{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000838 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000839},
840{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
841 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
842},
843{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
844 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
845},
846{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
847 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
848},
849{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
850 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
851},
852{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
853 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
854},
855{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
856 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
857},
858{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
859 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
860},
861{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
862 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
863},
864{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000865 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000866},
867{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
868 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
869},
870{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
871 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
872},
873{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
874 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
875},
876{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
877 DECL html_flow, "div", DECL html_attrs, NULL, NULL
878},
879{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
880 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
881},
882{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
883 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
884},
885{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000886 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000887},
888{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
889 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
890},
891{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
892 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
893},
894{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000895 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000896},
897{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
898 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
899},
900{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
901 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
902},
903{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
904 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
905},
906{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
907 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
908},
909{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
910 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
911},
912{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
913 DECL select_content, NULL, DECL select_attrs, NULL, NULL
914},
915{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
916 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
917},
918{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
919 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
920},
921{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
922 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
923},
924{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
925 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
926},
927{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
928 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
929},
930{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
931 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
932},
933{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
934 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
935},
936{ "table", 0, 0, 0, 0, 0, 0, 0, "",
937 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
938},
939{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
940 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
941},
942{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
943 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
944},
945{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
946 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
947},
948{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
949 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
950},
951{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
952 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
953},
954{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
955 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
956},
957{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
958 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
959},
960{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
961 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
962},
963{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
964 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
965},
966{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
967 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
968},
969{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
970 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
971},
972{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
973 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
974}
Owen Taylor3473f882001-02-23 17:55:21 +0000975};
976
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups. In each group
 * the first string is the tag being opened and the remaining strings are
 * the open elements it implicitly closes. A final lone NULL ends the
 * table.
 */
static const char * const htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "frameset",   "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
1037
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char * const htmlNoContentElements[] = {
    "html", "head", NULL
};
1050
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The form-reset event attribute is spelled "onreset"; the table used
 * to carry the misspelling "onrest", so the real attribute was never
 * matched.
 *
 * NOTE(review): unlike the other string tables here this one has no
 * NULL terminator; presumably it is walked by element count. Do not add
 * a terminator without checking its users.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
1076
/*
 * End-tag priorities, used by the HTML parser to decide what to do with
 * broken pages: an extra end tag is only allowed to close elements
 * whose priority is lower than or equal to its own.
 */

/* One (element name, priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Elements with a non-default priority. The table ends with a
 * NULL-named entry whose priority is the default for any element not
 * listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001104
/*
 * Index over htmlStartClose: each used slot points at the first string
 * of one group; unused slots are NULL. Filled by htmlInitAutoClose().
 */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlStartCloseIndex has been filled in. */
static int htmlStartCloseIndexinitialized = 0;
1107
1108/************************************************************************
1109 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001110 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001111 * *
1112 ************************************************************************/
1113
1114/**
1115 * htmlInitAutoClose:
1116 *
1117 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1118 * This is not reentrant. Call xmlInitParser() once before processing in
1119 * case of use in multithreaded programs.
1120 */
1121void
1122htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001123 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001124
1125 if (htmlStartCloseIndexinitialized) return;
1126
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001127 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1128 indx = 0;
1129 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001130 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001131 while (htmlStartClose[i] != NULL) i++;
1132 i++;
1133 }
1134 htmlStartCloseIndexinitialized = 1;
1135}
1136
1137/**
1138 * htmlTagLookup:
1139 * @tag: The tag name in lowercase
1140 *
1141 * Lookup the HTML tag in the ElementTable
1142 *
1143 * Returns the related htmlElemDescPtr or NULL if not found.
1144 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001145const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001146htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001147 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001148
1149 for (i = 0; i < (sizeof(html40ElementTable) /
1150 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001151 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001152 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001153 }
1154 return(NULL);
1155}
1156
1157/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001158 * htmlGetEndPriority:
1159 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001160 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001161 * Return value: The "endtag" priority.
1162 **/
1163static int
1164htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001165 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001166
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001167 while ((htmlEndPriority[i].name != NULL) &&
1168 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1169 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001170
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001171 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001172}
1173
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001174
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001175/**
Owen Taylor3473f882001-02-23 17:55:21 +00001176 * htmlCheckAutoClose:
1177 * @newtag: The new tag name
1178 * @oldtag: The old tag name
1179 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001180 * Checks whether the new tag is one of the registered valid tags for
1181 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001182 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1183 *
1184 * Returns 0 if no, 1 if yes.
1185 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001186static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1188{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001189 int i, indx;
1190 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001191
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001192 if (htmlStartCloseIndexinitialized == 0)
1193 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001194
1195 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001196 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001197 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001198 if (closed == NULL)
1199 return (0);
1200 if (xmlStrEqual(BAD_CAST * closed, newtag))
1201 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001202 }
1203
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001204 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001205 i++;
1206 while (htmlStartClose[i] != NULL) {
1207 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001208 return (1);
1209 }
1210 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001211 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001212 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001213}
1214
1215/**
1216 * htmlAutoCloseOnClose:
1217 * @ctxt: an HTML parser context
1218 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001219 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001220 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001221 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001222 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001223static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001224htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1225{
1226 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001227 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001228
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001229 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001230
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001231 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001232
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001233 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1234 break;
1235 /*
1236 * A missplaced endtag can only close elements with lower
1237 * or equal priority, so if we find an element with higher
1238 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001239 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001240 */
1241 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1242 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001243 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001244 if (i < 0)
1245 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001246
1247 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001248 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001249 if ((info != NULL) && (info->endTag == 3)) {
1250 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1251 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001252 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001253 }
1254 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1255 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001256 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001257 }
1258}
1259
1260/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001261 * htmlAutoCloseOnEnd:
1262 * @ctxt: an HTML parser context
1263 *
1264 * Close all remaining tags at the end of the stream
1265 */
1266static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001267htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1268{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001269 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001270
William M. Brack899e64a2003-09-26 18:03:42 +00001271 if (ctxt->nameNr == 0)
1272 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001273 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001274 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1275 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001276 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001277 }
1278}
1279
1280/**
Owen Taylor3473f882001-02-23 17:55:21 +00001281 * htmlAutoClose:
1282 * @ctxt: an HTML parser context
1283 * @newtag: The new tag name or NULL
1284 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001285 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001286 * The list is kept in htmlStartClose array. This function is
1287 * called when a new tag has been detected and generates the
1288 * appropriates closes if possible/needed.
1289 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001290 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001291 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001292static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001293htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1294{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001295 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001296 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001297 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1298 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001299 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001300 }
1301 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001302 htmlAutoCloseOnEnd(ctxt);
1303 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001304 }
1305 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001306 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1307 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1308 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001309 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1310 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001311 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001312 }
Owen Taylor3473f882001-02-23 17:55:21 +00001313}
1314
1315/**
1316 * htmlAutoCloseTag:
1317 * @doc: the HTML document
1318 * @name: The tag name
1319 * @elem: the HTML element
1320 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001321 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001322 * The list is kept in htmlStartClose array. This function checks
1323 * if the element or one of it's children would autoclose the
1324 * given tag.
1325 *
1326 * Returns 1 if autoclose, 0 otherwise
1327 */
1328int
1329htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1330 htmlNodePtr child;
1331
1332 if (elem == NULL) return(1);
1333 if (xmlStrEqual(name, elem->name)) return(0);
1334 if (htmlCheckAutoClose(elem->name, name)) return(1);
1335 child = elem->children;
1336 while (child != NULL) {
1337 if (htmlAutoCloseTag(doc, name, child)) return(1);
1338 child = child->next;
1339 }
1340 return(0);
1341}
1342
1343/**
1344 * htmlIsAutoClosed:
1345 * @doc: the HTML document
1346 * @elem: the HTML element
1347 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001348 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001349 * The list is kept in htmlStartClose array. This function checks
1350 * if a tag is autoclosed by one of it's child
1351 *
1352 * Returns 1 if autoclosed, 0 otherwise
1353 */
1354int
1355htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1356 htmlNodePtr child;
1357
1358 if (elem == NULL) return(1);
1359 child = elem->children;
1360 while (child != NULL) {
1361 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1362 child = child->next;
1363 }
1364 return(0);
1365}
1366
1367/**
1368 * htmlCheckImplied:
1369 * @ctxt: an HTML parser context
1370 * @newtag: The new tag name
1371 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001372 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001373 * called when a new tag has been detected and generates the
1374 * appropriates implicit tags if missing
1375 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001376static void
Owen Taylor3473f882001-02-23 17:55:21 +00001377htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1378 if (!htmlOmittedDefaultValue)
1379 return;
1380 if (xmlStrEqual(newtag, BAD_CAST"html"))
1381 return;
1382 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001383 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001384 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1385 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1386 }
1387 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1388 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001389 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001390 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1391 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1392 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1393 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1394 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1395 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02001396 /*
Owen Taylor3473f882001-02-23 17:55:21 +00001397 * dropped OBJECT ... i you put it first BODY will be
1398 * assumed !
1399 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001400 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001401 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1402 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1403 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1404 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1405 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1406 int i;
1407 for (i = 0;i < ctxt->nameNr;i++) {
1408 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1409 return;
1410 }
1411 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1412 return;
1413 }
1414 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001415
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001416 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001417 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1418 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1419 }
1420}
1421
1422/**
1423 * htmlCheckParagraph
1424 * @ctxt: an HTML parser context
1425 *
1426 * Check whether a p element need to be implied before inserting
1427 * characters in the current element.
1428 *
1429 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1430 * in case of error.
1431 */
1432
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001433static int
Owen Taylor3473f882001-02-23 17:55:21 +00001434htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1435 const xmlChar *tag;
1436 int i;
1437
1438 if (ctxt == NULL)
1439 return(-1);
1440 tag = ctxt->name;
1441 if (tag == NULL) {
1442 htmlAutoClose(ctxt, BAD_CAST"p");
1443 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001444 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001445 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1446 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1447 return(1);
1448 }
1449 if (!htmlOmittedDefaultValue)
1450 return(0);
1451 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1452 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001453 htmlAutoClose(ctxt, BAD_CAST"p");
1454 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001455 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001456 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1458 return(1);
1459 }
1460 }
1461 return(0);
1462}
1463
1464/**
1465 * htmlIsScriptAttribute:
1466 * @name: an attribute name
1467 *
1468 * Check if an attribute is of content type Script
1469 *
1470 * Returns 1 is the attribute is a script 0 otherwise
1471 */
1472int
1473htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001474 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001475
1476 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001477 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001478 /*
1479 * all script attributes start with 'on'
1480 */
1481 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001482 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001483 for (i = 0;
1484 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1485 i++) {
1486 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1487 return(1);
1488 }
1489 return(0);
1490}
1491
1492/************************************************************************
1493 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001494 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001495 * *
1496 ************************************************************************/
1497
1498
Daniel Veillard22090732001-07-16 00:06:07 +00001499static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001500/*
1501 * the 4 absolute ones, plus apostrophe.
1502 */
1503{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1504{ 38, "amp", "ampersand, U+0026 ISOnum" },
1505{ 39, "apos", "single quote" },
1506{ 60, "lt", "less-than sign, U+003C ISOnum" },
1507{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1508
1509/*
1510 * A bunch still in the 128-255 range
1511 * Replacing them depend really on the charset used.
1512 */
1513{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1514{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1515{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1516{ 163, "pound","pound sign, U+00A3 ISOnum" },
1517{ 164, "curren","currency sign, U+00A4 ISOnum" },
1518{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1519{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1520{ 167, "sect", "section sign, U+00A7 ISOnum" },
1521{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1522{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1523{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1524{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1525{ 172, "not", "not sign, U+00AC ISOnum" },
1526{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1527{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1528{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1529{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1530{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1531{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1532{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1533{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1534{ 181, "micro","micro sign, U+00B5 ISOnum" },
1535{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1536{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1537{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1538{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1539{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1540{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1541{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1542{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1543{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1544{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1545{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1546{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1547{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1548{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1549{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1550{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1551{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1552{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1553{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1554{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1555{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1556{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1557{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1558{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1559{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1560{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1561{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1562{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1563{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1564{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1565{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1566{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1567{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1568{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1569{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1570{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1571{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1572{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1573{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1574{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1575{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1576{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1577{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1578{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1579{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1580{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1581{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1582{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1583{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1584{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1585{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1586{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1587{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1588{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1589{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1590{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1591{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1592{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1593{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1594{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1595{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1596{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1597{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1598{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1599{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1600{ 247, "divide","division sign, U+00F7 ISOnum" },
1601{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1602{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1603{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1604{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1605{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1606{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1607{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1608{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1609
1610{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1611{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1612{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1613{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1614{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1615
1616/*
1617 * Anything below should really be kept as entities references
1618 */
1619{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1620
1621{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1622{ 732, "tilde","small tilde, U+02DC ISOdia" },
1623
1624{ 913, "Alpha","greek capital letter alpha, U+0391" },
1625{ 914, "Beta", "greek capital letter beta, U+0392" },
1626{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1627{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1628{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1629{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1630{ 919, "Eta", "greek capital letter eta, U+0397" },
1631{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1632{ 921, "Iota", "greek capital letter iota, U+0399" },
1633{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001634{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001635{ 924, "Mu", "greek capital letter mu, U+039C" },
1636{ 925, "Nu", "greek capital letter nu, U+039D" },
1637{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1638{ 927, "Omicron","greek capital letter omicron, U+039F" },
1639{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1640{ 929, "Rho", "greek capital letter rho, U+03A1" },
1641{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1642{ 932, "Tau", "greek capital letter tau, U+03A4" },
1643{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1644{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1645{ 935, "Chi", "greek capital letter chi, U+03A7" },
1646{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1647{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1648
1649{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1650{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1651{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1652{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1653{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1654{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1655{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1656{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1657{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1658{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1659{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1660{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1661{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1662{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1663{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1664{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1665{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1666{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1667{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1668{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1669{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1670{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1671{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1672{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1673{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1674{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1675{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1676{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1677
1678{ 8194, "ensp", "en space, U+2002 ISOpub" },
1679{ 8195, "emsp", "em space, U+2003 ISOpub" },
1680{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1681{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1682{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1683{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1684{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1685{ 8211, "ndash","en dash, U+2013 ISOpub" },
1686{ 8212, "mdash","em dash, U+2014 ISOpub" },
1687{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1688{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1689{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1690{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1691{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1692{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1693{ 8224, "dagger","dagger, U+2020 ISOpub" },
1694{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1695
1696{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1697{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1698
1699{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1700
1701{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1702{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1703
1704{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1705{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1706
1707{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1708{ 8260, "frasl","fraction slash, U+2044 NEW" },
1709
1710{ 8364, "euro", "euro sign, U+20AC NEW" },
1711
1712{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1713{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1714{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1715{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1716{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1717{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1718{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1719{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1720{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1721{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1722{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1723{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1724{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1725{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1726{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1727{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1728
1729{ 8704, "forall","for all, U+2200 ISOtech" },
1730{ 8706, "part", "partial differential, U+2202 ISOtech" },
1731{ 8707, "exist","there exists, U+2203 ISOtech" },
1732{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1733{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1734{ 8712, "isin", "element of, U+2208 ISOtech" },
1735{ 8713, "notin","not an element of, U+2209 ISOtech" },
1736{ 8715, "ni", "contains as member, U+220B ISOtech" },
1737{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001738{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001739{ 8722, "minus","minus sign, U+2212 ISOtech" },
1740{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1741{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1742{ 8733, "prop", "proportional to, U+221D ISOtech" },
1743{ 8734, "infin","infinity, U+221E ISOtech" },
1744{ 8736, "ang", "angle, U+2220 ISOamso" },
1745{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1746{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1747{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1748{ 8746, "cup", "union = cup, U+222A ISOtech" },
1749{ 8747, "int", "integral, U+222B ISOtech" },
1750{ 8756, "there4","therefore, U+2234 ISOtech" },
1751{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1752{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1753{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1754{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1755{ 8801, "equiv","identical to, U+2261 ISOtech" },
1756{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1757{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1758{ 8834, "sub", "subset of, U+2282 ISOtech" },
1759{ 8835, "sup", "superset of, U+2283 ISOtech" },
1760{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1761{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1762{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1763{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1764{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1765{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1766{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1767{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1768{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1769{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1770{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1771{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1772{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1773{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1774
1775{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1776{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1777{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1778{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1779
1780};
1781
1782/************************************************************************
1783 * *
1784 * Commodity functions to handle entities *
1785 * *
1786 ************************************************************************/
1787
/*
 * Macro used to grow the current buffer: doubles <buffer>_size and
 * reallocates <buffer> accordingly.
 *
 * The expansion site must have a variable named ctxt (used for the error
 * report) and a variable named <buffer>_size in scope, and must be inside
 * a function returning a pointer: on allocation failure the old buffer is
 * freed and the enclosing function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {							\
	buffer = tmp;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1802
1803/**
1804 * htmlEntityLookup:
1805 * @name: the entity name
1806 *
1807 * Lookup the given entity in EntitiesTable
1808 *
1809 * TODO: the linear scan is really ugly, an hash table is really needed.
1810 *
1811 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1812 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001813const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001814htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001815 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001816
1817 for (i = 0;i < (sizeof(html40EntitiesTable)/
1818 sizeof(html40EntitiesTable[0]));i++) {
1819 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001820 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001821 }
1822 }
1823 return(NULL);
1824}
1825
1826/**
1827 * htmlEntityValueLookup:
1828 * @value: the entity's unicode value
1829 *
1830 * Lookup the given entity in EntitiesTable
1831 *
1832 * TODO: the linear scan is really ugly, an hash table is really needed.
1833 *
1834 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1835 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001836const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001837htmlEntityValueLookup(unsigned int value) {
1838 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001839
1840 for (i = 0;i < (sizeof(html40EntitiesTable)/
1841 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001842 if (html40EntitiesTable[i].value >= value) {
1843 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001844 break;
William M. Brack78637da2003-07-31 14:47:38 +00001845 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001846 }
Owen Taylor3473f882001-02-23 17:55:21 +00001847 }
1848 return(NULL);
1849}
1850
1851/**
1852 * UTF8ToHtml:
1853 * @out: a pointer to an array of bytes to store the result
1854 * @outlen: the length of @out
1855 * @in: a pointer to an array of UTF-8 chars
1856 * @inlen: the length of @in
1857 *
1858 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1859 * plus HTML entities block of chars out.
1860 *
1861 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1862 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001863 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001864 * The value of @outlen after return is the number of octets consumed.
1865 */
1866int
1867UTF8ToHtml(unsigned char* out, int *outlen,
1868 const unsigned char* in, int *inlen) {
1869 const unsigned char* processed = in;
1870 const unsigned char* outend;
1871 const unsigned char* outstart = out;
1872 const unsigned char* instart = in;
1873 const unsigned char* inend;
1874 unsigned int c, d;
1875 int trailing;
1876
Daniel Veillardce682bc2004-11-05 17:22:25 +00001877 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001878 if (in == NULL) {
1879 /*
1880 * initialization nothing to do
1881 */
1882 *outlen = 0;
1883 *inlen = 0;
1884 return(0);
1885 }
1886 inend = in + (*inlen);
1887 outend = out + (*outlen);
1888 while (in < inend) {
1889 d = *in++;
1890 if (d < 0x80) { c= d; trailing= 0; }
1891 else if (d < 0xC0) {
1892 /* trailing byte in leading position */
1893 *outlen = out - outstart;
1894 *inlen = processed - instart;
1895 return(-2);
1896 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1897 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1898 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1899 else {
1900 /* no chance for this in Ascii */
1901 *outlen = out - outstart;
1902 *inlen = processed - instart;
1903 return(-2);
1904 }
1905
1906 if (inend - in < trailing) {
1907 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02001908 }
Owen Taylor3473f882001-02-23 17:55:21 +00001909
1910 for ( ; trailing; trailing--) {
1911 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1912 break;
1913 c <<= 6;
1914 c |= d & 0x3F;
1915 }
1916
1917 /* assertion: c is a single UTF-4 value */
1918 if (c < 0x80) {
1919 if (out + 1 >= outend)
1920 break;
1921 *out++ = c;
1922 } else {
1923 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001924 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001925 const char *cp;
1926 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001927
1928 /*
1929 * Try to lookup a predefined HTML entity for it
1930 */
1931
1932 ent = htmlEntityValueLookup(c);
1933 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001934 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1935 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001936 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001937 else
1938 cp = ent->name;
1939 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001940 if (out + 2 + len >= outend)
1941 break;
1942 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001943 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001944 out += len;
1945 *out++ = ';';
1946 }
1947 processed = in;
1948 }
1949 *outlen = out - outstart;
1950 *inlen = processed - instart;
1951 return(0);
1952}
1953
1954/**
1955 * htmlEncodeEntities:
1956 * @out: a pointer to an array of bytes to store the result
1957 * @outlen: the length of @out
1958 * @in: a pointer to an array of UTF-8 chars
1959 * @inlen: the length of @in
1960 * @quoteChar: the quote character to escape (' or ") or zero.
1961 *
1962 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1963 * plus HTML entities block of chars out.
1964 *
1965 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1966 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001967 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001968 * The value of @outlen after return is the number of octets consumed.
1969 */
1970int
1971htmlEncodeEntities(unsigned char* out, int *outlen,
1972 const unsigned char* in, int *inlen, int quoteChar) {
1973 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001974 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001975 const unsigned char* outstart = out;
1976 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001977 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001978 unsigned int c, d;
1979 int trailing;
1980
Daniel Veillardce682bc2004-11-05 17:22:25 +00001981 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1982 return(-1);
1983 outend = out + (*outlen);
1984 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001985 while (in < inend) {
1986 d = *in++;
1987 if (d < 0x80) { c= d; trailing= 0; }
1988 else if (d < 0xC0) {
1989 /* trailing byte in leading position */
1990 *outlen = out - outstart;
1991 *inlen = processed - instart;
1992 return(-2);
1993 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1994 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1995 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1996 else {
1997 /* no chance for this in Ascii */
1998 *outlen = out - outstart;
1999 *inlen = processed - instart;
2000 return(-2);
2001 }
2002
2003 if (inend - in < trailing)
2004 break;
2005
2006 while (trailing--) {
2007 if (((d= *in++) & 0xC0) != 0x80) {
2008 *outlen = out - outstart;
2009 *inlen = processed - instart;
2010 return(-2);
2011 }
2012 c <<= 6;
2013 c |= d & 0x3F;
2014 }
2015
2016 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002017 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2018 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002019 if (out >= outend)
2020 break;
2021 *out++ = c;
2022 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002023 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002024 const char *cp;
2025 char nbuf[16];
2026 int len;
2027
2028 /*
2029 * Try to lookup a predefined HTML entity for it
2030 */
2031 ent = htmlEntityValueLookup(c);
2032 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002033 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002034 cp = nbuf;
2035 }
2036 else
2037 cp = ent->name;
2038 len = strlen(cp);
2039 if (out + 2 + len > outend)
2040 break;
2041 *out++ = '&';
2042 memcpy(out, cp, len);
2043 out += len;
2044 *out++ = ';';
2045 }
2046 processed = in;
2047 }
2048 *outlen = out - outstart;
2049 *inlen = processed - instart;
2050 return(0);
2051}
2052
Owen Taylor3473f882001-02-23 17:55:21 +00002053/************************************************************************
2054 * *
2055 * Commodity functions to handle streams *
2056 * *
2057 ************************************************************************/
2058
2059/**
Owen Taylor3473f882001-02-23 17:55:21 +00002060 * htmlNewInputStream:
2061 * @ctxt: an HTML parser context
2062 *
2063 * Create a new input stream structure
2064 * Returns the new input stream or NULL
2065 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002066static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002067htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2068 htmlParserInputPtr input;
2069
2070 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2071 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002072 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002073 return(NULL);
2074 }
2075 memset(input, 0, sizeof(htmlParserInput));
2076 input->filename = NULL;
2077 input->directory = NULL;
2078 input->base = NULL;
2079 input->cur = NULL;
2080 input->buf = NULL;
2081 input->line = 1;
2082 input->col = 1;
2083 input->buf = NULL;
2084 input->free = NULL;
2085 input->version = NULL;
2086 input->consumed = 0;
2087 input->length = 0;
2088 return(input);
2089}
2090
2091
2092/************************************************************************
2093 * *
2094 * Commodity functions, cleanup needed ? *
2095 * *
2096 ************************************************************************/
/*
 * All tags allowing PCDATA according to the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002111
2112/**
2113 * areBlanks:
2114 * @ctxt: an HTML parser context
2115 * @str: a xmlChar *
2116 * @len: the size of @str
2117 *
2118 * Is this a sequence of blank chars that one can ignore ?
2119 *
2120 * Returns 1 if ignorable 0 otherwise.
2121 */
2122
2123static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002124 unsigned int i;
2125 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002126 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002127 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002128
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002129 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002130 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002131
2132 if (CUR == 0) return(1);
2133 if (CUR != '<') return(0);
2134 if (ctxt->name == NULL)
2135 return(1);
2136 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2137 return(1);
2138 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2139 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002140
2141 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2142 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2143 dtd = xmlGetIntSubset(ctxt->myDoc);
2144 if (dtd != NULL && dtd->ExternalID != NULL) {
2145 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2146 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2147 return(1);
2148 }
2149 }
2150
Owen Taylor3473f882001-02-23 17:55:21 +00002151 if (ctxt->node == NULL) return(0);
2152 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002153 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2154 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002155 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002156 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2157 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002158 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002159 for all tags "b" allowing PCDATA */
2160 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2161 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2162 return(0);
2163 }
2164 }
Owen Taylor3473f882001-02-23 17:55:21 +00002165 } else if (xmlNodeIsText(lastChild)) {
2166 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002167 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002168 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002169 for all tags "p" allowing PCDATA */
2170 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2171 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2172 return(0);
2173 }
2174 }
Owen Taylor3473f882001-02-23 17:55:21 +00002175 }
2176 return(1);
2177}
2178
2179/**
Owen Taylor3473f882001-02-23 17:55:21 +00002180 * htmlNewDocNoDtD:
2181 * @URI: URI for the dtd, or NULL
2182 * @ExternalID: the external ID of the DTD, or NULL
2183 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002184 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2185 * are NULL
2186 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002187 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002188 */
2189htmlDocPtr
2190htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2191 xmlDocPtr cur;
2192
2193 /*
2194 * Allocate a new document and fill the fields.
2195 */
2196 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2197 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002198 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002199 return(NULL);
2200 }
2201 memset(cur, 0, sizeof(xmlDoc));
2202
2203 cur->type = XML_HTML_DOCUMENT_NODE;
2204 cur->version = NULL;
2205 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002206 cur->doc = cur;
2207 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002208 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002209 cur->extSubset = NULL;
2210 cur->oldNs = NULL;
2211 cur->encoding = NULL;
2212 cur->standalone = 1;
2213 cur->compression = 0;
2214 cur->ids = NULL;
2215 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002216 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002217 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002218 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002219 if ((ExternalID != NULL) ||
2220 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002221 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002222 return(cur);
2223}
2224
2225/**
2226 * htmlNewDoc:
2227 * @URI: URI for the dtd, or NULL
2228 * @ExternalID: the external ID of the DTD, or NULL
2229 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002230 * Creates a new HTML document
2231 *
Owen Taylor3473f882001-02-23 17:55:21 +00002232 * Returns a new document
2233 */
2234htmlDocPtr
2235htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2236 if ((URI == NULL) && (ExternalID == NULL))
2237 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002238 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2239 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002240
2241 return(htmlNewDocNoDtD(URI, ExternalID));
2242}
2243
2244
2245/************************************************************************
2246 * *
2247 * The parser itself *
2248 * Relates to http://www.w3.org/TR/html40 *
2249 * *
2250 ************************************************************************/
2251
2252/************************************************************************
2253 * *
2254 * The parser itself *
2255 * *
2256 ************************************************************************/
2257
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002258static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002259
Owen Taylor3473f882001-02-23 17:55:21 +00002260/**
2261 * htmlParseHTMLName:
2262 * @ctxt: an HTML parser context
2263 *
2264 * parse an HTML tag or attribute name, note that we convert it to lowercase
2265 * since HTML names are not case-sensitive.
2266 *
2267 * Returns the Tag Name parsed or NULL
2268 */
2269
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002270static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002271htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002272 int i = 0;
2273 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2274
William M. Brackd1757ab2004-10-02 22:07:48 +00002275 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002276 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002277
2278 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002279 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002280 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2281 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002282 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2283 else loc[i] = CUR;
2284 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002285
Owen Taylor3473f882001-02-23 17:55:21 +00002286 NEXT;
2287 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002288
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002289 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002290}
2291
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002292
2293/**
2294 * htmlParseHTMLName_nonInvasive:
2295 * @ctxt: an HTML parser context
2296 *
2297 * parse an HTML tag or attribute name, note that we convert it to lowercase
2298 * since HTML names are not case-sensitive, this doesn't consume the data
2299 * from the stream, it's a look-ahead
2300 *
2301 * Returns the Tag Name parsed or NULL
2302 */
2303
2304static const xmlChar *
2305htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2306 int i = 0;
2307 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2308
2309 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2310 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002311
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002312 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2313 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2314 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2315 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2316 else loc[i] = NXT(1+i);
2317 i++;
2318 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002319
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002320 return(xmlDictLookup(ctxt->dict, loc, i));
2321}
2322
2323
Owen Taylor3473f882001-02-23 17:55:21 +00002324/**
2325 * htmlParseName:
2326 * @ctxt: an HTML parser context
2327 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002328 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002329 *
2330 * Returns the Name parsed or NULL
2331 */
2332
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002333static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002334htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002335 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002336 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002337 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002338
2339 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002340
2341 /*
2342 * Accelerator for simple ASCII names
2343 */
2344 in = ctxt->input->cur;
2345 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2346 ((*in >= 0x41) && (*in <= 0x5A)) ||
2347 (*in == '_') || (*in == ':')) {
2348 in++;
2349 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2350 ((*in >= 0x41) && (*in <= 0x5A)) ||
2351 ((*in >= 0x30) && (*in <= 0x39)) ||
2352 (*in == '_') || (*in == '-') ||
2353 (*in == ':') || (*in == '.'))
2354 in++;
2355 if ((*in > 0) && (*in < 0x80)) {
2356 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002357 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002358 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002359 ctxt->nbChars += count;
2360 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002361 return(ret);
2362 }
2363 }
2364 return(htmlParseNameComplex(ctxt));
2365}
2366
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002367static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002368htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002369 int len = 0, l;
2370 int c;
2371 int count = 0;
2372
2373 /*
2374 * Handler for more complex cases
2375 */
2376 GROW;
2377 c = CUR_CHAR(l);
2378 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2379 (!IS_LETTER(c) && (c != '_') &&
2380 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002381 return(NULL);
2382 }
2383
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002384 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2385 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2386 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002387 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002388 (IS_COMBINING(c)) ||
2389 (IS_EXTENDER(c)))) {
2390 if (count++ > 100) {
2391 count = 0;
2392 GROW;
2393 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002394 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002395 NEXTL(l);
2396 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002397 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002398 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002399}
2400
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002401
Owen Taylor3473f882001-02-23 17:55:21 +00002402/**
2403 * htmlParseHTMLAttribute:
2404 * @ctxt: an HTML parser context
2405 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002406 *
Owen Taylor3473f882001-02-23 17:55:21 +00002407 * parse an HTML attribute value till the stop (quote), if
2408 * stop is 0 then it stops at the first space
2409 *
2410 * Returns the attribute parsed or NULL
2411 */
2412
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002413static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002414htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2415 xmlChar *buffer = NULL;
2416 int buffer_size = 0;
2417 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002418 const xmlChar *name = NULL;
2419 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002420 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002421
2422 /*
2423 * allocate a translation buffer.
2424 */
2425 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002426 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002427 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002428 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002429 return(NULL);
2430 }
2431 out = buffer;
2432
2433 /*
2434 * Ok loop until we reach one of the ending chars
2435 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002436 while ((CUR != 0) && (CUR != stop)) {
2437 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002438 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002439 if (CUR == '&') {
2440 if (NXT(1) == '#') {
2441 unsigned int c;
2442 int bits;
2443
2444 c = htmlParseCharRef(ctxt);
2445 if (c < 0x80)
2446 { *out++ = c; bits= -6; }
2447 else if (c < 0x800)
2448 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2449 else if (c < 0x10000)
2450 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002451 else
Owen Taylor3473f882001-02-23 17:55:21 +00002452 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002453
Owen Taylor3473f882001-02-23 17:55:21 +00002454 for ( ; bits >= 0; bits-= 6) {
2455 *out++ = ((c >> bits) & 0x3F) | 0x80;
2456 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002457
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002458 if (out - buffer > buffer_size - 100) {
2459 int indx = out - buffer;
2460
2461 growBuffer(buffer);
2462 out = &buffer[indx];
2463 }
Owen Taylor3473f882001-02-23 17:55:21 +00002464 } else {
2465 ent = htmlParseEntityRef(ctxt, &name);
2466 if (name == NULL) {
2467 *out++ = '&';
2468 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002469 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002470
2471 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002472 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002473 }
2474 } else if (ent == NULL) {
2475 *out++ = '&';
2476 cur = name;
2477 while (*cur != 0) {
2478 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002479 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002480
2481 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002482 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002483 }
2484 *out++ = *cur++;
2485 }
Owen Taylor3473f882001-02-23 17:55:21 +00002486 } else {
2487 unsigned int c;
2488 int bits;
2489
2490 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002491 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002492
2493 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002494 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002495 }
Daniel Veillard48519092006-10-17 15:56:35 +00002496 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002497 if (c < 0x80)
2498 { *out++ = c; bits= -6; }
2499 else if (c < 0x800)
2500 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2501 else if (c < 0x10000)
2502 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002503 else
Owen Taylor3473f882001-02-23 17:55:21 +00002504 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002505
Owen Taylor3473f882001-02-23 17:55:21 +00002506 for ( ; bits >= 0; bits-= 6) {
2507 *out++ = ((c >> bits) & 0x3F) | 0x80;
2508 }
Owen Taylor3473f882001-02-23 17:55:21 +00002509 }
2510 }
2511 } else {
2512 unsigned int c;
2513 int bits, l;
2514
2515 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002516 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002517
2518 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002519 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002520 }
2521 c = CUR_CHAR(l);
2522 if (c < 0x80)
2523 { *out++ = c; bits= -6; }
2524 else if (c < 0x800)
2525 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2526 else if (c < 0x10000)
2527 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002528 else
Owen Taylor3473f882001-02-23 17:55:21 +00002529 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002530
Owen Taylor3473f882001-02-23 17:55:21 +00002531 for ( ; bits >= 0; bits-= 6) {
2532 *out++ = ((c >> bits) & 0x3F) | 0x80;
2533 }
2534 NEXT;
2535 }
2536 }
2537 *out++ = 0;
2538 return(buffer);
2539}
2540
2541/**
Owen Taylor3473f882001-02-23 17:55:21 +00002542 * htmlParseEntityRef:
2543 * @ctxt: an HTML parser context
2544 * @str: location to store the entity name
2545 *
2546 * parse an HTML ENTITY references
2547 *
2548 * [68] EntityRef ::= '&' Name ';'
2549 *
2550 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2551 * if non-NULL *str will have to be freed by the caller.
2552 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002553const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002554htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2555 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002556 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002557
2558 if (str != NULL) *str = NULL;
2559 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002560
2561 if (CUR == '&') {
2562 NEXT;
2563 name = htmlParseName(ctxt);
2564 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002565 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2566 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002567 } else {
2568 GROW;
2569 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002570 if (str != NULL)
2571 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002572
2573 /*
2574 * Lookup the entity in the table.
2575 */
2576 ent = htmlEntityLookup(name);
2577 if (ent != NULL) /* OK that's ugly !!! */
2578 NEXT;
2579 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002580 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2581 "htmlParseEntityRef: expecting ';'\n",
2582 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002583 if (str != NULL)
2584 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002585 }
2586 }
2587 }
2588 return(ent);
2589}
2590
2591/**
2592 * htmlParseAttValue:
2593 * @ctxt: an HTML parser context
2594 *
2595 * parse a value for an attribute
2596 * Note: the parser won't do substitution of entities here, this
2597 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002598 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002599 *
2600 * Returns the AttValue parsed or NULL.
2601 */
2602
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002603static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002604htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2605 xmlChar *ret = NULL;
2606
2607 if (CUR == '"') {
2608 NEXT;
2609 ret = htmlParseHTMLAttribute(ctxt, '"');
2610 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002611 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2612 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002613 } else
2614 NEXT;
2615 } else if (CUR == '\'') {
2616 NEXT;
2617 ret = htmlParseHTMLAttribute(ctxt, '\'');
2618 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002619 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2620 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002621 } else
2622 NEXT;
2623 } else {
2624 /*
2625 * That's an HTMLism, the attribute value may not be quoted
2626 */
2627 ret = htmlParseHTMLAttribute(ctxt, 0);
2628 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002629 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2630 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002631 }
2632 }
2633 return(ret);
2634}
2635
2636/**
2637 * htmlParseSystemLiteral:
2638 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002639 *
Owen Taylor3473f882001-02-23 17:55:21 +00002640 * parse an HTML Literal
2641 *
2642 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2643 *
2644 * Returns the SystemLiteral parsed or NULL
2645 */
2646
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002647static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002648htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2649 const xmlChar *q;
2650 xmlChar *ret = NULL;
2651
2652 if (CUR == '"') {
2653 NEXT;
2654 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002655 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002656 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002657 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002658 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2659 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002660 } else {
2661 ret = xmlStrndup(q, CUR_PTR - q);
2662 NEXT;
2663 }
2664 } else if (CUR == '\'') {
2665 NEXT;
2666 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002667 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002668 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002669 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002670 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2671 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002672 } else {
2673 ret = xmlStrndup(q, CUR_PTR - q);
2674 NEXT;
2675 }
2676 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002677 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2678 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002679 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002680
Owen Taylor3473f882001-02-23 17:55:21 +00002681 return(ret);
2682}
2683
2684/**
2685 * htmlParsePubidLiteral:
2686 * @ctxt: an HTML parser context
2687 *
2688 * parse an HTML public literal
2689 *
2690 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2691 *
2692 * Returns the PubidLiteral parsed or NULL.
2693 */
2694
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002695static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002696htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2697 const xmlChar *q;
2698 xmlChar *ret = NULL;
2699 /*
2700 * Name ::= (Letter | '_') (NameChar)*
2701 */
2702 if (CUR == '"') {
2703 NEXT;
2704 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002705 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002706 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002707 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2708 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002709 } else {
2710 ret = xmlStrndup(q, CUR_PTR - q);
2711 NEXT;
2712 }
2713 } else if (CUR == '\'') {
2714 NEXT;
2715 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002716 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002717 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002718 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002719 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2720 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002721 } else {
2722 ret = xmlStrndup(q, CUR_PTR - q);
2723 NEXT;
2724 }
2725 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002726 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2727 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002728 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002729
Owen Taylor3473f882001-02-23 17:55:21 +00002730 return(ret);
2731}
2732
2733/**
2734 * htmlParseScript:
2735 * @ctxt: an HTML parser context
2736 *
2737 * parse the content of an HTML SCRIPT or STYLE element
2738 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2739 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2740 * http://www.w3.org/TR/html4/types.html#type-script
2741 * http://www.w3.org/TR/html4/types.html#h-6.15
2742 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2743 *
2744 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2745 * element and the value of intrinsic event attributes. User agents must
2746 * not evaluate script data as HTML markup but instead must pass it on as
2747 * data to a script engine.
2748 * NOTES:
2749 * - The content is passed like CDATA
2750 * - the attributes for style and scripting "onXXX" are also described
2751 * as CDATA but SGML allows entities references in attributes so their
2752 * processing is identical as other attributes
2753 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002754static void
Owen Taylor3473f882001-02-23 17:55:21 +00002755htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002756 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002757 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002758 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002759
2760 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002761 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002762 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002763 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002764 /*
2765 * One should break here, the specification is clear:
2766 * Authors should therefore escape "</" within the content.
2767 * Escape mechanisms are specific to each scripting or
2768 * style sheet language.
2769 *
2770 * In recovery mode, only break if end tag match the
2771 * current tag, effectively ignoring all tags inside the
2772 * script/style block and treating the entire block as
2773 * CDATA.
2774 */
2775 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002776 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2777 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002778 {
2779 break; /* while */
2780 } else {
2781 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002782 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002783 ctxt->name, NULL);
2784 }
2785 } else {
2786 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002787 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002788 {
2789 break; /* while */
2790 }
2791 }
Owen Taylor3473f882001-02-23 17:55:21 +00002792 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002793 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002794 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2795 if (ctxt->sax->cdataBlock!= NULL) {
2796 /*
2797 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2798 */
2799 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002800 } else if (ctxt->sax->characters != NULL) {
2801 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002802 }
2803 nbchar = 0;
2804 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002805 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002806 NEXTL(l);
2807 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002808 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002809
Daniel Veillard68716a72006-10-16 09:32:17 +00002810 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002811 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2812 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002813 NEXT;
2814 }
2815
2816 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2817 if (ctxt->sax->cdataBlock!= NULL) {
2818 /*
2819 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2820 */
2821 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002822 } else if (ctxt->sax->characters != NULL) {
2823 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002824 }
2825 }
2826}
2827
2828
2829/**
2830 * htmlParseCharData:
2831 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002832 *
2833 * parse a CharData section.
2834 * if we are within a CDATA section ']]>' marks an end of section.
2835 *
2836 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2837 */
2838
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002839static void
2840htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002841 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2842 int nbchar = 0;
2843 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002844 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002845
2846 SHRINK;
2847 cur = CUR_CHAR(l);
2848 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002849 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002850 (cur != 0)) {
2851 if (!(IS_CHAR(cur))) {
2852 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2853 "Invalid char in CDATA 0x%X\n", cur);
2854 } else {
2855 COPY_BUF(l,buf,nbchar,cur);
2856 }
Owen Taylor3473f882001-02-23 17:55:21 +00002857 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2858 /*
2859 * Ok the segment is to be consumed as chars.
2860 */
2861 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2862 if (areBlanks(ctxt, buf, nbchar)) {
2863 if (ctxt->sax->ignorableWhitespace != NULL)
2864 ctxt->sax->ignorableWhitespace(ctxt->userData,
2865 buf, nbchar);
2866 } else {
2867 htmlCheckParagraph(ctxt);
2868 if (ctxt->sax->characters != NULL)
2869 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2870 }
2871 }
2872 nbchar = 0;
2873 }
2874 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002875 chunk++;
2876 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2877 chunk = 0;
2878 SHRINK;
2879 GROW;
2880 }
Owen Taylor3473f882001-02-23 17:55:21 +00002881 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002882 if (cur == 0) {
2883 SHRINK;
2884 GROW;
2885 cur = CUR_CHAR(l);
2886 }
Owen Taylor3473f882001-02-23 17:55:21 +00002887 }
2888 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002889 buf[nbchar] = 0;
2890
Owen Taylor3473f882001-02-23 17:55:21 +00002891 /*
2892 * Ok the segment is to be consumed as chars.
2893 */
2894 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2895 if (areBlanks(ctxt, buf, nbchar)) {
2896 if (ctxt->sax->ignorableWhitespace != NULL)
2897 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2898 } else {
2899 htmlCheckParagraph(ctxt);
2900 if (ctxt->sax->characters != NULL)
2901 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2902 }
2903 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002904 } else {
2905 /*
2906 * Loop detection
2907 */
2908 if (cur == 0)
2909 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002910 }
2911}
2912
2913/**
2914 * htmlParseExternalID:
2915 * @ctxt: an HTML parser context
2916 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002917 *
2918 * Parse an External ID or a Public ID
2919 *
Owen Taylor3473f882001-02-23 17:55:21 +00002920 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2921 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2922 *
2923 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2924 *
2925 * Returns the function returns SystemLiteral and in the second
2926 * case publicID receives PubidLiteral, is strict is off
2927 * it is possible to return NULL and have publicID set.
2928 */
2929
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002930static xmlChar *
2931htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002932 xmlChar *URI = NULL;
2933
2934 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2935 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2936 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2937 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002938 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002939 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2940 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002941 }
2942 SKIP_BLANKS;
2943 URI = htmlParseSystemLiteral(ctxt);
2944 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002945 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2946 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002947 }
2948 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2949 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2950 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2951 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002952 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002953 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2954 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002955 }
2956 SKIP_BLANKS;
2957 *publicID = htmlParsePubidLiteral(ctxt);
2958 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002959 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2960 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2961 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002962 }
2963 SKIP_BLANKS;
2964 if ((CUR == '"') || (CUR == '\'')) {
2965 URI = htmlParseSystemLiteral(ctxt);
2966 }
2967 }
2968 return(URI);
2969}
2970
2971/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002972 * xmlParsePI:
2973 * @ctxt: an XML parser context
2974 *
2975 * parse an XML Processing Instruction.
2976 *
2977 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2978 */
2979static void
2980htmlParsePI(htmlParserCtxtPtr ctxt) {
2981 xmlChar *buf = NULL;
2982 int len = 0;
2983 int size = HTML_PARSER_BUFFER_SIZE;
2984 int cur, l;
2985 const xmlChar *target;
2986 xmlParserInputState state;
2987 int count = 0;
2988
2989 if ((RAW == '<') && (NXT(1) == '?')) {
2990 state = ctxt->instate;
2991 ctxt->instate = XML_PARSER_PI;
2992 /*
2993 * this is a Processing Instruction.
2994 */
2995 SKIP(2);
2996 SHRINK;
2997
2998 /*
2999 * Parse the target name and check for special support like
3000 * namespace.
3001 */
3002 target = htmlParseName(ctxt);
3003 if (target != NULL) {
3004 if (RAW == '>') {
3005 SKIP(1);
3006
3007 /*
3008 * SAX: PI detected.
3009 */
3010 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3011 (ctxt->sax->processingInstruction != NULL))
3012 ctxt->sax->processingInstruction(ctxt->userData,
3013 target, NULL);
3014 ctxt->instate = state;
3015 return;
3016 }
3017 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3018 if (buf == NULL) {
3019 htmlErrMemory(ctxt, NULL);
3020 ctxt->instate = state;
3021 return;
3022 }
3023 cur = CUR;
3024 if (!IS_BLANK(cur)) {
3025 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3026 "ParsePI: PI %s space expected\n", target, NULL);
3027 }
3028 SKIP_BLANKS;
3029 cur = CUR_CHAR(l);
3030 while (IS_CHAR(cur) && (cur != '>')) {
3031 if (len + 5 >= size) {
3032 xmlChar *tmp;
3033
3034 size *= 2;
3035 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3036 if (tmp == NULL) {
3037 htmlErrMemory(ctxt, NULL);
3038 xmlFree(buf);
3039 ctxt->instate = state;
3040 return;
3041 }
3042 buf = tmp;
3043 }
3044 count++;
3045 if (count > 50) {
3046 GROW;
3047 count = 0;
3048 }
3049 COPY_BUF(l,buf,len,cur);
3050 NEXTL(l);
3051 cur = CUR_CHAR(l);
3052 if (cur == 0) {
3053 SHRINK;
3054 GROW;
3055 cur = CUR_CHAR(l);
3056 }
3057 }
3058 buf[len] = 0;
3059 if (cur != '>') {
3060 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3061 "ParsePI: PI %s never end ...\n", target, NULL);
3062 } else {
3063 SKIP(1);
3064
3065 /*
3066 * SAX: PI detected.
3067 */
3068 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3069 (ctxt->sax->processingInstruction != NULL))
3070 ctxt->sax->processingInstruction(ctxt->userData,
3071 target, buf);
3072 }
3073 xmlFree(buf);
3074 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003075 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003076 "PI is not started correctly", NULL, NULL);
3077 }
3078 ctxt->instate = state;
3079 }
3080}
3081
3082/**
Owen Taylor3473f882001-02-23 17:55:21 +00003083 * htmlParseComment:
3084 * @ctxt: an HTML parser context
3085 *
3086 * Parse an XML (SGML) comment <!-- .... -->
3087 *
3088 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3089 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003090static void
Owen Taylor3473f882001-02-23 17:55:21 +00003091htmlParseComment(htmlParserCtxtPtr ctxt) {
3092 xmlChar *buf = NULL;
3093 int len;
3094 int size = HTML_PARSER_BUFFER_SIZE;
3095 int q, ql;
3096 int r, rl;
3097 int cur, l;
3098 xmlParserInputState state;
3099
3100 /*
3101 * Check that there is a comment right here.
3102 */
3103 if ((RAW != '<') || (NXT(1) != '!') ||
3104 (NXT(2) != '-') || (NXT(3) != '-')) return;
3105
3106 state = ctxt->instate;
3107 ctxt->instate = XML_PARSER_COMMENT;
3108 SHRINK;
3109 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003110 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003111 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003112 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003113 ctxt->instate = state;
3114 return;
3115 }
3116 q = CUR_CHAR(ql);
3117 NEXTL(ql);
3118 r = CUR_CHAR(rl);
3119 NEXTL(rl);
3120 cur = CUR_CHAR(l);
3121 len = 0;
3122 while (IS_CHAR(cur) &&
3123 ((cur != '>') ||
3124 (r != '-') || (q != '-'))) {
3125 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003126 xmlChar *tmp;
3127
Owen Taylor3473f882001-02-23 17:55:21 +00003128 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003129 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3130 if (tmp == NULL) {
3131 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003132 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003133 ctxt->instate = state;
3134 return;
3135 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003136 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003137 }
3138 COPY_BUF(ql,buf,len,q);
3139 q = r;
3140 ql = rl;
3141 r = cur;
3142 rl = l;
3143 NEXTL(l);
3144 cur = CUR_CHAR(l);
3145 if (cur == 0) {
3146 SHRINK;
3147 GROW;
3148 cur = CUR_CHAR(l);
3149 }
3150 }
3151 buf[len] = 0;
3152 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003153 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3154 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003155 xmlFree(buf);
3156 } else {
3157 NEXT;
3158 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3159 (!ctxt->disableSAX))
3160 ctxt->sax->comment(ctxt->userData, buf);
3161 xmlFree(buf);
3162 }
3163 ctxt->instate = state;
3164}
3165
3166/**
3167 * htmlParseCharRef:
3168 * @ctxt: an HTML parser context
3169 *
3170 * parse Reference declarations
3171 *
3172 * [66] CharRef ::= '&#' [0-9]+ ';' |
3173 * '&#x' [0-9a-fA-F]+ ';'
3174 *
3175 * Returns the value parsed (as an int)
3176 */
3177int
3178htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3179 int val = 0;
3180
Daniel Veillarda03e3652004-11-02 18:45:30 +00003181 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3182 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3183 "htmlParseCharRef: context error\n",
3184 NULL, NULL);
3185 return(0);
3186 }
Owen Taylor3473f882001-02-23 17:55:21 +00003187 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003188 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003189 SKIP(3);
3190 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003191 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003192 val = val * 16 + (CUR - '0');
3193 else if ((CUR >= 'a') && (CUR <= 'f'))
3194 val = val * 16 + (CUR - 'a') + 10;
3195 else if ((CUR >= 'A') && (CUR <= 'F'))
3196 val = val * 16 + (CUR - 'A') + 10;
3197 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003198 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003199 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003200 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003201 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003202 }
3203 NEXT;
3204 }
3205 if (CUR == ';')
3206 NEXT;
3207 } else if ((CUR == '&') && (NXT(1) == '#')) {
3208 SKIP(2);
3209 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003210 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003211 val = val * 10 + (CUR - '0');
3212 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003213 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003214 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003215 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003216 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003217 }
3218 NEXT;
3219 }
3220 if (CUR == ';')
3221 NEXT;
3222 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003223 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3224 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003225 }
3226 /*
3227 * Check the value IS_CHAR ...
3228 */
3229 if (IS_CHAR(val)) {
3230 return(val);
3231 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003232 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3233 "htmlParseCharRef: invalid xmlChar value %d\n",
3234 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003235 }
3236 return(0);
3237}
3238
3239
3240/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003241 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003242 * @ctxt: an HTML parser context
3243 *
3244 * parse a DOCTYPE declaration
3245 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003246 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003247 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3248 */
3249
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003250static void
Owen Taylor3473f882001-02-23 17:55:21 +00003251htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003252 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003253 xmlChar *ExternalID = NULL;
3254 xmlChar *URI = NULL;
3255
3256 /*
3257 * We know that '<!DOCTYPE' has been detected.
3258 */
3259 SKIP(9);
3260
3261 SKIP_BLANKS;
3262
3263 /*
3264 * Parse the DOCTYPE name.
3265 */
3266 name = htmlParseName(ctxt);
3267 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003268 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3269 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3270 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003271 }
3272 /*
3273 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3274 */
3275
3276 SKIP_BLANKS;
3277
3278 /*
3279 * Check for SystemID and ExternalID
3280 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003281 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003282 SKIP_BLANKS;
3283
3284 /*
3285 * We should be at the end of the DOCTYPE declaration.
3286 */
3287 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003288 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3289 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003290 /* We shouldn't try to resynchronize ... */
3291 }
3292 NEXT;
3293
3294 /*
3295 * Create or update the document accordingly to the DOCTYPE
3296 */
3297 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3298 (!ctxt->disableSAX))
3299 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3300
3301 /*
3302 * Cleanup, since we don't use all those identifiers
3303 */
3304 if (URI != NULL) xmlFree(URI);
3305 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003306}
3307
3308/**
3309 * htmlParseAttribute:
3310 * @ctxt: an HTML parser context
3311 * @value: a xmlChar ** used to store the value of the attribute
3312 *
3313 * parse an attribute
3314 *
3315 * [41] Attribute ::= Name Eq AttValue
3316 *
3317 * [25] Eq ::= S? '=' S?
3318 *
3319 * With namespace:
3320 *
3321 * [NS 11] Attribute ::= QName Eq AttValue
3322 *
3323 * Also the case QName == xmlns:??? is handled independently as a namespace
3324 * definition.
3325 *
3326 * Returns the attribute name, and the value in *value.
3327 */
3328
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003329static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003330htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003331 const xmlChar *name;
3332 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003333
3334 *value = NULL;
3335 name = htmlParseHTMLName(ctxt);
3336 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003337 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3338 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003339 return(NULL);
3340 }
3341
3342 /*
3343 * read the value
3344 */
3345 SKIP_BLANKS;
3346 if (CUR == '=') {
3347 NEXT;
3348 SKIP_BLANKS;
3349 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003350 } else if (htmlIsBooleanAttr(name)) {
3351 /*
3352 * assume a minimized attribute
3353 */
3354 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003355 }
3356
3357 *value = val;
3358 return(name);
3359}
3360
3361/**
3362 * htmlCheckEncoding:
3363 * @ctxt: an HTML parser context
3364 * @attvalue: the attribute value
3365 *
3366 * Checks an http-equiv attribute from a Meta tag to detect
3367 * the encoding
3368 * If a new encoding is detected the parser is switched to decode
3369 * it and pass UTF8
3370 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003371static void
Owen Taylor3473f882001-02-23 17:55:21 +00003372htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3373 const xmlChar *encoding;
3374
3375 if ((ctxt == NULL) || (attvalue == NULL))
3376 return;
3377
Daniel Veillarde77db162009-08-22 11:32:38 +02003378 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003379 if (ctxt->input->encoding != NULL)
3380 return;
3381
3382 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3383 if (encoding != NULL) {
3384 encoding += 8;
3385 } else {
3386 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3387 if (encoding != NULL)
3388 encoding += 9;
3389 }
3390 if (encoding != NULL) {
3391 xmlCharEncoding enc;
3392 xmlCharEncodingHandlerPtr handler;
3393
3394 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3395
3396 if (ctxt->input->encoding != NULL)
3397 xmlFree((xmlChar *) ctxt->input->encoding);
3398 ctxt->input->encoding = xmlStrdup(encoding);
3399
3400 enc = xmlParseCharEncoding((const char *) encoding);
3401 /*
3402 * registered set of known encodings
3403 */
3404 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003405 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003406 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3407 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3408 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3409 (ctxt->input->buf != NULL) &&
3410 (ctxt->input->buf->encoder == NULL)) {
3411 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3412 "htmlCheckEncoding: wrong encoding meta\n",
3413 NULL, NULL);
3414 } else {
3415 xmlSwitchEncoding(ctxt, enc);
3416 }
Owen Taylor3473f882001-02-23 17:55:21 +00003417 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3418 } else {
3419 /*
3420 * fallback for unknown encodings
3421 */
3422 handler = xmlFindCharEncodingHandler((const char *) encoding);
3423 if (handler != NULL) {
3424 xmlSwitchToEncoding(ctxt, handler);
3425 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3426 } else {
3427 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3428 }
3429 }
3430
3431 if ((ctxt->input->buf != NULL) &&
3432 (ctxt->input->buf->encoder != NULL) &&
3433 (ctxt->input->buf->raw != NULL) &&
3434 (ctxt->input->buf->buffer != NULL)) {
3435 int nbchars;
3436 int processed;
3437
3438 /*
3439 * convert as much as possible to the parser reading buffer.
3440 */
3441 processed = ctxt->input->cur - ctxt->input->base;
3442 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3443 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3444 ctxt->input->buf->buffer,
3445 ctxt->input->buf->raw);
3446 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003447 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3448 "htmlCheckEncoding: encoder error\n",
3449 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003450 }
3451 ctxt->input->base =
3452 ctxt->input->cur = ctxt->input->buf->buffer->content;
3453 }
3454 }
3455}
3456
3457/**
3458 * htmlCheckMeta:
3459 * @ctxt: an HTML parser context
3460 * @atts: the attributes values
3461 *
3462 * Checks an attributes from a Meta tag
3463 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003464static void
Owen Taylor3473f882001-02-23 17:55:21 +00003465htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3466 int i;
3467 const xmlChar *att, *value;
3468 int http = 0;
3469 const xmlChar *content = NULL;
3470
3471 if ((ctxt == NULL) || (atts == NULL))
3472 return;
3473
3474 i = 0;
3475 att = atts[i++];
3476 while (att != NULL) {
3477 value = atts[i++];
3478 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3479 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3480 http = 1;
3481 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3482 content = value;
3483 att = atts[i++];
3484 }
3485 if ((http) && (content != NULL))
3486 htmlCheckEncoding(ctxt, content);
3487
3488}
3489
3490/**
3491 * htmlParseStartTag:
3492 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003493 *
Owen Taylor3473f882001-02-23 17:55:21 +00003494 * parse a start of tag either for rule element or
3495 * EmptyElement. In both case we don't parse the tag closing chars.
3496 *
3497 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3498 *
3499 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3500 *
3501 * With namespace:
3502 *
3503 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3504 *
3505 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3506 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003507 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003508 */
3509
Daniel Veillard597f1c12005-07-03 23:00:18 +00003510static int
Owen Taylor3473f882001-02-23 17:55:21 +00003511htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003512 const xmlChar *name;
3513 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003514 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003515 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003516 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003517 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003518 int meta = 0;
3519 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003520 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003521
Daniel Veillarde77db162009-08-22 11:32:38 +02003522 if (ctxt->instate == XML_PARSER_EOF)
3523 return(-1);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003524 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3525 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3526 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003527 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003528 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003529 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003530 NEXT;
3531
Daniel Veillard30e76072006-03-09 14:13:55 +00003532 atts = ctxt->atts;
3533 maxatts = ctxt->maxatts;
3534
Owen Taylor3473f882001-02-23 17:55:21 +00003535 GROW;
3536 name = htmlParseHTMLName(ctxt);
3537 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003538 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3539 "htmlParseStartTag: invalid element name\n",
3540 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003541 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003542 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3543 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003544 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003545 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003546 }
3547 if (xmlStrEqual(name, BAD_CAST"meta"))
3548 meta = 1;
3549
3550 /*
3551 * Check for auto-closure of HTML elements.
3552 */
3553 htmlAutoClose(ctxt, name);
3554
3555 /*
3556 * Check for implied HTML elements.
3557 */
3558 htmlCheckImplied(ctxt, name);
3559
3560 /*
3561 * Avoid html at any level > 0, head at any level != 1
3562 * or any attempt to recurse body
3563 */
3564 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003565 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3566 "htmlParseStartTag: misplaced <html> tag\n",
3567 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003568 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003569 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003570 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003571 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003572 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003573 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3574 "htmlParseStartTag: misplaced <head> tag\n",
3575 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003576 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003577 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003578 }
3579 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003580 int indx;
3581 for (indx = 0;indx < ctxt->nameNr;indx++) {
3582 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003583 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3584 "htmlParseStartTag: misplaced <body> tag\n",
3585 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003586 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003587 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003588 }
3589 }
3590 }
3591
3592 /*
3593 * Now parse the attributes, it ends up with the ending
3594 *
3595 * (S Attribute)* S?
3596 */
3597 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003598 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003599 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003600 ((CUR != '/') || (NXT(1) != '>'))) {
3601 long cons = ctxt->nbChars;
3602
3603 GROW;
3604 attname = htmlParseAttribute(ctxt, &attvalue);
3605 if (attname != NULL) {
3606
3607 /*
3608 * Well formedness requires at most one declaration of an attribute
3609 */
3610 for (i = 0; i < nbatts;i += 2) {
3611 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003612 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3613 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003614 if (attvalue != NULL)
3615 xmlFree(attvalue);
3616 goto failed;
3617 }
3618 }
3619
3620 /*
3621 * Add the pair to atts
3622 */
3623 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003624 maxatts = 22; /* allow for 10 attrs by default */
3625 atts = (const xmlChar **)
3626 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003627 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003628 htmlErrMemory(ctxt, NULL);
3629 if (attvalue != NULL)
3630 xmlFree(attvalue);
3631 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003632 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003633 ctxt->atts = atts;
3634 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003635 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003636 const xmlChar **n;
3637
Owen Taylor3473f882001-02-23 17:55:21 +00003638 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003639 n = (const xmlChar **) xmlRealloc((void *) atts,
3640 maxatts * sizeof(const xmlChar *));
3641 if (n == NULL) {
3642 htmlErrMemory(ctxt, NULL);
3643 if (attvalue != NULL)
3644 xmlFree(attvalue);
3645 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003646 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003647 atts = n;
3648 ctxt->atts = atts;
3649 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003650 }
3651 atts[nbatts++] = attname;
3652 atts[nbatts++] = attvalue;
3653 atts[nbatts] = NULL;
3654 atts[nbatts + 1] = NULL;
3655 }
3656 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003657 if (attvalue != NULL)
3658 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003659 /* Dump the bogus attribute string up to the next blank or
3660 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003661 while ((IS_CHAR_CH(CUR)) &&
3662 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003663 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003664 NEXT;
3665 }
3666
3667failed:
3668 SKIP_BLANKS;
3669 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003670 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3671 "htmlParseStartTag: problem parsing attributes\n",
3672 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003673 break;
3674 }
3675 }
3676
3677 /*
3678 * Handle specific association to the META tag
3679 */
William M. Bracke978ae22007-03-21 06:16:02 +00003680 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003681 htmlCheckMeta(ctxt, atts);
3682
3683 /*
3684 * SAX: Start of Element !
3685 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003686 if (!discardtag) {
3687 htmlnamePush(ctxt, name);
3688 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3689 if (nbatts != 0)
3690 ctxt->sax->startElement(ctxt->userData, name, atts);
3691 else
3692 ctxt->sax->startElement(ctxt->userData, name, NULL);
3693 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003694 }
Owen Taylor3473f882001-02-23 17:55:21 +00003695
3696 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003697 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003698 if (atts[i] != NULL)
3699 xmlFree((xmlChar *) atts[i]);
3700 }
Owen Taylor3473f882001-02-23 17:55:21 +00003701 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003702
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003703 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003704}
3705
3706/**
3707 * htmlParseEndTag:
3708 * @ctxt: an HTML parser context
3709 *
3710 * parse an end of tag
3711 *
3712 * [42] ETag ::= '</' Name S? '>'
3713 *
3714 * With namespace
3715 *
3716 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003717 *
3718 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003719 */
3720
Daniel Veillardf420ac52001-07-04 16:04:09 +00003721static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003722htmlParseEndTag(htmlParserCtxtPtr ctxt)
3723{
3724 const xmlChar *name;
3725 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003726 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003727
3728 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003729 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3730 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003731 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003732 }
3733 SKIP(2);
3734
3735 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003736 if (name == NULL)
3737 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003738 /*
3739 * We should definitely be at the ending "S? '>'" part
3740 */
3741 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003742 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003743 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3744 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003745 if (ctxt->recovery) {
3746 /*
3747 * We're not at the ending > !!
3748 * Error, unless in recover mode where we search forwards
3749 * until we find a >
3750 */
3751 while (CUR != '\0' && CUR != '>') NEXT;
3752 NEXT;
3753 }
Owen Taylor3473f882001-02-23 17:55:21 +00003754 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003755 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003756
3757 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003758 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3759 * out now.
3760 */
3761 if ((ctxt->depth > 0) &&
3762 (xmlStrEqual(name, BAD_CAST "html") ||
3763 xmlStrEqual(name, BAD_CAST "body") ||
3764 xmlStrEqual(name, BAD_CAST "head"))) {
3765 ctxt->depth--;
3766 return (0);
3767 }
3768
3769 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003770 * If the name read is not one of the element in the parsing stack
3771 * then return, it's just an error.
3772 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003773 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3774 if (xmlStrEqual(name, ctxt->nameTab[i]))
3775 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003776 }
3777 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003778 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3779 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003780 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003781 }
3782
3783
3784 /*
3785 * Check for auto-closure of HTML elements.
3786 */
3787
3788 htmlAutoCloseOnClose(ctxt, name);
3789
3790 /*
3791 * Well formedness constraints, opening and closing must match.
3792 * With the exception that the autoclose may have popped stuff out
3793 * of the stack.
3794 */
3795 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003796 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003797 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3798 "Opening and ending tag mismatch: %s and %s\n",
3799 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003800 }
3801 }
3802
3803 /*
3804 * SAX: End of Tag
3805 */
3806 oldname = ctxt->name;
3807 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003808 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3809 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003810 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003811 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003812 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003813 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003814 }
3815
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003816 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003817}
3818
3819
3820/**
3821 * htmlParseReference:
3822 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003823 *
Owen Taylor3473f882001-02-23 17:55:21 +00003824 * parse and handle entity references in content,
3825 * this will end-up in a call to character() since this is either a
3826 * CharRef, or a predefined entity.
3827 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003828static void
Owen Taylor3473f882001-02-23 17:55:21 +00003829htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003830 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003831 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003832 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003833 if (CUR != '&') return;
3834
3835 if (NXT(1) == '#') {
3836 unsigned int c;
3837 int bits, i = 0;
3838
3839 c = htmlParseCharRef(ctxt);
3840 if (c == 0)
3841 return;
3842
3843 if (c < 0x80) { out[i++]= c; bits= -6; }
3844 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3845 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3846 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003847
Owen Taylor3473f882001-02-23 17:55:21 +00003848 for ( ; bits >= 0; bits-= 6) {
3849 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3850 }
3851 out[i] = 0;
3852
3853 htmlCheckParagraph(ctxt);
3854 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3855 ctxt->sax->characters(ctxt->userData, out, i);
3856 } else {
3857 ent = htmlParseEntityRef(ctxt, &name);
3858 if (name == NULL) {
3859 htmlCheckParagraph(ctxt);
3860 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3861 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3862 return;
3863 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003864 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003865 htmlCheckParagraph(ctxt);
3866 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3867 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3868 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3869 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3870 }
3871 } else {
3872 unsigned int c;
3873 int bits, i = 0;
3874
3875 c = ent->value;
3876 if (c < 0x80)
3877 { out[i++]= c; bits= -6; }
3878 else if (c < 0x800)
3879 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3880 else if (c < 0x10000)
3881 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003882 else
Owen Taylor3473f882001-02-23 17:55:21 +00003883 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003884
Owen Taylor3473f882001-02-23 17:55:21 +00003885 for ( ; bits >= 0; bits-= 6) {
3886 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3887 }
3888 out[i] = 0;
3889
3890 htmlCheckParagraph(ctxt);
3891 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3892 ctxt->sax->characters(ctxt->userData, out, i);
3893 }
Owen Taylor3473f882001-02-23 17:55:21 +00003894 }
3895}
3896
3897/**
3898 * htmlParseContent:
3899 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003900 *
3901 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003902 */
3903
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003904static void
Owen Taylor3473f882001-02-23 17:55:21 +00003905htmlParseContent(htmlParserCtxtPtr ctxt) {
3906 xmlChar *currentNode;
3907 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003908 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003909
3910 currentNode = xmlStrdup(ctxt->name);
3911 depth = ctxt->nameNr;
3912 while (1) {
3913 long cons = ctxt->nbChars;
3914
3915 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02003916
3917 if (ctxt->instate == XML_PARSER_EOF)
3918 break;
3919
Owen Taylor3473f882001-02-23 17:55:21 +00003920 /*
3921 * Our tag or one of it's parent or children is ending.
3922 */
3923 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003924 if (htmlParseEndTag(ctxt) &&
3925 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3926 if (currentNode != NULL)
3927 xmlFree(currentNode);
3928 return;
3929 }
3930 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003931 }
3932
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003933 else if ((CUR == '<') &&
3934 ((IS_ASCII_LETTER(NXT(1))) ||
3935 (NXT(1) == '_') || (NXT(1) == ':'))) {
3936 name = htmlParseHTMLName_nonInvasive(ctxt);
3937 if (name == NULL) {
3938 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3939 "htmlParseStartTag: invalid element name\n",
3940 NULL, NULL);
3941 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003942 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003943 NEXT;
3944
3945 if (currentNode != NULL)
3946 xmlFree(currentNode);
3947 return;
3948 }
3949
3950 if (ctxt->name != NULL) {
3951 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3952 htmlAutoClose(ctxt, name);
3953 continue;
3954 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003955 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003956 }
3957
Owen Taylor3473f882001-02-23 17:55:21 +00003958 /*
3959 * Has this node been popped out during parsing of
3960 * the next element
3961 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003962 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3963 (!xmlStrEqual(currentNode, ctxt->name)))
3964 {
Owen Taylor3473f882001-02-23 17:55:21 +00003965 if (currentNode != NULL) xmlFree(currentNode);
3966 return;
3967 }
3968
Daniel Veillardf9533d12001-03-03 10:04:57 +00003969 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3970 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003971 /*
3972 * Handle SCRIPT/STYLE separately
3973 */
3974 htmlParseScript(ctxt);
3975 } else {
3976 /*
3977 * Sometimes DOCTYPE arrives in the middle of the document
3978 */
3979 if ((CUR == '<') && (NXT(1) == '!') &&
3980 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3981 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3982 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3983 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003984 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3985 "Misplaced DOCTYPE declaration\n",
3986 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003987 htmlParseDocTypeDecl(ctxt);
3988 }
3989
3990 /*
3991 * First case : a comment
3992 */
3993 if ((CUR == '<') && (NXT(1) == '!') &&
3994 (NXT(2) == '-') && (NXT(3) == '-')) {
3995 htmlParseComment(ctxt);
3996 }
3997
3998 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003999 * Second case : a Processing Instruction.
4000 */
4001 else if ((CUR == '<') && (NXT(1) == '?')) {
4002 htmlParsePI(ctxt);
4003 }
4004
4005 /*
4006 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004007 */
4008 else if (CUR == '<') {
4009 htmlParseElement(ctxt);
4010 }
4011
4012 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004013 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004014 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004015 */
4016 else if (CUR == '&') {
4017 htmlParseReference(ctxt);
4018 }
4019
4020 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004021 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004022 */
4023 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004024 htmlAutoCloseOnEnd(ctxt);
4025 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004026 }
4027
4028 /*
4029 * Last case, text. Note that References are handled directly.
4030 */
4031 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004032 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004033 }
4034
4035 if (cons == ctxt->nbChars) {
4036 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004037 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4038 "detected an error in element content\n",
4039 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004040 }
4041 break;
4042 }
4043 }
4044 GROW;
4045 }
4046 if (currentNode != NULL) xmlFree(currentNode);
4047}
4048
4049/**
Daniel Veillard499cc922006-01-18 17:22:35 +00004050 * htmlParseContent:
4051 * @ctxt: an HTML parser context
4052 *
4053 * Parse a content: comment, sub-element, reference or text.
4054 */
4055
4056void
4057__htmlParseContent(void *ctxt) {
4058 if (ctxt != NULL)
4059 htmlParseContent((htmlParserCtxtPtr) ctxt);
4060}
4061
4062/**
Owen Taylor3473f882001-02-23 17:55:21 +00004063 * htmlParseElement:
4064 * @ctxt: an HTML parser context
4065 *
4066 * parse an HTML element, this is highly recursive
4067 *
4068 * [39] element ::= EmptyElemTag | STag content ETag
4069 *
4070 * [41] Attribute ::= Name Eq AttValue
4071 */
4072
4073void
4074htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004075 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004076 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004077 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004078 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004079 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004080 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004081 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004082
Daniel Veillarde77db162009-08-22 11:32:38 +02004083 if (ctxt->instate == XML_PARSER_EOF)
4084 return;
4085
Daniel Veillarda03e3652004-11-02 18:45:30 +00004086 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4087 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004088 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004089 return;
4090 }
Owen Taylor3473f882001-02-23 17:55:21 +00004091 /* Capture start position */
4092 if (ctxt->record_info) {
4093 node_info.begin_pos = ctxt->input->consumed +
4094 (CUR_PTR - ctxt->input->base);
4095 node_info.begin_line = ctxt->input->line;
4096 }
4097
Daniel Veillard597f1c12005-07-03 23:00:18 +00004098 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004099 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004100 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004101 if (CUR == '>')
4102 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004103 return;
4104 }
Owen Taylor3473f882001-02-23 17:55:21 +00004105
4106 /*
4107 * Lookup the info for that element.
4108 */
4109 info = htmlTagLookup(name);
4110 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004111 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4112 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004113 }
4114
4115 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004116 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004117 */
4118 if ((CUR == '/') && (NXT(1) == '>')) {
4119 SKIP(2);
4120 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4121 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004122 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004123 return;
4124 }
4125
4126 if (CUR == '>') {
4127 NEXT;
4128 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004129 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4130 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004131
4132 /*
4133 * end of parsing of this node.
4134 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004135 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004136 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004137 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004138 }
Owen Taylor3473f882001-02-23 17:55:21 +00004139
4140 /*
4141 * Capture end position and add node
4142 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004143 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004144 node_info.end_pos = ctxt->input->consumed +
4145 (CUR_PTR - ctxt->input->base);
4146 node_info.end_line = ctxt->input->line;
4147 node_info.node = ctxt->node;
4148 xmlParserAddNodeInfo(ctxt, &node_info);
4149 }
4150 return;
4151 }
4152
4153 /*
4154 * Check for an Empty Element from DTD definition
4155 */
4156 if ((info != NULL) && (info->empty)) {
4157 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4158 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004159 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004160 return;
4161 }
4162
4163 /*
4164 * Parse the content of the element:
4165 */
4166 currentNode = xmlStrdup(ctxt->name);
4167 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004168 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004169 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004170 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004171 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004172 if (ctxt->nameNr < depth) break;
4173 }
Owen Taylor3473f882001-02-23 17:55:21 +00004174
Owen Taylor3473f882001-02-23 17:55:21 +00004175 /*
4176 * Capture end position and add node
4177 */
4178 if ( currentNode != NULL && ctxt->record_info ) {
4179 node_info.end_pos = ctxt->input->consumed +
4180 (CUR_PTR - ctxt->input->base);
4181 node_info.end_line = ctxt->input->line;
4182 node_info.node = ctxt->node;
4183 xmlParserAddNodeInfo(ctxt, &node_info);
4184 }
William M. Brack76e95df2003-10-18 16:20:14 +00004185 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004186 htmlAutoCloseOnEnd(ctxt);
4187 }
4188
Owen Taylor3473f882001-02-23 17:55:21 +00004189 if (currentNode != NULL)
4190 xmlFree(currentNode);
4191}
4192
4193/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004194 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004195 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004196 *
Owen Taylor3473f882001-02-23 17:55:21 +00004197 * parse an HTML document (and build a tree if using the standard SAX
4198 * interface).
4199 *
4200 * Returns 0, -1 in case of error. the parser context is augmented
4201 * as a result of the parsing.
4202 */
4203
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004204int
Owen Taylor3473f882001-02-23 17:55:21 +00004205htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004206 xmlChar start[4];
4207 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004208 xmlDtdPtr dtd;
4209
Daniel Veillardd0463562001-10-13 09:15:48 +00004210 xmlInitParser();
4211
Owen Taylor3473f882001-02-23 17:55:21 +00004212 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004213
Daniel Veillarda03e3652004-11-02 18:45:30 +00004214 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4215 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4216 "htmlParseDocument: context error\n", NULL, NULL);
4217 return(XML_ERR_INTERNAL_ERROR);
4218 }
4219 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004220 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004221 GROW;
4222 /*
4223 * SAX: beginning of the document processing.
4224 */
4225 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4226 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4227
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004228 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4229 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4230 /*
4231 * Get the 4 first bytes and decode the charset
4232 * if enc != XML_CHAR_ENCODING_NONE
4233 * plug some encoding conversion routines.
4234 */
4235 start[0] = RAW;
4236 start[1] = NXT(1);
4237 start[2] = NXT(2);
4238 start[3] = NXT(3);
4239 enc = xmlDetectCharEncoding(&start[0], 4);
4240 if (enc != XML_CHAR_ENCODING_NONE) {
4241 xmlSwitchEncoding(ctxt, enc);
4242 }
4243 }
4244
Owen Taylor3473f882001-02-23 17:55:21 +00004245 /*
4246 * Wipe out everything which is before the first '<'
4247 */
4248 SKIP_BLANKS;
4249 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004250 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004251 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004252 }
4253
4254 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4255 ctxt->sax->startDocument(ctxt->userData);
4256
4257
4258 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004259 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004260 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004261 while (((CUR == '<') && (NXT(1) == '!') &&
4262 (NXT(2) == '-') && (NXT(3) == '-')) ||
4263 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004264 htmlParseComment(ctxt);
4265 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004266 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004267 }
Owen Taylor3473f882001-02-23 17:55:21 +00004268
4269
4270 /*
4271 * Then possibly doc type declaration(s) and more Misc
4272 * (doctypedecl Misc*)?
4273 */
4274 if ((CUR == '<') && (NXT(1) == '!') &&
4275 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4276 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4277 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4278 (UPP(8) == 'E')) {
4279 htmlParseDocTypeDecl(ctxt);
4280 }
4281 SKIP_BLANKS;
4282
4283 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004284 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004285 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004286 while (((CUR == '<') && (NXT(1) == '!') &&
4287 (NXT(2) == '-') && (NXT(3) == '-')) ||
4288 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004289 htmlParseComment(ctxt);
4290 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004291 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004292 }
Owen Taylor3473f882001-02-23 17:55:21 +00004293
4294 /*
4295 * Time to start parsing the tree itself
4296 */
4297 htmlParseContent(ctxt);
4298
4299 /*
4300 * autoclose
4301 */
4302 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004303 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004304
4305
4306 /*
4307 * SAX: end of the document processing.
4308 */
4309 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4310 ctxt->sax->endDocument(ctxt->userData);
4311
4312 if (ctxt->myDoc != NULL) {
4313 dtd = xmlGetIntSubset(ctxt->myDoc);
4314 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004315 ctxt->myDoc->intSubset =
4316 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004317 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4318 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4319 }
4320 if (! ctxt->wellFormed) return(-1);
4321 return(0);
4322}
4323
4324
4325/************************************************************************
4326 * *
4327 * Parser contexts handling *
4328 * *
4329 ************************************************************************/
4330
4331/**
William M. Brackedb65a72004-02-06 07:36:04 +00004332 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004333 * @ctxt: an HTML parser context
4334 *
4335 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004336 *
4337 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004338 */
4339
Daniel Veillardf403d292003-10-05 13:51:35 +00004340static int
Owen Taylor3473f882001-02-23 17:55:21 +00004341htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4342{
4343 htmlSAXHandler *sax;
4344
Daniel Veillardf403d292003-10-05 13:51:35 +00004345 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004346 memset(ctxt, 0, sizeof(htmlParserCtxt));
4347
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004348 ctxt->dict = xmlDictCreate();
4349 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004350 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4351 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004352 }
Owen Taylor3473f882001-02-23 17:55:21 +00004353 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4354 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004355 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4356 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004357 }
4358 else
4359 memset(sax, 0, sizeof(htmlSAXHandler));
4360
4361 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004362 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004363 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4364 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004365 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004366 ctxt->inputNr = 0;
4367 ctxt->inputMax = 0;
4368 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004369 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004370 }
4371 ctxt->inputNr = 0;
4372 ctxt->inputMax = 5;
4373 ctxt->input = NULL;
4374 ctxt->version = NULL;
4375 ctxt->encoding = NULL;
4376 ctxt->standalone = -1;
4377 ctxt->instate = XML_PARSER_START;
4378
4379 /* Allocate the Node stack */
4380 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4381 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004382 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004383 ctxt->nodeNr = 0;
4384 ctxt->nodeMax = 0;
4385 ctxt->node = NULL;
4386 ctxt->inputNr = 0;
4387 ctxt->inputMax = 0;
4388 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004389 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004390 }
4391 ctxt->nodeNr = 0;
4392 ctxt->nodeMax = 10;
4393 ctxt->node = NULL;
4394
4395 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004396 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004397 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004398 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004399 ctxt->nameNr = 0;
4400 ctxt->nameMax = 10;
4401 ctxt->name = NULL;
4402 ctxt->nodeNr = 0;
4403 ctxt->nodeMax = 0;
4404 ctxt->node = NULL;
4405 ctxt->inputNr = 0;
4406 ctxt->inputMax = 0;
4407 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004408 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004409 }
4410 ctxt->nameNr = 0;
4411 ctxt->nameMax = 10;
4412 ctxt->name = NULL;
4413
Daniel Veillard092643b2003-09-25 14:29:29 +00004414 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004415 else {
4416 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004417 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004418 }
4419 ctxt->userData = ctxt;
4420 ctxt->myDoc = NULL;
4421 ctxt->wellFormed = 1;
4422 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004423 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004424 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004425 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004426 ctxt->vctxt.userData = ctxt;
4427 ctxt->vctxt.error = xmlParserValidityError;
4428 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004429 ctxt->record_info = 0;
4430 ctxt->validate = 0;
4431 ctxt->nbChars = 0;
4432 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004433 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004434 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004435 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004436}
4437
4438/**
4439 * htmlFreeParserCtxt:
4440 * @ctxt: an HTML parser context
4441 *
4442 * Free all the memory used by a parser context. However the parsed
4443 * document in ctxt->myDoc is not freed.
4444 */
4445
4446void
4447htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4448{
4449 xmlFreeParserCtxt(ctxt);
4450}
4451
4452/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004453 * htmlNewParserCtxt:
4454 *
4455 * Allocate and initialize a new parser context.
4456 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004457 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004458 */
4459
Daniel Veillard34c647c2006-09-21 06:53:59 +00004460htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004461htmlNewParserCtxt(void)
4462{
4463 xmlParserCtxtPtr ctxt;
4464
4465 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4466 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004467 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004468 return(NULL);
4469 }
4470 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004471 if (htmlInitParserCtxt(ctxt) < 0) {
4472 htmlFreeParserCtxt(ctxt);
4473 return(NULL);
4474 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004475 return(ctxt);
4476}
4477
4478/**
4479 * htmlCreateMemoryParserCtxt:
4480 * @buffer: a pointer to a char array
4481 * @size: the size of the array
4482 *
4483 * Create a parser context for an HTML in-memory document.
4484 *
4485 * Returns the new parser context or NULL
4486 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004487htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004488htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4489 xmlParserCtxtPtr ctxt;
4490 xmlParserInputPtr input;
4491 xmlParserInputBufferPtr buf;
4492
4493 if (buffer == NULL)
4494 return(NULL);
4495 if (size <= 0)
4496 return(NULL);
4497
4498 ctxt = htmlNewParserCtxt();
4499 if (ctxt == NULL)
4500 return(NULL);
4501
4502 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4503 if (buf == NULL) return(NULL);
4504
4505 input = xmlNewInputStream(ctxt);
4506 if (input == NULL) {
4507 xmlFreeParserCtxt(ctxt);
4508 return(NULL);
4509 }
4510
4511 input->filename = NULL;
4512 input->buf = buf;
4513 input->base = input->buf->buffer->content;
4514 input->cur = input->buf->buffer->content;
4515 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4516
4517 inputPush(ctxt, input);
4518 return(ctxt);
4519}
4520
4521/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004522 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004523 * @cur: a pointer to an array of xmlChar
4524 * @encoding: a free form C string describing the HTML document encoding, or NULL
4525 *
4526 * Create a parser context for an HTML document.
4527 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004528 * TODO: check the need to add encoding handling there
4529 *
Owen Taylor3473f882001-02-23 17:55:21 +00004530 * Returns the new parser context or NULL
4531 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004532static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004533htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004534 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004535 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004536
Daniel Veillard1d995272002-07-22 16:43:32 +00004537 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004538 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004539 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004540 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004541 if (ctxt == NULL)
4542 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004543
4544 if (encoding != NULL) {
4545 xmlCharEncoding enc;
4546 xmlCharEncodingHandlerPtr handler;
4547
4548 if (ctxt->input->encoding != NULL)
4549 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004550 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004551
4552 enc = xmlParseCharEncoding(encoding);
4553 /*
4554 * registered set of known encodings
4555 */
4556 if (enc != XML_CHAR_ENCODING_ERROR) {
4557 xmlSwitchEncoding(ctxt, enc);
4558 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004559 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02004560 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00004561 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004562 }
4563 } else {
4564 /*
4565 * fallback for unknown encodings
4566 */
4567 handler = xmlFindCharEncodingHandler((const char *) encoding);
4568 if (handler != NULL) {
4569 xmlSwitchToEncoding(ctxt, handler);
4570 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004571 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4572 "Unsupported encoding %s\n",
4573 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004574 }
4575 }
4576 }
4577 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004578}
4579
Daniel Veillard73b013f2003-09-30 12:36:01 +00004580#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004581/************************************************************************
4582 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02004583 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00004584 * *
4585 ************************************************************************/
4586
4587/**
4588 * htmlParseLookupSequence:
4589 * @ctxt: an HTML parser context
4590 * @first: the first char to lookup
4591 * @next: the next char to lookup or zero
4592 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004593 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004594 *
4595 * Try to find if a sequence (first, next, third) or just (first next) or
4596 * (first) is available in the input stream.
4597 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4598 * to avoid rescanning sequences of bytes, it DOES change the state of the
4599 * parser, do not use liberally.
4600 * This is basically similar to xmlParseLookupSequence()
4601 *
4602 * Returns the index to the current parsing point if the full sequence
4603 * is available, -1 otherwise.
4604 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004605static int
Owen Taylor3473f882001-02-23 17:55:21 +00004606htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02004607 xmlChar next, xmlChar third, int iscomment,
4608 int ignoreattrval) {
Owen Taylor3473f882001-02-23 17:55:21 +00004609 int base, len;
4610 htmlParserInputPtr in;
4611 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004612 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02004613 int invalue = 0;
4614 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00004615
4616 in = ctxt->input;
4617 if (in == NULL) return(-1);
4618 base = in->cur - in->base;
4619 if (base < 0) return(-1);
4620 if (ctxt->checkIndex > base)
4621 base = ctxt->checkIndex;
4622 if (in->buf == NULL) {
4623 buf = in->base;
4624 len = in->length;
4625 } else {
4626 buf = in->buf->buffer->content;
4627 len = in->buf->buffer->use;
4628 }
4629 /* take into account the sequence length */
4630 if (third) len -= 2;
4631 else if (next) len --;
4632 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004633 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004634 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4635 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4636 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004637 /* do not increment past <! - some people use <!--> */
4638 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004639 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004640 }
Jiri Netolicky446e1262009-08-07 17:05:36 +02004641 if (ignoreattrval) {
4642 if (buf[base] == '"' || buf[base] == '\'') {
4643 if (invalue) {
4644 if (buf[base] == valdellim) {
4645 invalue = 0;
4646 continue;
4647 }
4648 } else {
4649 valdellim = buf[base];
4650 invalue = 1;
4651 continue;
4652 }
4653 } else if (invalue) {
4654 continue;
4655 }
4656 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004657 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004658 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004659 return(-1);
4660 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4661 (buf[base + 2] == '>')) {
4662 incomment = 0;
4663 base += 2;
4664 }
4665 continue;
4666 }
Owen Taylor3473f882001-02-23 17:55:21 +00004667 if (buf[base] == first) {
4668 if (third != 0) {
4669 if ((buf[base + 1] != next) ||
4670 (buf[base + 2] != third)) continue;
4671 } else if (next != 0) {
4672 if (buf[base + 1] != next) continue;
4673 }
4674 ctxt->checkIndex = 0;
4675#ifdef DEBUG_PUSH
4676 if (next == 0)
4677 xmlGenericError(xmlGenericErrorContext,
4678 "HPP: lookup '%c' found at %d\n",
4679 first, base);
4680 else if (third == 0)
4681 xmlGenericError(xmlGenericErrorContext,
4682 "HPP: lookup '%c%c' found at %d\n",
4683 first, next, base);
Daniel Veillarde77db162009-08-22 11:32:38 +02004684 else
Owen Taylor3473f882001-02-23 17:55:21 +00004685 xmlGenericError(xmlGenericErrorContext,
4686 "HPP: lookup '%c%c%c' found at %d\n",
4687 first, next, third, base);
4688#endif
4689 return(base - (in->cur - in->base));
4690 }
4691 }
4692 ctxt->checkIndex = base;
4693#ifdef DEBUG_PUSH
4694 if (next == 0)
4695 xmlGenericError(xmlGenericErrorContext,
4696 "HPP: lookup '%c' failed\n", first);
4697 else if (third == 0)
4698 xmlGenericError(xmlGenericErrorContext,
4699 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02004700 else
Owen Taylor3473f882001-02-23 17:55:21 +00004701 xmlGenericError(xmlGenericErrorContext,
4702 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4703#endif
4704 return(-1);
4705}
4706
4707/**
4708 * htmlParseTryOrFinish:
4709 * @ctxt: an HTML parser context
4710 * @terminate: last chunk indicator
4711 *
4712 * Try to progress on parsing
4713 *
4714 * Returns zero if no parsing was possible
4715 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004716static int
Owen Taylor3473f882001-02-23 17:55:21 +00004717htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4718 int ret = 0;
4719 htmlParserInputPtr in;
4720 int avail = 0;
4721 xmlChar cur, next;
4722
4723#ifdef DEBUG_PUSH
4724 switch (ctxt->instate) {
4725 case XML_PARSER_EOF:
4726 xmlGenericError(xmlGenericErrorContext,
4727 "HPP: try EOF\n"); break;
4728 case XML_PARSER_START:
4729 xmlGenericError(xmlGenericErrorContext,
4730 "HPP: try START\n"); break;
4731 case XML_PARSER_MISC:
4732 xmlGenericError(xmlGenericErrorContext,
4733 "HPP: try MISC\n");break;
4734 case XML_PARSER_COMMENT:
4735 xmlGenericError(xmlGenericErrorContext,
4736 "HPP: try COMMENT\n");break;
4737 case XML_PARSER_PROLOG:
4738 xmlGenericError(xmlGenericErrorContext,
4739 "HPP: try PROLOG\n");break;
4740 case XML_PARSER_START_TAG:
4741 xmlGenericError(xmlGenericErrorContext,
4742 "HPP: try START_TAG\n");break;
4743 case XML_PARSER_CONTENT:
4744 xmlGenericError(xmlGenericErrorContext,
4745 "HPP: try CONTENT\n");break;
4746 case XML_PARSER_CDATA_SECTION:
4747 xmlGenericError(xmlGenericErrorContext,
4748 "HPP: try CDATA_SECTION\n");break;
4749 case XML_PARSER_END_TAG:
4750 xmlGenericError(xmlGenericErrorContext,
4751 "HPP: try END_TAG\n");break;
4752 case XML_PARSER_ENTITY_DECL:
4753 xmlGenericError(xmlGenericErrorContext,
4754 "HPP: try ENTITY_DECL\n");break;
4755 case XML_PARSER_ENTITY_VALUE:
4756 xmlGenericError(xmlGenericErrorContext,
4757 "HPP: try ENTITY_VALUE\n");break;
4758 case XML_PARSER_ATTRIBUTE_VALUE:
4759 xmlGenericError(xmlGenericErrorContext,
4760 "HPP: try ATTRIBUTE_VALUE\n");break;
4761 case XML_PARSER_DTD:
4762 xmlGenericError(xmlGenericErrorContext,
4763 "HPP: try DTD\n");break;
4764 case XML_PARSER_EPILOG:
4765 xmlGenericError(xmlGenericErrorContext,
4766 "HPP: try EPILOG\n");break;
4767 case XML_PARSER_PI:
4768 xmlGenericError(xmlGenericErrorContext,
4769 "HPP: try PI\n");break;
4770 case XML_PARSER_SYSTEM_LITERAL:
4771 xmlGenericError(xmlGenericErrorContext,
4772 "HPP: try SYSTEM_LITERAL\n");break;
4773 }
4774#endif
4775
4776 while (1) {
4777
4778 in = ctxt->input;
4779 if (in == NULL) break;
4780 if (in->buf == NULL)
4781 avail = in->length - (in->cur - in->base);
4782 else
4783 avail = in->buf->buffer->use - (in->cur - in->base);
4784 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004785 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004786 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004787 /*
4788 * SAX: end of the document processing.
4789 */
4790 ctxt->instate = XML_PARSER_EOF;
4791 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4792 ctxt->sax->endDocument(ctxt->userData);
4793 }
4794 }
4795 if (avail < 1)
4796 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004797 cur = in->cur[0];
4798 if (cur == 0) {
4799 SKIP(1);
4800 continue;
4801 }
4802
Owen Taylor3473f882001-02-23 17:55:21 +00004803 switch (ctxt->instate) {
4804 case XML_PARSER_EOF:
4805 /*
4806 * Document parsing is done !
4807 */
4808 goto done;
4809 case XML_PARSER_START:
4810 /*
4811 * Very first chars read from the document flow.
4812 */
4813 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004814 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004815 SKIP_BLANKS;
4816 if (in->buf == NULL)
4817 avail = in->length - (in->cur - in->base);
4818 else
4819 avail = in->buf->buffer->use - (in->cur - in->base);
4820 }
4821 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4822 ctxt->sax->setDocumentLocator(ctxt->userData,
4823 &xmlDefaultSAXLocator);
4824 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4825 (!ctxt->disableSAX))
4826 ctxt->sax->startDocument(ctxt->userData);
4827
4828 cur = in->cur[0];
4829 next = in->cur[1];
4830 if ((cur == '<') && (next == '!') &&
4831 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4832 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4833 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4834 (UPP(8) == 'E')) {
4835 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004836 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004837 goto done;
4838#ifdef DEBUG_PUSH
4839 xmlGenericError(xmlGenericErrorContext,
4840 "HPP: Parsing internal subset\n");
4841#endif
4842 htmlParseDocTypeDecl(ctxt);
4843 ctxt->instate = XML_PARSER_PROLOG;
4844#ifdef DEBUG_PUSH
4845 xmlGenericError(xmlGenericErrorContext,
4846 "HPP: entering PROLOG\n");
4847#endif
4848 } else {
4849 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004850#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004851 xmlGenericError(xmlGenericErrorContext,
4852 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004853#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004854 }
Owen Taylor3473f882001-02-23 17:55:21 +00004855 break;
4856 case XML_PARSER_MISC:
4857 SKIP_BLANKS;
4858 if (in->buf == NULL)
4859 avail = in->length - (in->cur - in->base);
4860 else
4861 avail = in->buf->buffer->use - (in->cur - in->base);
4862 if (avail < 2)
4863 goto done;
4864 cur = in->cur[0];
4865 next = in->cur[1];
4866 if ((cur == '<') && (next == '!') &&
4867 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4868 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004869 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004870 goto done;
4871#ifdef DEBUG_PUSH
4872 xmlGenericError(xmlGenericErrorContext,
4873 "HPP: Parsing Comment\n");
4874#endif
4875 htmlParseComment(ctxt);
4876 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004877 } else if ((cur == '<') && (next == '?')) {
4878 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004879 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004880 goto done;
4881#ifdef DEBUG_PUSH
4882 xmlGenericError(xmlGenericErrorContext,
4883 "HPP: Parsing PI\n");
4884#endif
4885 htmlParsePI(ctxt);
4886 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004887 } else if ((cur == '<') && (next == '!') &&
4888 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4889 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4890 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4891 (UPP(8) == 'E')) {
4892 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004893 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004894 goto done;
4895#ifdef DEBUG_PUSH
4896 xmlGenericError(xmlGenericErrorContext,
4897 "HPP: Parsing internal subset\n");
4898#endif
4899 htmlParseDocTypeDecl(ctxt);
4900 ctxt->instate = XML_PARSER_PROLOG;
4901#ifdef DEBUG_PUSH
4902 xmlGenericError(xmlGenericErrorContext,
4903 "HPP: entering PROLOG\n");
4904#endif
4905 } else if ((cur == '<') && (next == '!') &&
4906 (avail < 9)) {
4907 goto done;
4908 } else {
4909 ctxt->instate = XML_PARSER_START_TAG;
4910#ifdef DEBUG_PUSH
4911 xmlGenericError(xmlGenericErrorContext,
4912 "HPP: entering START_TAG\n");
4913#endif
4914 }
4915 break;
4916 case XML_PARSER_PROLOG:
4917 SKIP_BLANKS;
4918 if (in->buf == NULL)
4919 avail = in->length - (in->cur - in->base);
4920 else
4921 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02004922 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00004923 goto done;
4924 cur = in->cur[0];
4925 next = in->cur[1];
4926 if ((cur == '<') && (next == '!') &&
4927 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4928 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004929 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004930 goto done;
4931#ifdef DEBUG_PUSH
4932 xmlGenericError(xmlGenericErrorContext,
4933 "HPP: Parsing Comment\n");
4934#endif
4935 htmlParseComment(ctxt);
4936 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004937 } else if ((cur == '<') && (next == '?')) {
4938 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004939 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004940 goto done;
4941#ifdef DEBUG_PUSH
4942 xmlGenericError(xmlGenericErrorContext,
4943 "HPP: Parsing PI\n");
4944#endif
4945 htmlParsePI(ctxt);
4946 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004947 } else if ((cur == '<') && (next == '!') &&
4948 (avail < 4)) {
4949 goto done;
4950 } else {
4951 ctxt->instate = XML_PARSER_START_TAG;
4952#ifdef DEBUG_PUSH
4953 xmlGenericError(xmlGenericErrorContext,
4954 "HPP: entering START_TAG\n");
4955#endif
4956 }
4957 break;
4958 case XML_PARSER_EPILOG:
4959 if (in->buf == NULL)
4960 avail = in->length - (in->cur - in->base);
4961 else
4962 avail = in->buf->buffer->use - (in->cur - in->base);
4963 if (avail < 1)
4964 goto done;
4965 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004966 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004967 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004968 goto done;
4969 }
4970 if (avail < 2)
4971 goto done;
4972 next = in->cur[1];
4973 if ((cur == '<') && (next == '!') &&
4974 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4975 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004976 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004977 goto done;
4978#ifdef DEBUG_PUSH
4979 xmlGenericError(xmlGenericErrorContext,
4980 "HPP: Parsing Comment\n");
4981#endif
4982 htmlParseComment(ctxt);
4983 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004984 } else if ((cur == '<') && (next == '?')) {
4985 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004986 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004987 goto done;
4988#ifdef DEBUG_PUSH
4989 xmlGenericError(xmlGenericErrorContext,
4990 "HPP: Parsing PI\n");
4991#endif
4992 htmlParsePI(ctxt);
4993 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004994 } else if ((cur == '<') && (next == '!') &&
4995 (avail < 4)) {
4996 goto done;
4997 } else {
4998 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004999 ctxt->wellFormed = 0;
5000 ctxt->instate = XML_PARSER_EOF;
5001#ifdef DEBUG_PUSH
5002 xmlGenericError(xmlGenericErrorContext,
5003 "HPP: entering EOF\n");
5004#endif
5005 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5006 ctxt->sax->endDocument(ctxt->userData);
5007 goto done;
5008 }
5009 break;
5010 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005011 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005012 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005013 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005014
5015 if (avail < 2)
5016 goto done;
5017 cur = in->cur[0];
5018 if (cur != '<') {
5019 ctxt->instate = XML_PARSER_CONTENT;
5020#ifdef DEBUG_PUSH
5021 xmlGenericError(xmlGenericErrorContext,
5022 "HPP: entering CONTENT\n");
5023#endif
5024 break;
5025 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005026 if (in->cur[1] == '/') {
5027 ctxt->instate = XML_PARSER_END_TAG;
5028 ctxt->checkIndex = 0;
5029#ifdef DEBUG_PUSH
5030 xmlGenericError(xmlGenericErrorContext,
5031 "HPP: entering END_TAG\n");
5032#endif
5033 break;
5034 }
Owen Taylor3473f882001-02-23 17:55:21 +00005035 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005036 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005037 goto done;
5038
Daniel Veillard597f1c12005-07-03 23:00:18 +00005039 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005040 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005041 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005042 (name == NULL)) {
5043 if (CUR == '>')
5044 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005045 break;
5046 }
Owen Taylor3473f882001-02-23 17:55:21 +00005047
5048 /*
5049 * Lookup the info for that element.
5050 */
5051 info = htmlTagLookup(name);
5052 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005053 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5054 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005055 }
5056
5057 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005058 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005059 */
5060 if ((CUR == '/') && (NXT(1) == '>')) {
5061 SKIP(2);
5062 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5063 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005064 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005065 ctxt->instate = XML_PARSER_CONTENT;
5066#ifdef DEBUG_PUSH
5067 xmlGenericError(xmlGenericErrorContext,
5068 "HPP: entering CONTENT\n");
5069#endif
5070 break;
5071 }
5072
5073 if (CUR == '>') {
5074 NEXT;
5075 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005076 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5077 "Couldn't find end of Start Tag %s\n",
5078 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005079
5080 /*
5081 * end of parsing of this node.
5082 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005083 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005084 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005085 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005086 }
Owen Taylor3473f882001-02-23 17:55:21 +00005087
5088 ctxt->instate = XML_PARSER_CONTENT;
5089#ifdef DEBUG_PUSH
5090 xmlGenericError(xmlGenericErrorContext,
5091 "HPP: entering CONTENT\n");
5092#endif
5093 break;
5094 }
5095
5096 /*
5097 * Check for an Empty Element from DTD definition
5098 */
5099 if ((info != NULL) && (info->empty)) {
5100 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5101 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005102 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005103 }
5104 ctxt->instate = XML_PARSER_CONTENT;
5105#ifdef DEBUG_PUSH
5106 xmlGenericError(xmlGenericErrorContext,
5107 "HPP: entering CONTENT\n");
5108#endif
5109 break;
5110 }
5111 case XML_PARSER_CONTENT: {
5112 long cons;
5113 /*
5114 * Handle preparsed entities and charRef
5115 */
5116 if (ctxt->token != 0) {
5117 xmlChar chr[2] = { 0 , 0 } ;
5118
5119 chr[0] = (xmlChar) ctxt->token;
5120 htmlCheckParagraph(ctxt);
5121 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5122 ctxt->sax->characters(ctxt->userData, chr, 1);
5123 ctxt->token = 0;
5124 ctxt->checkIndex = 0;
5125 }
5126 if ((avail == 1) && (terminate)) {
5127 cur = in->cur[0];
5128 if ((cur != '<') && (cur != '&')) {
5129 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005130 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005131 if (ctxt->sax->ignorableWhitespace != NULL)
5132 ctxt->sax->ignorableWhitespace(
5133 ctxt->userData, &cur, 1);
5134 } else {
5135 htmlCheckParagraph(ctxt);
5136 if (ctxt->sax->characters != NULL)
5137 ctxt->sax->characters(
5138 ctxt->userData, &cur, 1);
5139 }
5140 }
5141 ctxt->token = 0;
5142 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005143 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005144 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005145 }
Owen Taylor3473f882001-02-23 17:55:21 +00005146 }
5147 if (avail < 2)
5148 goto done;
5149 cur = in->cur[0];
5150 next = in->cur[1];
5151 cons = ctxt->nbChars;
5152 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5153 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5154 /*
5155 * Handle SCRIPT/STYLE separately
5156 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005157 if (!terminate) {
5158 int idx;
5159 xmlChar val;
5160
Jiri Netolicky446e1262009-08-07 17:05:36 +02005161 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
Daniel Veillard68716a72006-10-16 09:32:17 +00005162 if (idx < 0)
5163 goto done;
5164 val = in->cur[idx + 2];
5165 if (val == 0) /* bad cut of input */
5166 goto done;
5167 }
Owen Taylor3473f882001-02-23 17:55:21 +00005168 htmlParseScript(ctxt);
5169 if ((cur == '<') && (next == '/')) {
5170 ctxt->instate = XML_PARSER_END_TAG;
5171 ctxt->checkIndex = 0;
5172#ifdef DEBUG_PUSH
5173 xmlGenericError(xmlGenericErrorContext,
5174 "HPP: entering END_TAG\n");
5175#endif
5176 break;
5177 }
5178 } else {
5179 /*
5180 * Sometimes DOCTYPE arrives in the middle of the document
5181 */
5182 if ((cur == '<') && (next == '!') &&
5183 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5184 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5185 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5186 (UPP(8) == 'E')) {
5187 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005188 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005189 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005190 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5191 "Misplaced DOCTYPE declaration\n",
5192 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005193 htmlParseDocTypeDecl(ctxt);
5194 } else if ((cur == '<') && (next == '!') &&
5195 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5196 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005197 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005198 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005199 goto done;
5200#ifdef DEBUG_PUSH
5201 xmlGenericError(xmlGenericErrorContext,
5202 "HPP: Parsing Comment\n");
5203#endif
5204 htmlParseComment(ctxt);
5205 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005206 } else if ((cur == '<') && (next == '?')) {
5207 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005208 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005209 goto done;
5210#ifdef DEBUG_PUSH
5211 xmlGenericError(xmlGenericErrorContext,
5212 "HPP: Parsing PI\n");
5213#endif
5214 htmlParsePI(ctxt);
5215 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005216 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5217 goto done;
5218 } else if ((cur == '<') && (next == '/')) {
5219 ctxt->instate = XML_PARSER_END_TAG;
5220 ctxt->checkIndex = 0;
5221#ifdef DEBUG_PUSH
5222 xmlGenericError(xmlGenericErrorContext,
5223 "HPP: entering END_TAG\n");
5224#endif
5225 break;
5226 } else if (cur == '<') {
5227 ctxt->instate = XML_PARSER_START_TAG;
5228 ctxt->checkIndex = 0;
5229#ifdef DEBUG_PUSH
5230 xmlGenericError(xmlGenericErrorContext,
5231 "HPP: entering START_TAG\n");
5232#endif
5233 break;
5234 } else if (cur == '&') {
5235 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005236 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005237 goto done;
5238#ifdef DEBUG_PUSH
5239 xmlGenericError(xmlGenericErrorContext,
5240 "HPP: Parsing Reference\n");
5241#endif
5242 /* TODO: check generation of subtrees if noent !!! */
5243 htmlParseReference(ctxt);
5244 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005245 /*
5246 * check that the text sequence is complete
5247 * before handing out the data to the parser
5248 * to avoid problems with erroneous end of
5249 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005250 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005251 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005252 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0, 1) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005253 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005254 ctxt->checkIndex = 0;
5255#ifdef DEBUG_PUSH
5256 xmlGenericError(xmlGenericErrorContext,
5257 "HPP: Parsing char data\n");
5258#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005259 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005260 }
5261 }
5262 if (cons == ctxt->nbChars) {
5263 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005264 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5265 "detected an error in element content\n",
5266 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005267 }
5268 NEXT;
5269 break;
5270 }
5271
5272 break;
5273 }
5274 case XML_PARSER_END_TAG:
5275 if (avail < 2)
5276 goto done;
5277 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005278 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005279 goto done;
5280 htmlParseEndTag(ctxt);
5281 if (ctxt->nameNr == 0) {
5282 ctxt->instate = XML_PARSER_EPILOG;
5283 } else {
5284 ctxt->instate = XML_PARSER_CONTENT;
5285 }
5286 ctxt->checkIndex = 0;
5287#ifdef DEBUG_PUSH
5288 xmlGenericError(xmlGenericErrorContext,
5289 "HPP: entering CONTENT\n");
5290#endif
5291 break;
5292 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005293 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5294 "HPP: internal error, state == CDATA\n",
5295 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005296 ctxt->instate = XML_PARSER_CONTENT;
5297 ctxt->checkIndex = 0;
5298#ifdef DEBUG_PUSH
5299 xmlGenericError(xmlGenericErrorContext,
5300 "HPP: entering CONTENT\n");
5301#endif
5302 break;
5303 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005304 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5305 "HPP: internal error, state == DTD\n",
5306 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005307 ctxt->instate = XML_PARSER_CONTENT;
5308 ctxt->checkIndex = 0;
5309#ifdef DEBUG_PUSH
5310 xmlGenericError(xmlGenericErrorContext,
5311 "HPP: entering CONTENT\n");
5312#endif
5313 break;
5314 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005315 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5316 "HPP: internal error, state == COMMENT\n",
5317 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005318 ctxt->instate = XML_PARSER_CONTENT;
5319 ctxt->checkIndex = 0;
5320#ifdef DEBUG_PUSH
5321 xmlGenericError(xmlGenericErrorContext,
5322 "HPP: entering CONTENT\n");
5323#endif
5324 break;
5325 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005326 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5327 "HPP: internal error, state == PI\n",
5328 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005329 ctxt->instate = XML_PARSER_CONTENT;
5330 ctxt->checkIndex = 0;
5331#ifdef DEBUG_PUSH
5332 xmlGenericError(xmlGenericErrorContext,
5333 "HPP: entering CONTENT\n");
5334#endif
5335 break;
5336 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005337 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5338 "HPP: internal error, state == ENTITY_DECL\n",
5339 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005340 ctxt->instate = XML_PARSER_CONTENT;
5341 ctxt->checkIndex = 0;
5342#ifdef DEBUG_PUSH
5343 xmlGenericError(xmlGenericErrorContext,
5344 "HPP: entering CONTENT\n");
5345#endif
5346 break;
5347 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005348 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5349 "HPP: internal error, state == ENTITY_VALUE\n",
5350 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005351 ctxt->instate = XML_PARSER_CONTENT;
5352 ctxt->checkIndex = 0;
5353#ifdef DEBUG_PUSH
5354 xmlGenericError(xmlGenericErrorContext,
5355 "HPP: entering DTD\n");
5356#endif
5357 break;
5358 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005359 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5360 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5361 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005362 ctxt->instate = XML_PARSER_START_TAG;
5363 ctxt->checkIndex = 0;
5364#ifdef DEBUG_PUSH
5365 xmlGenericError(xmlGenericErrorContext,
5366 "HPP: entering START_TAG\n");
5367#endif
5368 break;
5369 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005370 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5371 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5372 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005373 ctxt->instate = XML_PARSER_CONTENT;
5374 ctxt->checkIndex = 0;
5375#ifdef DEBUG_PUSH
5376 xmlGenericError(xmlGenericErrorContext,
5377 "HPP: entering CONTENT\n");
5378#endif
5379 break;
5380 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005381 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5382 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5383 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005384 ctxt->instate = XML_PARSER_CONTENT;
5385 ctxt->checkIndex = 0;
5386#ifdef DEBUG_PUSH
5387 xmlGenericError(xmlGenericErrorContext,
5388 "HPP: entering CONTENT\n");
5389#endif
5390 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005391 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005392 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5393 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5394 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005395 ctxt->instate = XML_PARSER_CONTENT;
5396 ctxt->checkIndex = 0;
5397#ifdef DEBUG_PUSH
5398 xmlGenericError(xmlGenericErrorContext,
5399 "HPP: entering CONTENT\n");
5400#endif
5401 break;
5402
Owen Taylor3473f882001-02-23 17:55:21 +00005403 }
5404 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005405done:
Owen Taylor3473f882001-02-23 17:55:21 +00005406 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005407 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005408 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005409 /*
5410 * SAX: end of the document processing.
5411 */
5412 ctxt->instate = XML_PARSER_EOF;
5413 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5414 ctxt->sax->endDocument(ctxt->userData);
5415 }
5416 }
5417 if ((ctxt->myDoc != NULL) &&
5418 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5419 (ctxt->instate == XML_PARSER_EPILOG))) {
5420 xmlDtdPtr dtd;
5421 dtd = xmlGetIntSubset(ctxt->myDoc);
5422 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02005423 ctxt->myDoc->intSubset =
5424 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005425 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5426 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5427 }
5428#ifdef DEBUG_PUSH
5429 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5430#endif
5431 return(ret);
5432}
5433
5434/**
Owen Taylor3473f882001-02-23 17:55:21 +00005435 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005436 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005437 * @chunk: an char array
5438 * @size: the size in byte of the chunk
5439 * @terminate: last chunk indicator
5440 *
5441 * Parse a Chunk of memory
5442 *
5443 * Returns zero if no error, the xmlParserErrors otherwise.
5444 */
5445int
5446htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5447 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005448 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5449 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5450 "htmlParseChunk: context error\n", NULL, NULL);
5451 return(XML_ERR_INTERNAL_ERROR);
5452 }
Owen Taylor3473f882001-02-23 17:55:21 +00005453 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5454 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5455 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5456 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005457 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02005458
5459 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00005460 if (res < 0) {
5461 ctxt->errNo = XML_PARSER_EOF;
5462 ctxt->disableSAX = 1;
5463 return (XML_PARSER_EOF);
5464 }
Owen Taylor3473f882001-02-23 17:55:21 +00005465 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5466 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005467 ctxt->input->end =
5468 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005469#ifdef DEBUG_PUSH
5470 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5471#endif
5472
Daniel Veillard14f752c2003-08-09 11:44:50 +00005473#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005474 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5475 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005476#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005477 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005478 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5479 xmlParserInputBufferPtr in = ctxt->input->buf;
5480 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5481 (in->raw != NULL)) {
5482 int nbchars;
Daniel Veillarde77db162009-08-22 11:32:38 +02005483
Daniel Veillard14f752c2003-08-09 11:44:50 +00005484 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5485 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005486 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5487 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005488 return(XML_ERR_INVALID_ENCODING);
5489 }
5490 }
5491 }
Owen Taylor3473f882001-02-23 17:55:21 +00005492 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005493 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005494 if (terminate) {
5495 if ((ctxt->instate != XML_PARSER_EOF) &&
5496 (ctxt->instate != XML_PARSER_EPILOG) &&
5497 (ctxt->instate != XML_PARSER_MISC)) {
5498 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005499 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02005500 }
Owen Taylor3473f882001-02-23 17:55:21 +00005501 if (ctxt->instate != XML_PARSER_EOF) {
5502 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5503 ctxt->sax->endDocument(ctxt->userData);
5504 }
5505 ctxt->instate = XML_PARSER_EOF;
5506 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005507 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00005508}
5509
5510/************************************************************************
5511 * *
5512 * User entry points *
5513 * *
5514 ************************************************************************/
5515
5516/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005517 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005518 * @sax: a SAX handler
5519 * @user_data: The user data returned on SAX callbacks
5520 * @chunk: a pointer to an array of chars
5521 * @size: number of chars in the array
5522 * @filename: an optional file name or URI
5523 * @enc: an optional encoding
5524 *
5525 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005526 * The value of @filename is used for fetching external entities
5527 * and error/warning reports.
5528 *
5529 * Returns the new parser context or NULL
5530 */
5531htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005532htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00005533 const char *chunk, int size, const char *filename,
5534 xmlCharEncoding enc) {
5535 htmlParserCtxtPtr ctxt;
5536 htmlParserInputPtr inputStream;
5537 xmlParserInputBufferPtr buf;
5538
Daniel Veillardd0463562001-10-13 09:15:48 +00005539 xmlInitParser();
5540
Owen Taylor3473f882001-02-23 17:55:21 +00005541 buf = xmlAllocParserInputBuffer(enc);
5542 if (buf == NULL) return(NULL);
5543
Daniel Veillardf403d292003-10-05 13:51:35 +00005544 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005545 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005546 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005547 return(NULL);
5548 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005549 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5550 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005551 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005552 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005553 xmlFree(ctxt->sax);
5554 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5555 if (ctxt->sax == NULL) {
5556 xmlFree(buf);
5557 xmlFree(ctxt);
5558 return(NULL);
5559 }
5560 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5561 if (user_data != NULL)
5562 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02005563 }
Owen Taylor3473f882001-02-23 17:55:21 +00005564 if (filename == NULL) {
5565 ctxt->directory = NULL;
5566 } else {
5567 ctxt->directory = xmlParserGetDirectory(filename);
5568 }
5569
5570 inputStream = htmlNewInputStream(ctxt);
5571 if (inputStream == NULL) {
5572 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005573 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005574 return(NULL);
5575 }
5576
5577 if (filename == NULL)
5578 inputStream->filename = NULL;
5579 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005580 inputStream->filename = (char *)
5581 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005582 inputStream->buf = buf;
5583 inputStream->base = inputStream->buf->buffer->content;
5584 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillarde77db162009-08-22 11:32:38 +02005585 inputStream->end =
Daniel Veillard5f704af2003-03-05 10:01:43 +00005586 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005587
5588 inputPush(ctxt, inputStream);
5589
5590 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02005591 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005592 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5593 int cur = ctxt->input->cur - ctxt->input->base;
5594
Daniel Veillarde77db162009-08-22 11:32:38 +02005595 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005596
5597 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5598 ctxt->input->cur = ctxt->input->base + cur;
5599 ctxt->input->end =
5600 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005601#ifdef DEBUG_PUSH
5602 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5603#endif
5604 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005605 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005606
5607 return(ctxt);
5608}
William M. Brack21e4ef22005-01-02 09:53:13 +00005609#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005610
5611/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005612 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005613 * @cur: a pointer to an array of xmlChar
5614 * @encoding: a free form C string describing the HTML document encoding, or NULL
5615 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005616 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005617 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005618 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5619 * to handle parse events. If sax is NULL, fallback to the default DOM
5620 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005621 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005622 * Returns the resulting document tree unless SAX is NULL or the document is
5623 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005624 */
5625
5626htmlDocPtr
5627htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5628 htmlDocPtr ret;
5629 htmlParserCtxtPtr ctxt;
5630
Daniel Veillardd0463562001-10-13 09:15:48 +00005631 xmlInitParser();
5632
Owen Taylor3473f882001-02-23 17:55:21 +00005633 if (cur == NULL) return(NULL);
5634
5635
5636 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5637 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02005638 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005639 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005640 ctxt->sax = sax;
5641 ctxt->userData = userData;
5642 }
5643
5644 htmlParseDocument(ctxt);
5645 ret = ctxt->myDoc;
5646 if (sax != NULL) {
5647 ctxt->sax = NULL;
5648 ctxt->userData = NULL;
5649 }
5650 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005651
Owen Taylor3473f882001-02-23 17:55:21 +00005652 return(ret);
5653}
5654
5655/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005656 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005657 * @cur: a pointer to an array of xmlChar
5658 * @encoding: a free form C string describing the HTML document encoding, or NULL
5659 *
5660 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005661 *
Owen Taylor3473f882001-02-23 17:55:21 +00005662 * Returns the resulting document tree
5663 */
5664
5665htmlDocPtr
5666htmlParseDoc(xmlChar *cur, const char *encoding) {
5667 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5668}
5669
5670
5671/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005672 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005673 * @filename: the filename
5674 * @encoding: a free form C string describing the HTML document encoding, or NULL
5675 *
Daniel Veillarde77db162009-08-22 11:32:38 +02005676 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00005677 * Automatic support for ZLIB/Compress compressed document is provided
5678 * by default if found at compile-time.
5679 *
5680 * Returns the new parser context or NULL
5681 */
5682htmlParserCtxtPtr
5683htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5684{
5685 htmlParserCtxtPtr ctxt;
5686 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005687 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005688 /* htmlCharEncoding enc; */
5689 xmlChar *content, *content_line = (xmlChar *) "charset=";
5690
Daniel Veillarda03e3652004-11-02 18:45:30 +00005691 if (filename == NULL)
5692 return(NULL);
5693
Daniel Veillardf403d292003-10-05 13:51:35 +00005694 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005695 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005696 return(NULL);
5697 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005698 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5699 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005700#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005701 if (xmlDefaultSAXHandler.error != NULL) {
5702 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5703 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005704#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005705 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005706 return(NULL);
5707 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005708
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005709 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5710 xmlFree(canonicFilename);
5711 if (inputStream == NULL) {
5712 xmlFreeParserCtxt(ctxt);
5713 return(NULL);
5714 }
Owen Taylor3473f882001-02-23 17:55:21 +00005715
5716 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005717
Owen Taylor3473f882001-02-23 17:55:21 +00005718 /* set encoding */
5719 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005720 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Daniel Veillarde77db162009-08-22 11:32:38 +02005721 if (content) {
Owen Taylor3473f882001-02-23 17:55:21 +00005722 strcpy ((char *)content, (char *)content_line);
5723 strcat ((char *)content, (char *)encoding);
5724 htmlCheckEncoding (ctxt, content);
5725 xmlFree (content);
5726 }
5727 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005728
Owen Taylor3473f882001-02-23 17:55:21 +00005729 return(ctxt);
5730}
5731
5732/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005733 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005734 * @filename: the filename
5735 * @encoding: a free form C string describing the HTML document encoding, or NULL
5736 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005737 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005738 *
5739 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5740 * compressed document is provided by default if found at compile-time.
5741 * It use the given SAX function block to handle the parsing callback.
5742 * If sax is NULL, fallback to the default DOM tree building routines.
5743 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005744 * Returns the resulting document tree unless SAX is NULL or the document is
5745 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005746 */
5747
5748htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005749htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00005750 void *userData) {
5751 htmlDocPtr ret;
5752 htmlParserCtxtPtr ctxt;
5753 htmlSAXHandlerPtr oldsax = NULL;
5754
Daniel Veillardd0463562001-10-13 09:15:48 +00005755 xmlInitParser();
5756
Owen Taylor3473f882001-02-23 17:55:21 +00005757 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5758 if (ctxt == NULL) return(NULL);
5759 if (sax != NULL) {
5760 oldsax = ctxt->sax;
5761 ctxt->sax = sax;
5762 ctxt->userData = userData;
5763 }
5764
5765 htmlParseDocument(ctxt);
5766
5767 ret = ctxt->myDoc;
5768 if (sax != NULL) {
5769 ctxt->sax = oldsax;
5770 ctxt->userData = NULL;
5771 }
5772 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005773
Owen Taylor3473f882001-02-23 17:55:21 +00005774 return(ret);
5775}
5776
5777/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005778 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005779 * @filename: the filename
5780 * @encoding: a free form C string describing the HTML document encoding, or NULL
5781 *
5782 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5783 * compressed document is provided by default if found at compile-time.
5784 *
5785 * Returns the resulting document tree
5786 */
5787
5788htmlDocPtr
5789htmlParseFile(const char *filename, const char *encoding) {
5790 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5791}
5792
5793/**
5794 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02005795 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00005796 *
5797 * Set and return the previous value for handling HTML omitted tags.
5798 *
5799 * Returns the last value for 0 for no handling, 1 for auto insertion.
5800 */
5801
5802int
5803htmlHandleOmittedElem(int val) {
5804 int old = htmlOmittedDefaultValue;
5805
5806 htmlOmittedDefaultValue = val;
5807 return(old);
5808}
5809
Daniel Veillard930dfb62003-02-05 10:17:38 +00005810/**
5811 * htmlElementAllowedHere:
5812 * @parent: HTML parent element
5813 * @elt: HTML element
5814 *
5815 * Checks whether an HTML element may be a direct child of a parent element.
5816 * Note - doesn't check for deprecated elements
5817 *
5818 * Returns 1 if allowed; 0 otherwise.
5819 */
5820int
5821htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5822 const char** p ;
5823
5824 if ( ! elt || ! parent || ! parent->subelts )
5825 return 0 ;
5826
5827 for ( p = parent->subelts; *p; ++p )
5828 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5829 return 1 ;
5830
5831 return 0 ;
5832}
5833/**
5834 * htmlElementStatusHere:
5835 * @parent: HTML parent element
5836 * @elt: HTML element
5837 *
5838 * Checks whether an HTML element may be a direct child of a parent element.
5839 * and if so whether it is valid or deprecated.
5840 *
5841 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5842 */
5843htmlStatus
5844htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5845 if ( ! parent || ! elt )
5846 return HTML_INVALID ;
5847 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5848 return HTML_INVALID ;
5849
5850 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5851}
5852/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005853 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005854 * @elt: HTML element
5855 * @attr: HTML attribute
5856 * @legacy: whether to allow deprecated attributes
5857 *
5858 * Checks whether an attribute is valid for an element
5859 * Has full knowledge of Required and Deprecated attributes
5860 *
5861 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5862 */
5863htmlStatus
5864htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5865 const char** p ;
5866
5867 if ( !elt || ! attr )
5868 return HTML_INVALID ;
5869
5870 if ( elt->attrs_req )
5871 for ( p = elt->attrs_req; *p; ++p)
5872 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5873 return HTML_REQUIRED ;
5874
5875 if ( elt->attrs_opt )
5876 for ( p = elt->attrs_opt; *p; ++p)
5877 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5878 return HTML_VALID ;
5879
5880 if ( legacy && elt->attrs_depr )
5881 for ( p = elt->attrs_depr; *p; ++p)
5882 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5883 return HTML_DEPRECATED ;
5884
5885 return HTML_INVALID ;
5886}
5887/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005888 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005889 * @node: an htmlNodePtr in a tree
5890 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005891 * for Element nodes)
5892 *
5893 * Checks whether the tree node is valid. Experimental (the author
5894 * only uses the HTML enhancements in a SAX parser)
5895 *
5896 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5897 * legacy allowed) or htmlElementStatusHere (otherwise).
5898 * for Attribute nodes, a return from htmlAttrAllowed
5899 * for other nodes, HTML_NA (no checks performed)
5900 */
5901htmlStatus
5902htmlNodeStatus(const htmlNodePtr node, int legacy) {
5903 if ( ! node )
5904 return HTML_INVALID ;
5905
5906 switch ( node->type ) {
5907 case XML_ELEMENT_NODE:
5908 return legacy
5909 ? ( htmlElementAllowedHere (
5910 htmlTagLookup(node->parent->name) , node->name
5911 ) ? HTML_VALID : HTML_INVALID )
5912 : htmlElementStatusHere(
5913 htmlTagLookup(node->parent->name) ,
5914 htmlTagLookup(node->name) )
5915 ;
5916 case XML_ATTRIBUTE_NODE:
5917 return htmlAttrAllowed(
5918 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5919 default: return HTML_NA ;
5920 }
5921}
Daniel Veillard9475a352003-09-26 12:47:50 +00005922/************************************************************************
5923 * *
5924 * New set (2.6.0) of simpler and more flexible APIs *
5925 * *
5926 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A NULL @str is ignored.
 * The expansion is wrapped in do { } while (0) so that the macro behaves
 * as a single statement, including when used as the body of an if/else.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5938
5939/**
5940 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005941 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005942 *
5943 * Reset a parser context
5944 */
5945void
5946htmlCtxtReset(htmlParserCtxtPtr ctxt)
5947{
5948 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005949 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02005950
Daniel Veillarda03e3652004-11-02 18:45:30 +00005951 if (ctxt == NULL)
5952 return;
5953
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005954 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00005955 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005956
5957 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5958 xmlFreeInputStream(input);
5959 }
5960 ctxt->inputNr = 0;
5961 ctxt->input = NULL;
5962
5963 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005964 if (ctxt->spaceTab != NULL) {
5965 ctxt->spaceTab[0] = -1;
5966 ctxt->space = &ctxt->spaceTab[0];
5967 } else {
5968 ctxt->space = NULL;
5969 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005970
5971
5972 ctxt->nodeNr = 0;
5973 ctxt->node = NULL;
5974
5975 ctxt->nameNr = 0;
5976 ctxt->name = NULL;
5977
5978 DICT_FREE(ctxt->version);
5979 ctxt->version = NULL;
5980 DICT_FREE(ctxt->encoding);
5981 ctxt->encoding = NULL;
5982 DICT_FREE(ctxt->directory);
5983 ctxt->directory = NULL;
5984 DICT_FREE(ctxt->extSubURI);
5985 ctxt->extSubURI = NULL;
5986 DICT_FREE(ctxt->extSubSystem);
5987 ctxt->extSubSystem = NULL;
5988 if (ctxt->myDoc != NULL)
5989 xmlFreeDoc(ctxt->myDoc);
5990 ctxt->myDoc = NULL;
5991
5992 ctxt->standalone = -1;
5993 ctxt->hasExternalSubset = 0;
5994 ctxt->hasPErefs = 0;
5995 ctxt->html = 1;
5996 ctxt->external = 0;
5997 ctxt->instate = XML_PARSER_START;
5998 ctxt->token = 0;
5999
6000 ctxt->wellFormed = 1;
6001 ctxt->nsWellFormed = 1;
6002 ctxt->valid = 1;
6003 ctxt->vctxt.userData = ctxt;
6004 ctxt->vctxt.error = xmlParserValidityError;
6005 ctxt->vctxt.warning = xmlParserValidityWarning;
6006 ctxt->record_info = 0;
6007 ctxt->nbChars = 0;
6008 ctxt->checkIndex = 0;
6009 ctxt->inSubset = 0;
6010 ctxt->errNo = XML_ERR_OK;
6011 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006012 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006013 ctxt->catalogs = NULL;
6014 xmlInitNodeInfoSeq(&ctxt->node_seq);
6015
6016 if (ctxt->attsDefault != NULL) {
6017 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6018 ctxt->attsDefault = NULL;
6019 }
6020 if (ctxt->attsSpecial != NULL) {
6021 xmlHashFree(ctxt->attsSpecial, NULL);
6022 ctxt->attsSpecial = NULL;
6023 }
6024}
6025
6026/**
6027 * htmlCtxtUseOptions:
6028 * @ctxt: an HTML parser context
6029 * @options: a combination of htmlParserOption(s)
6030 *
6031 * Applies the options to the parser context
6032 *
6033 * Returns 0 in case of success, the set of unknown or unimplemented options
6034 * in case of error.
6035 */
6036int
6037htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6038{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006039 if (ctxt == NULL)
6040 return(-1);
6041
Daniel Veillard9475a352003-09-26 12:47:50 +00006042 if (options & HTML_PARSE_NOWARNING) {
6043 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006044 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006045 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006046 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006047 }
6048 if (options & HTML_PARSE_NOERROR) {
6049 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006050 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006051 ctxt->sax->fatalError = NULL;
6052 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006053 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006054 }
6055 if (options & HTML_PARSE_PEDANTIC) {
6056 ctxt->pedantic = 1;
6057 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006058 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006059 } else
6060 ctxt->pedantic = 0;
6061 if (options & XML_PARSE_NOBLANKS) {
6062 ctxt->keepBlanks = 0;
6063 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6064 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006065 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006066 } else
6067 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006068 if (options & HTML_PARSE_RECOVER) {
6069 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006070 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006071 } else
6072 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006073 if (options & HTML_PARSE_COMPACT) {
6074 ctxt->options |= HTML_PARSE_COMPACT;
6075 options -= HTML_PARSE_COMPACT;
6076 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006077 if (options & XML_PARSE_HUGE) {
6078 ctxt->options |= XML_PARSE_HUGE;
6079 options -= XML_PARSE_HUGE;
6080 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006081 ctxt->dictNames = 0;
6082 return (options);
6083}
6084
6085/**
6086 * htmlDoRead:
6087 * @ctxt: an HTML parser context
6088 * @URL: the base URL to use for the document
6089 * @encoding: the document encoding, or NULL
6090 * @options: a combination of htmlParserOption(s)
6091 * @reuse: keep the context for reuse
6092 *
6093 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006094 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006095 * Returns the resulting document tree or NULL
6096 */
6097static htmlDocPtr
6098htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6099 int options, int reuse)
6100{
6101 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006102
Daniel Veillard9475a352003-09-26 12:47:50 +00006103 htmlCtxtUseOptions(ctxt, options);
6104 ctxt->html = 1;
6105 if (encoding != NULL) {
6106 xmlCharEncodingHandlerPtr hdlr;
6107
6108 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006109 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006110 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006111 if (ctxt->input->encoding != NULL)
6112 xmlFree((xmlChar *) ctxt->input->encoding);
6113 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6114 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006115 }
6116 if ((URL != NULL) && (ctxt->input != NULL) &&
6117 (ctxt->input->filename == NULL))
6118 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6119 htmlParseDocument(ctxt);
6120 ret = ctxt->myDoc;
6121 ctxt->myDoc = NULL;
6122 if (!reuse) {
6123 if ((ctxt->dictNames) &&
6124 (ret != NULL) &&
6125 (ret->dict == ctxt->dict))
6126 ctxt->dict = NULL;
6127 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006128 }
6129 return (ret);
6130}
6131
6132/**
6133 * htmlReadDoc:
6134 * @cur: a pointer to a zero terminated string
6135 * @URL: the base URL to use for the document
6136 * @encoding: the document encoding, or NULL
6137 * @options: a combination of htmlParserOption(s)
6138 *
6139 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006140 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006141 * Returns the resulting document tree
6142 */
6143htmlDocPtr
6144htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6145{
6146 htmlParserCtxtPtr ctxt;
6147
6148 if (cur == NULL)
6149 return (NULL);
6150
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006151 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006152 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006153 if (ctxt == NULL)
6154 return (NULL);
6155 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6156}
6157
6158/**
6159 * htmlReadFile:
6160 * @filename: a file or URL
6161 * @encoding: the document encoding, or NULL
6162 * @options: a combination of htmlParserOption(s)
6163 *
6164 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006165 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006166 * Returns the resulting document tree
6167 */
6168htmlDocPtr
6169htmlReadFile(const char *filename, const char *encoding, int options)
6170{
6171 htmlParserCtxtPtr ctxt;
6172
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006173 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006174 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6175 if (ctxt == NULL)
6176 return (NULL);
6177 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6178}
6179
6180/**
6181 * htmlReadMemory:
6182 * @buffer: a pointer to a char array
6183 * @size: the size of the array
6184 * @URL: the base URL to use for the document
6185 * @encoding: the document encoding, or NULL
6186 * @options: a combination of htmlParserOption(s)
6187 *
6188 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006189 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006190 * Returns the resulting document tree
6191 */
6192htmlDocPtr
6193htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6194{
6195 htmlParserCtxtPtr ctxt;
6196
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006197 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006198 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6199 if (ctxt == NULL)
6200 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006201 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006202 if (ctxt->sax != NULL)
6203 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006204 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6205}
6206
6207/**
6208 * htmlReadFd:
6209 * @fd: an open file descriptor
6210 * @URL: the base URL to use for the document
6211 * @encoding: the document encoding, or NULL
6212 * @options: a combination of htmlParserOption(s)
6213 *
6214 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006215 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006216 * Returns the resulting document tree
6217 */
6218htmlDocPtr
6219htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6220{
6221 htmlParserCtxtPtr ctxt;
6222 xmlParserInputBufferPtr input;
6223 xmlParserInputPtr stream;
6224
6225 if (fd < 0)
6226 return (NULL);
6227
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006228 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006229 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6230 if (input == NULL)
6231 return (NULL);
6232 ctxt = xmlNewParserCtxt();
6233 if (ctxt == NULL) {
6234 xmlFreeParserInputBuffer(input);
6235 return (NULL);
6236 }
6237 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6238 if (stream == NULL) {
6239 xmlFreeParserInputBuffer(input);
6240 xmlFreeParserCtxt(ctxt);
6241 return (NULL);
6242 }
6243 inputPush(ctxt, stream);
6244 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6245}
6246
6247/**
6248 * htmlReadIO:
6249 * @ioread: an I/O read function
6250 * @ioclose: an I/O close function
6251 * @ioctx: an I/O handler
6252 * @URL: the base URL to use for the document
6253 * @encoding: the document encoding, or NULL
6254 * @options: a combination of htmlParserOption(s)
6255 *
6256 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006257 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006258 * Returns the resulting document tree
6259 */
6260htmlDocPtr
6261htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6262 void *ioctx, const char *URL, const char *encoding, int options)
6263{
6264 htmlParserCtxtPtr ctxt;
6265 xmlParserInputBufferPtr input;
6266 xmlParserInputPtr stream;
6267
6268 if (ioread == NULL)
6269 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006270 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006271
6272 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6273 XML_CHAR_ENCODING_NONE);
6274 if (input == NULL)
6275 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006276 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006277 if (ctxt == NULL) {
6278 xmlFreeParserInputBuffer(input);
6279 return (NULL);
6280 }
6281 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6282 if (stream == NULL) {
6283 xmlFreeParserInputBuffer(input);
6284 xmlFreeParserCtxt(ctxt);
6285 return (NULL);
6286 }
6287 inputPush(ctxt, stream);
6288 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6289}
6290
6291/**
6292 * htmlCtxtReadDoc:
6293 * @ctxt: an HTML parser context
6294 * @cur: a pointer to a zero terminated string
6295 * @URL: the base URL to use for the document
6296 * @encoding: the document encoding, or NULL
6297 * @options: a combination of htmlParserOption(s)
6298 *
6299 * parse an XML in-memory document and build a tree.
6300 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006301 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006302 * Returns the resulting document tree
6303 */
6304htmlDocPtr
6305htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6306 const char *URL, const char *encoding, int options)
6307{
6308 xmlParserInputPtr stream;
6309
6310 if (cur == NULL)
6311 return (NULL);
6312 if (ctxt == NULL)
6313 return (NULL);
6314
6315 htmlCtxtReset(ctxt);
6316
6317 stream = xmlNewStringInputStream(ctxt, cur);
6318 if (stream == NULL) {
6319 return (NULL);
6320 }
6321 inputPush(ctxt, stream);
6322 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6323}
6324
6325/**
6326 * htmlCtxtReadFile:
6327 * @ctxt: an HTML parser context
6328 * @filename: a file or URL
6329 * @encoding: the document encoding, or NULL
6330 * @options: a combination of htmlParserOption(s)
6331 *
6332 * parse an XML file from the filesystem or the network.
6333 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006334 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006335 * Returns the resulting document tree
6336 */
6337htmlDocPtr
6338htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6339 const char *encoding, int options)
6340{
6341 xmlParserInputPtr stream;
6342
6343 if (filename == NULL)
6344 return (NULL);
6345 if (ctxt == NULL)
6346 return (NULL);
6347
6348 htmlCtxtReset(ctxt);
6349
Daniel Veillard29614c72004-11-26 10:47:26 +00006350 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006351 if (stream == NULL) {
6352 return (NULL);
6353 }
6354 inputPush(ctxt, stream);
6355 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6356}
6357
6358/**
6359 * htmlCtxtReadMemory:
6360 * @ctxt: an HTML parser context
6361 * @buffer: a pointer to a char array
6362 * @size: the size of the array
6363 * @URL: the base URL to use for the document
6364 * @encoding: the document encoding, or NULL
6365 * @options: a combination of htmlParserOption(s)
6366 *
6367 * parse an XML in-memory document and build a tree.
6368 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006369 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006370 * Returns the resulting document tree
6371 */
6372htmlDocPtr
6373htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6374 const char *URL, const char *encoding, int options)
6375{
6376 xmlParserInputBufferPtr input;
6377 xmlParserInputPtr stream;
6378
6379 if (ctxt == NULL)
6380 return (NULL);
6381 if (buffer == NULL)
6382 return (NULL);
6383
6384 htmlCtxtReset(ctxt);
6385
6386 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6387 if (input == NULL) {
6388 return(NULL);
6389 }
6390
6391 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6392 if (stream == NULL) {
6393 xmlFreeParserInputBuffer(input);
6394 return(NULL);
6395 }
6396
6397 inputPush(ctxt, stream);
6398 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6399}
6400
6401/**
6402 * htmlCtxtReadFd:
6403 * @ctxt: an HTML parser context
6404 * @fd: an open file descriptor
6405 * @URL: the base URL to use for the document
6406 * @encoding: the document encoding, or NULL
6407 * @options: a combination of htmlParserOption(s)
6408 *
6409 * parse an XML from a file descriptor and build a tree.
6410 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006411 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006412 * Returns the resulting document tree
6413 */
6414htmlDocPtr
6415htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6416 const char *URL, const char *encoding, int options)
6417{
6418 xmlParserInputBufferPtr input;
6419 xmlParserInputPtr stream;
6420
6421 if (fd < 0)
6422 return (NULL);
6423 if (ctxt == NULL)
6424 return (NULL);
6425
6426 htmlCtxtReset(ctxt);
6427
6428
6429 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6430 if (input == NULL)
6431 return (NULL);
6432 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6433 if (stream == NULL) {
6434 xmlFreeParserInputBuffer(input);
6435 return (NULL);
6436 }
6437 inputPush(ctxt, stream);
6438 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6439}
6440
6441/**
6442 * htmlCtxtReadIO:
6443 * @ctxt: an HTML parser context
6444 * @ioread: an I/O read function
6445 * @ioclose: an I/O close function
6446 * @ioctx: an I/O handler
6447 * @URL: the base URL to use for the document
6448 * @encoding: the document encoding, or NULL
6449 * @options: a combination of htmlParserOption(s)
6450 *
6451 * parse an HTML document from I/O functions and source and build a tree.
6452 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006453 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006454 * Returns the resulting document tree
6455 */
6456htmlDocPtr
6457htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6458 xmlInputCloseCallback ioclose, void *ioctx,
6459 const char *URL,
6460 const char *encoding, int options)
6461{
6462 xmlParserInputBufferPtr input;
6463 xmlParserInputPtr stream;
6464
6465 if (ioread == NULL)
6466 return (NULL);
6467 if (ctxt == NULL)
6468 return (NULL);
6469
6470 htmlCtxtReset(ctxt);
6471
6472 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6473 XML_CHAR_ENCODING_NONE);
6474 if (input == NULL)
6475 return (NULL);
6476 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6477 if (stream == NULL) {
6478 xmlFreeParserInputBuffer(input);
6479 return (NULL);
6480 }
6481 inputPush(ctxt, stream);
6482 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6483}
6484
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006485#define bottom_HTMLparser
6486#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006487#endif /* LIBXML_HTML_ENABLED */