blob: 63388109033af7cedb3846d9f7044c753a78cf5e [file] [log] [blame]
/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
/* Size limits used by the HTML parser. */
#define HTML_MAX_NAMELEN            1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE     100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020062 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000063 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200150 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
166 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000168 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000169 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000173 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 return (0);
175 }
176 }
177 ctxt->nameTab[ctxt->nameNr] = value;
178 ctxt->name = value;
179 return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000189static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000192 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000193
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000195 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000196 ctxt->nameNr--;
197 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000198 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000199 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else
202 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000204 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000205 return (ret);
206}
Owen Taylor3473f882001-02-23 17:55:21 +0000207
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. They all expect a variable named `ctxt' (the parser
 * context) to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the current xmlChar value as an int. This should be used
 *            internally by the parser only to compare to ASCII values
 *            otherwise it would break when running with UTF-8 encoding.
 *   NXT(n)   the n'th next xmlChar. Same as CUR it should be used only
 *            to compare on ASCII based substring.
 *   UPPER    the current xmlChar converted to uppercase.
 *   UPP(n)   the n'th next xmlChar converted to uppercase. Same as CUR
 *            it should be used only to compare on ASCII based substring.
 *   SKIP(n)  skip n xmlChar; the column is advanced by n and the line
 *            counter is not touched, so it must only be used to skip
 *            ASCII defined strings without newlines within the parser.
 *   RAW      -1 when a token is pending in the context, the current
 *            xmlChar otherwise.
 *
 * Clean macros, not dependent on an ASCII context
 *
 *   CURRENT      the current xmlChar value as an int.
 *   NEXT         skip to the next character using xmlNextChar().
 *   NEXTL(l)     skip the current character which is l xmlChars long,
 *                maintaining the line and column counters.
 *   CUR_CHAR(l)  the current character decoded by htmlCurrentChar(),
 *                its length is stored in l.
 *   COPY_BUF(l,b,i,v)  append the character v, l xmlChars long, to the
 *                buffer b at index i and advance i.
 *
 * SHRINK, GROW, NEXTL and COPY_BUF expand to a full statement and must be
 * followed by a semicolon.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), \
     ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

/* Drop consumed input once enough has been parsed and little is left. */
#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
	xmlParserInputShrink(ctxt->input);				\
  } while (0)

/* Read more input when not in progressive mode and the buffer runs low. */
#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));				\
  } while (0)
289
290/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200291 * htmlFindEncoding:
292 * @the HTML parser context
293 *
294 * Ty to find and encoding in the current data available in the input
295 * buffer this is needed to try to switch to the proper encoding when
296 * one face a character error.
297 * That's an heuristic, since it's operating outside of parsing it could
298 * try to use a meta which had been commented out, that's the reason it
299 * should only be used in case of error, not as a default.
300 *
301 * Returns an encoding string or NULL if not found, the string need to
302 * be freed
303 */
304static xmlChar *
305htmlFindEncoding(xmlParserCtxtPtr ctxt) {
306 const xmlChar *start, *cur, *end;
307
308 if ((ctxt == NULL) || (ctxt->input == NULL) ||
309 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
310 (ctxt->input->buf->encoder != NULL))
311 return(NULL);
312 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
313 return(NULL);
314
315 start = ctxt->input->cur;
316 end = ctxt->input->end;
317 /* we also expect the input buffer to be zero terminated */
318 if (*end != 0)
319 return(NULL);
320
321 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
322 if (cur == NULL)
323 return(NULL);
324 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
325 if (cur == NULL)
326 return(NULL);
327 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
328 if (cur == NULL)
329 return(NULL);
330 cur += 8;
331 start = cur;
332 while (((*cur >= 'A') && (*cur <= 'Z')) ||
333 ((*cur >= 'a') && (*cur <= 'z')) ||
334 ((*cur >= '0') && (*cur <= '9')) ||
335 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
336 cur++;
337 if (cur == start)
338 return(NULL);
339 return(xmlStrndup(start, cur - start));
340}
341
342/**
Owen Taylor3473f882001-02-23 17:55:21 +0000343 * htmlCurrentChar:
344 * @ctxt: the HTML parser context
345 * @len: pointer to the length of the char read
346 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000347 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000348 * bytes in the input buffer. Implement the end of line normalization:
349 * 2.11 End-of-Line Handling
350 * If the encoding is unspecified, in the case we find an ISO-Latin-1
351 * char, then the encoding converter is plugged in automatically.
352 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000353 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000354 */
355
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000356static int
Owen Taylor3473f882001-02-23 17:55:21 +0000357htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
358 if (ctxt->instate == XML_PARSER_EOF)
359 return(0);
360
361 if (ctxt->token != 0) {
362 *len = 0;
363 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200364 }
Owen Taylor3473f882001-02-23 17:55:21 +0000365 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
366 /*
367 * We are supposed to handle UTF8, check it's valid
368 * From rfc2044: encoding of the Unicode values on UTF-8:
369 *
370 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
371 * 0000 0000-0000 007F 0xxxxxxx
372 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200373 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000374 *
375 * Check for the 0x110000 limit too
376 */
377 const unsigned char *cur = ctxt->input->cur;
378 unsigned char c;
379 unsigned int val;
380
381 c = *cur;
382 if (c & 0x80) {
383 if (cur[1] == 0)
384 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
385 if ((cur[1] & 0xc0) != 0x80)
386 goto encoding_error;
387 if ((c & 0xe0) == 0xe0) {
388
389 if (cur[2] == 0)
390 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
391 if ((cur[2] & 0xc0) != 0x80)
392 goto encoding_error;
393 if ((c & 0xf0) == 0xf0) {
394 if (cur[3] == 0)
395 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
396 if (((c & 0xf8) != 0xf0) ||
397 ((cur[3] & 0xc0) != 0x80))
398 goto encoding_error;
399 /* 4-byte code */
400 *len = 4;
401 val = (cur[0] & 0x7) << 18;
402 val |= (cur[1] & 0x3f) << 12;
403 val |= (cur[2] & 0x3f) << 6;
404 val |= cur[3] & 0x3f;
405 } else {
406 /* 3-byte code */
407 *len = 3;
408 val = (cur[0] & 0xf) << 12;
409 val |= (cur[1] & 0x3f) << 6;
410 val |= cur[2] & 0x3f;
411 }
412 } else {
413 /* 2-byte code */
414 *len = 2;
415 val = (cur[0] & 0x1f) << 6;
416 val |= cur[1] & 0x3f;
417 }
418 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000419 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
420 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200421 }
Owen Taylor3473f882001-02-23 17:55:21 +0000422 return(val);
423 } else {
424 /* 1-byte code */
425 *len = 1;
426 return((int) *ctxt->input->cur);
427 }
428 }
429 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000430 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000431 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000432 * XML constructs only use < 128 chars
433 */
434 *len = 1;
435 if ((int) *ctxt->input->cur < 0x80)
436 return((int) *ctxt->input->cur);
437
438 /*
439 * Humm this is bad, do an automatic flow conversion
440 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200441 {
442 xmlChar * guess;
443 xmlCharEncodingHandlerPtr handler;
444
445 guess = htmlFindEncoding(ctxt);
446 if (guess == NULL) {
447 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
448 } else {
449 if (ctxt->input->encoding != NULL)
450 xmlFree((xmlChar *) ctxt->input->encoding);
451 ctxt->input->encoding = guess;
452 handler = xmlFindCharEncodingHandler((const char *) guess);
453 if (handler != NULL) {
454 xmlSwitchToEncoding(ctxt, handler);
455 } else {
456 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
457 "Unsupported encoding %s", guess, NULL);
458 }
459 }
460 ctxt->charset = XML_CHAR_ENCODING_UTF8;
461 }
462
Owen Taylor3473f882001-02-23 17:55:21 +0000463 return(xmlCurrentChar(ctxt, len));
464
465encoding_error:
466 /*
467 * If we detect an UTF8 error that probably mean that the
468 * input encoding didn't get properly advertized in the
469 * declaration header. Report the error and switch the encoding
470 * to ISO-Latin-1 (if you don't like this policy, just declare the
471 * encoding !)
472 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000473 {
474 char buffer[150];
475
Daniel Veillard861101d2007-06-12 08:38:57 +0000476 if (ctxt->input->end - ctxt->input->cur >= 4) {
477 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
478 ctxt->input->cur[0], ctxt->input->cur[1],
479 ctxt->input->cur[2], ctxt->input->cur[3]);
480 } else {
481 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
482 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000483 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
484 "Input is not proper UTF-8, indicate encoding !\n",
485 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000486 }
487
Daniel Veillarde77db162009-08-22 11:32:38 +0200488 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000489 *len = 1;
490 return((int) *ctxt->input->cur);
491}
492
493/**
Owen Taylor3473f882001-02-23 17:55:21 +0000494 * htmlSkipBlankChars:
495 * @ctxt: the HTML parser context
496 *
497 * skip all blanks character found at that point in the input streams.
498 *
499 * Returns the number of space chars skipped
500 */
501
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000502static int
Owen Taylor3473f882001-02-23 17:55:21 +0000503htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
504 int res = 0;
505
William M. Brack76e95df2003-10-18 16:20:14 +0000506 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000507 if ((*ctxt->input->cur == 0) &&
508 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
509 xmlPopInput(ctxt);
510 } else {
511 if (*(ctxt->input->cur) == '\n') {
512 ctxt->input->line++; ctxt->input->col = 1;
513 } else ctxt->input->col++;
514 ctxt->input->cur++;
515 ctxt->nbChars++;
516 if (*ctxt->input->cur == 0)
517 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
518 }
519 res++;
520 }
521 return(res);
522}
523
524
525
526/************************************************************************
527 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200528 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000529 * *
530 ************************************************************************/
531
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000544
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each group macro expands to a comma separated list of element names
 * WITHOUT a trailing comma, and NB_<group> is the number of names in it.
 * Groups must therefore be joined with explicit commas: two adjacent
 * string literals would silently be concatenated into one bogus name.
 * PCDATA and MODIFIER are empty groups and may only be placed directly
 * in front of another group or item.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow and inline element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
578
579
/*
 * ... and for HTML Attributes.
 *
 * Each group macro expands to a comma separated list of attribute names
 * without a trailing comma, and NB_<group> is the number of names in it.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated attribute name lists shared by several elements */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000599
600
601/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000602static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000603 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
604 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000605static const char* const target_attr[] = { "target", NULL } ;
606static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
607static const char* const alt_attr[] = { "alt", NULL } ;
608static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
609static const char* const href_attrs[] = { "href", NULL } ;
610static const char* const clear_attrs[] = { "clear", NULL } ;
611static const char* const inline_p[] = { INLINE, "p", NULL } ;
612
613static const char* const flow_param[] = { FLOW, "param", NULL } ;
614static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000615 "archive", "alt", "name", "height", "width", "align",
616 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000617static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000618 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000619static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000620 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000621static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
622static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
623static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
624static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000625 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000626static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000627 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
628
629
Daniel Veillard065abe82006-07-03 08:55:04 +0000630static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
631static const char* const col_elt[] = { "col", NULL } ;
632static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
633static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
634static const char* const dl_contents[] = { "dt", "dd", NULL } ;
635static const char* const compact_attr[] = { "compact", NULL } ;
636static const char* const label_attr[] = { "label", NULL } ;
637static const char* const fieldset_contents[] = { FLOW, "legend" } ;
638static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
639static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
640static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
641static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
642static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
643static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
644static const char* const head_attrs[] = { I18N, "profile", NULL } ;
645static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
646static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
647static const char* const version_attr[] = { "version", NULL } ;
648static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
649static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
650static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000651static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000652static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
653static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
654static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
655static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
656static const char* const align_attr[] = { "align", NULL } ;
657static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
658static const char* const map_contents[] = { BLOCK, "area", NULL } ;
659static const char* const name_attr[] = { "name", NULL } ;
660static const char* const action_attr[] = { "action", NULL } ;
661static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
662static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
663static const char* const content_attr[] = { "content", NULL } ;
664static const char* const type_attr[] = { "type", NULL } ;
665static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
666static const char* const object_contents[] = { FLOW, "param", NULL } ;
667static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
668static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
669static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
670static const char* const option_elt[] = { "option", NULL } ;
671static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
672static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
673static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
674static const char* const width_attr[] = { "width", NULL } ;
675static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
676static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
677static const char* const language_attr[] = { "language", NULL } ;
678static const char* const select_content[] = { "optgroup", "option", NULL } ;
679static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
680static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200681static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000682static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
683static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
684static const char* const tr_elt[] = { "tr", NULL } ;
685static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
686static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
687static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
688static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
689static const char* const tr_contents[] = { "th", "td", NULL } ;
690static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
691static const char* const li_elt[] = { "li", NULL } ;
692static const char* const ul_depr[] = { "type", "compact", NULL} ;
693static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000694
695#define DECL (const char**)
696
Daniel Veillard22090732001-07-16 00:06:07 +0000697static const htmlElemDesc
698html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000699{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
700 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
701},
702{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
703 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
704},
705{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
706 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
707},
708{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
709 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
710},
711{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
712 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
713},
714{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
715 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
716},
717{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
718 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
719},
720{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
721 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
722},
723{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
724 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
725},
726{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
727 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
728},
729{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
730 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
731},
732{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
733 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
734},
735{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
736 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
737},
738{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
739 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
740},
741{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
742 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
743},
744{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
745 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
746},
747{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
748 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
749},
750{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
751 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
752},
753{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
754 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
755},
756{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
757 EMPTY , NULL , DECL col_attrs , NULL, NULL
758},
759{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
760 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
761},
762{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
763 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
764},
765{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
766 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
767},
768{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
769 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
770},
771{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
772 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
773},
774{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
775 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
776},
777{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000778 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000779},
780{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
781 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
782},
783{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
784 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
785},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000786{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000787 EMPTY, NULL, DECL embed_attrs, NULL, NULL
788},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000789{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
790 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
791},
792{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
793 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
794},
795{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
796 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
797},
798{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
799 EMPTY, NULL, NULL, DECL frame_attrs, NULL
800},
801{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
802 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
803},
804{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
805 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
806},
807{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
808 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
809},
810{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
811 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
812},
813{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
814 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
815},
816{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
817 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
818},
819{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
820 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
821},
822{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
823 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
824},
825{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
826 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
827},
828{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
829 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
830},
831{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
832 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
833},
834{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
835 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
836},
837{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000838 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000839},
840{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
841 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
842},
843{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
844 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
845},
846{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
847 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
848},
849{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
850 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
851},
852{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
853 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
854},
855{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
856 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
857},
858{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
859 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
860},
861{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
862 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
863},
864{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000865 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000866},
867{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
868 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
869},
870{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
871 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
872},
873{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
874 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
875},
876{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
877 DECL html_flow, "div", DECL html_attrs, NULL, NULL
878},
879{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
880 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
881},
882{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
883 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
884},
885{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000886 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000887},
888{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
889 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
890},
891{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
892 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
893},
894{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000895 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000896},
897{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
898 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
899},
900{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
901 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
902},
903{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
904 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
905},
906{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
907 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
908},
909{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
910 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
911},
912{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
913 DECL select_content, NULL, DECL select_attrs, NULL, NULL
914},
915{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
916 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
917},
918{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
919 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
920},
921{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
922 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
923},
924{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
925 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
926},
927{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
928 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
929},
930{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
931 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
932},
933{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
934 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
935},
936{ "table", 0, 0, 0, 0, 0, 0, 0, "",
937 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
938},
939{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
940 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
941},
942{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
943 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
944},
945{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
946 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
947},
948{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
949 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
950},
951{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
952 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
953},
954{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
955 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
956},
957{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
958 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
959},
960{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
961 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
962},
963{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
964 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
965},
966{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
967 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
968},
969{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
970 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
971},
972{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
973 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
974}
Owen Taylor3473f882001-02-23 17:55:21 +0000975};
976
977/*
Owen Taylor3473f882001-02-23 17:55:21 +0000978 * start tags that imply the end of current element
979 */
Daniel Veillard065abe82006-07-03 08:55:04 +0000980static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +0000981"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
982 "dl", "ul", "ol", "menu", "dir", "address", "pre",
983 "listing", "xmp", "head", NULL,
984"head", "p", NULL,
985"title", "p", NULL,
986"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +0000987"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +0000988"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
989 "pre", "listing", "xmp", "head", "li", NULL,
990"hr", "p", "head", NULL,
991"h1", "p", "head", NULL,
992"h2", "p", "head", NULL,
993"h3", "p", "head", NULL,
994"h4", "p", "head", NULL,
995"h5", "p", "head", NULL,
996"h6", "p", "head", NULL,
997"dir", "p", "head", NULL,
998"address", "p", "head", "ul", NULL,
999"pre", "p", "head", "ul", NULL,
1000"listing", "p", "head", NULL,
1001"xmp", "p", "head", NULL,
1002"blockquote", "p", "head", NULL,
1003"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1004 "xmp", "head", NULL,
1005"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1006 "head", "dd", NULL,
1007"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1008 "head", "dt", NULL,
1009"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1010 "listing", "xmp", NULL,
1011"ol", "p", "head", "ul", NULL,
1012"menu", "p", "head", "ul", NULL,
Daniel Veillard6339c1a2009-08-24 11:59:51 +02001013"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001014"div", "p", "head", NULL,
1015"noscript", "p", "head", NULL,
1016"center", "font", "b", "i", "p", "head", NULL,
1017"a", "a", NULL,
1018"caption", "p", NULL,
1019"colgroup", "caption", "colgroup", "col", "p", NULL,
1020"col", "caption", "col", "p", NULL,
1021"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1022 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001023"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Daniel Veillarde77db162009-08-22 11:32:38 +02001024"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001025"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1026"thead", "caption", "col", "colgroup", NULL,
1027"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1028 "tbody", "p", NULL,
1029"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1030 "tfoot", "tbody", "p", NULL,
1031"optgroup", "option", NULL,
1032"option", "option", NULL,
1033"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1034 "pre", "listing", "xmp", "a", NULL,
1035NULL
1036};
1037
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * have CDATA content and for which a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1050
/*
 * The list of HTML attributes which are of content %Script;
 * (the intrinsic event handlers). The array is NOT NULL-terminated:
 * users must bound their scan with sizeof.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",  /* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
1076
/*
 * Priorities used by the HTML parser when it meets broken pages.
 * Giving elements different priorities lets the parser decide what to
 * do with an extra end tag: an end tag may only close elements whose
 * priority is lower than or equal to its own.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

/*
 * The last entry (NULL name) terminates the table and carries the
 * priority returned for every element not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001104
/* One pointer per group of htmlStartClose, filled by htmlInitAutoClose() */
static const char **htmlStartCloseIndex[100];
/* Set to 1 once htmlInitAutoClose() has built the index */
static int htmlStartCloseIndexinitialized = 0;
1107
1108/************************************************************************
1109 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001110 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001111 * *
1112 ************************************************************************/
1113
1114/**
1115 * htmlInitAutoClose:
1116 *
1117 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1118 * This is not reentrant. Call xmlInitParser() once before processing in
1119 * case of use in multithreaded programs.
1120 */
1121void
1122htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001123 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001124
1125 if (htmlStartCloseIndexinitialized) return;
1126
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001127 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1128 indx = 0;
1129 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001130 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001131 while (htmlStartClose[i] != NULL) i++;
1132 i++;
1133 }
1134 htmlStartCloseIndexinitialized = 1;
1135}
1136
1137/**
1138 * htmlTagLookup:
1139 * @tag: The tag name in lowercase
1140 *
1141 * Lookup the HTML tag in the ElementTable
1142 *
1143 * Returns the related htmlElemDescPtr or NULL if not found.
1144 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001145const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001146htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001147 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001148
1149 for (i = 0; i < (sizeof(html40ElementTable) /
1150 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001151 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001152 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001153 }
1154 return(NULL);
1155}
1156
1157/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001158 * htmlGetEndPriority:
1159 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001160 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001161 * Return value: The "endtag" priority.
1162 **/
1163static int
1164htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001165 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001166
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001167 while ((htmlEndPriority[i].name != NULL) &&
1168 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1169 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001170
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001171 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001172}
1173
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001174
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001175/**
Owen Taylor3473f882001-02-23 17:55:21 +00001176 * htmlCheckAutoClose:
1177 * @newtag: The new tag name
1178 * @oldtag: The old tag name
1179 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001180 * Checks whether the new tag is one of the registered valid tags for
1181 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001182 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1183 *
1184 * Returns 0 if no, 1 if yes.
1185 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001186static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001187htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1188{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001189 int i, indx;
1190 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001191
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001192 if (htmlStartCloseIndexinitialized == 0)
1193 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001194
1195 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001196 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001197 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001198 if (closed == NULL)
1199 return (0);
1200 if (xmlStrEqual(BAD_CAST * closed, newtag))
1201 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001202 }
1203
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001204 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001205 i++;
1206 while (htmlStartClose[i] != NULL) {
1207 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001208 return (1);
1209 }
1210 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001211 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001212 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001213}
1214
1215/**
1216 * htmlAutoCloseOnClose:
1217 * @ctxt: an HTML parser context
1218 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001219 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001220 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001221 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001222 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001223static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001224htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1225{
1226 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001227 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001228
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001229 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001230
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001231 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001232
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001233 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1234 break;
1235 /*
1236 * A missplaced endtag can only close elements with lower
1237 * or equal priority, so if we find an element with higher
1238 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001239 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001240 */
1241 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1242 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001243 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001244 if (i < 0)
1245 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001246
1247 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001248 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001249 if ((info != NULL) && (info->endTag == 3)) {
1250 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1251 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001252 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001253 }
1254 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1255 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001256 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001257 }
1258}
1259
1260/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001261 * htmlAutoCloseOnEnd:
1262 * @ctxt: an HTML parser context
1263 *
1264 * Close all remaining tags at the end of the stream
1265 */
1266static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001267htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1268{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001269 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001270
William M. Brack899e64a2003-09-26 18:03:42 +00001271 if (ctxt->nameNr == 0)
1272 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001273 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001274 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1275 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001276 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001277 }
1278}
1279
1280/**
Owen Taylor3473f882001-02-23 17:55:21 +00001281 * htmlAutoClose:
1282 * @ctxt: an HTML parser context
1283 * @newtag: The new tag name or NULL
1284 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001285 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001286 * The list is kept in htmlStartClose array. This function is
1287 * called when a new tag has been detected and generates the
1288 * appropriates closes if possible/needed.
1289 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001290 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001291 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001292static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001293htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1294{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001295 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001296 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001297 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1298 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001299 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001300 }
1301 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001302 htmlAutoCloseOnEnd(ctxt);
1303 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001304 }
1305 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001306 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1307 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1308 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001309 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1310 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001311 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001312 }
Owen Taylor3473f882001-02-23 17:55:21 +00001313}
1314
1315/**
1316 * htmlAutoCloseTag:
1317 * @doc: the HTML document
1318 * @name: The tag name
1319 * @elem: the HTML element
1320 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001321 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001322 * The list is kept in htmlStartClose array. This function checks
1323 * if the element or one of it's children would autoclose the
1324 * given tag.
1325 *
1326 * Returns 1 if autoclose, 0 otherwise
1327 */
1328int
1329htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1330 htmlNodePtr child;
1331
1332 if (elem == NULL) return(1);
1333 if (xmlStrEqual(name, elem->name)) return(0);
1334 if (htmlCheckAutoClose(elem->name, name)) return(1);
1335 child = elem->children;
1336 while (child != NULL) {
1337 if (htmlAutoCloseTag(doc, name, child)) return(1);
1338 child = child->next;
1339 }
1340 return(0);
1341}
1342
1343/**
1344 * htmlIsAutoClosed:
1345 * @doc: the HTML document
1346 * @elem: the HTML element
1347 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001348 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001349 * The list is kept in htmlStartClose array. This function checks
1350 * if a tag is autoclosed by one of it's child
1351 *
1352 * Returns 1 if autoclosed, 0 otherwise
1353 */
1354int
1355htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1356 htmlNodePtr child;
1357
1358 if (elem == NULL) return(1);
1359 child = elem->children;
1360 while (child != NULL) {
1361 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1362 child = child->next;
1363 }
1364 return(0);
1365}
1366
1367/**
1368 * htmlCheckImplied:
1369 * @ctxt: an HTML parser context
1370 * @newtag: The new tag name
1371 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001372 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001373 * called when a new tag has been detected and generates the
1374 * appropriates implicit tags if missing
1375 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001376static void
Owen Taylor3473f882001-02-23 17:55:21 +00001377htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1378 if (!htmlOmittedDefaultValue)
1379 return;
1380 if (xmlStrEqual(newtag, BAD_CAST"html"))
1381 return;
1382 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001383 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001384 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1385 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1386 }
1387 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1388 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001389 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001390 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1391 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1392 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1393 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1394 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1395 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02001396 /*
Owen Taylor3473f882001-02-23 17:55:21 +00001397 * dropped OBJECT ... i you put it first BODY will be
1398 * assumed !
1399 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001400 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001401 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1402 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1403 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1404 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1405 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1406 int i;
1407 for (i = 0;i < ctxt->nameNr;i++) {
1408 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1409 return;
1410 }
1411 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1412 return;
1413 }
1414 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001415
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001416 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001417 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1418 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1419 }
1420}
1421
1422/**
1423 * htmlCheckParagraph
1424 * @ctxt: an HTML parser context
1425 *
1426 * Check whether a p element need to be implied before inserting
1427 * characters in the current element.
1428 *
1429 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1430 * in case of error.
1431 */
1432
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001433static int
Owen Taylor3473f882001-02-23 17:55:21 +00001434htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1435 const xmlChar *tag;
1436 int i;
1437
1438 if (ctxt == NULL)
1439 return(-1);
1440 tag = ctxt->name;
1441 if (tag == NULL) {
1442 htmlAutoClose(ctxt, BAD_CAST"p");
1443 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001444 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001445 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1446 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1447 return(1);
1448 }
1449 if (!htmlOmittedDefaultValue)
1450 return(0);
1451 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1452 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001453 htmlAutoClose(ctxt, BAD_CAST"p");
1454 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001455 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001456 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1458 return(1);
1459 }
1460 }
1461 return(0);
1462}
1463
1464/**
1465 * htmlIsScriptAttribute:
1466 * @name: an attribute name
1467 *
1468 * Check if an attribute is of content type Script
1469 *
1470 * Returns 1 is the attribute is a script 0 otherwise
1471 */
1472int
1473htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001474 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001475
1476 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001477 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001478 /*
1479 * all script attributes start with 'on'
1480 */
1481 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001482 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001483 for (i = 0;
1484 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1485 i++) {
1486 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1487 return(1);
1488 }
1489 return(0);
1490}
1491
1492/************************************************************************
1493 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001494 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001495 * *
1496 ************************************************************************/
1497
1498
Daniel Veillard22090732001-07-16 00:06:07 +00001499static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001500/*
1501 * the 4 absolute ones, plus apostrophe.
1502 */
1503{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1504{ 38, "amp", "ampersand, U+0026 ISOnum" },
1505{ 39, "apos", "single quote" },
1506{ 60, "lt", "less-than sign, U+003C ISOnum" },
1507{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1508
1509/*
1510 * A bunch still in the 128-255 range
1511 * Replacing them depend really on the charset used.
1512 */
1513{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1514{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1515{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1516{ 163, "pound","pound sign, U+00A3 ISOnum" },
1517{ 164, "curren","currency sign, U+00A4 ISOnum" },
1518{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1519{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1520{ 167, "sect", "section sign, U+00A7 ISOnum" },
1521{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1522{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1523{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1524{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1525{ 172, "not", "not sign, U+00AC ISOnum" },
1526{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1527{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1528{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1529{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1530{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1531{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1532{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1533{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1534{ 181, "micro","micro sign, U+00B5 ISOnum" },
1535{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1536{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1537{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1538{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1539{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1540{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1541{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1542{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1543{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1544{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1545{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1546{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1547{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1548{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1549{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1550{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1551{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1552{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1553{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1554{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1555{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1556{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1557{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1558{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1559{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1560{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1561{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1562{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1563{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1564{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1565{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1566{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1567{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1568{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1569{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1570{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1571{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1572{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1573{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1574{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1575{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1576{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1577{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1578{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1579{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1580{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1581{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1582{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1583{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1584{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1585{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1586{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1587{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1588{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1589{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1590{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1591{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1592{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1593{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1594{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1595{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1596{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1597{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1598{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1599{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1600{ 247, "divide","division sign, U+00F7 ISOnum" },
1601{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1602{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1603{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1604{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1605{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1606{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1607{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1608{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1609
1610{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1611{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1612{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1613{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1614{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1615
1616/*
1617 * Anything below should really be kept as entities references
1618 */
1619{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1620
1621{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1622{ 732, "tilde","small tilde, U+02DC ISOdia" },
1623
1624{ 913, "Alpha","greek capital letter alpha, U+0391" },
1625{ 914, "Beta", "greek capital letter beta, U+0392" },
1626{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1627{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1628{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1629{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1630{ 919, "Eta", "greek capital letter eta, U+0397" },
1631{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1632{ 921, "Iota", "greek capital letter iota, U+0399" },
1633{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001634{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001635{ 924, "Mu", "greek capital letter mu, U+039C" },
1636{ 925, "Nu", "greek capital letter nu, U+039D" },
1637{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1638{ 927, "Omicron","greek capital letter omicron, U+039F" },
1639{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1640{ 929, "Rho", "greek capital letter rho, U+03A1" },
1641{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1642{ 932, "Tau", "greek capital letter tau, U+03A4" },
1643{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1644{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1645{ 935, "Chi", "greek capital letter chi, U+03A7" },
1646{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1647{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1648
1649{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1650{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1651{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1652{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1653{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1654{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1655{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1656{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1657{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1658{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1659{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1660{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1661{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1662{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1663{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1664{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1665{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1666{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1667{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1668{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1669{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1670{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1671{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1672{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1673{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1674{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1675{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1676{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1677
1678{ 8194, "ensp", "en space, U+2002 ISOpub" },
1679{ 8195, "emsp", "em space, U+2003 ISOpub" },
1680{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1681{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1682{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1683{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1684{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1685{ 8211, "ndash","en dash, U+2013 ISOpub" },
1686{ 8212, "mdash","em dash, U+2014 ISOpub" },
1687{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1688{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1689{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1690{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1691{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1692{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1693{ 8224, "dagger","dagger, U+2020 ISOpub" },
1694{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1695
1696{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1697{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1698
1699{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1700
1701{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1702{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1703
1704{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1705{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1706
1707{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1708{ 8260, "frasl","fraction slash, U+2044 NEW" },
1709
1710{ 8364, "euro", "euro sign, U+20AC NEW" },
1711
1712{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1713{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1714{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1715{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1716{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1717{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1718{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1719{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1720{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1721{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1722{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1723{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1724{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1725{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1726{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1727{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1728
1729{ 8704, "forall","for all, U+2200 ISOtech" },
1730{ 8706, "part", "partial differential, U+2202 ISOtech" },
1731{ 8707, "exist","there exists, U+2203 ISOtech" },
1732{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1733{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1734{ 8712, "isin", "element of, U+2208 ISOtech" },
1735{ 8713, "notin","not an element of, U+2209 ISOtech" },
1736{ 8715, "ni", "contains as member, U+220B ISOtech" },
1737{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001738{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001739{ 8722, "minus","minus sign, U+2212 ISOtech" },
1740{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1741{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1742{ 8733, "prop", "proportional to, U+221D ISOtech" },
1743{ 8734, "infin","infinity, U+221E ISOtech" },
1744{ 8736, "ang", "angle, U+2220 ISOamso" },
1745{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1746{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1747{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1748{ 8746, "cup", "union = cup, U+222A ISOtech" },
1749{ 8747, "int", "integral, U+222B ISOtech" },
1750{ 8756, "there4","therefore, U+2234 ISOtech" },
1751{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1752{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1753{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1754{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1755{ 8801, "equiv","identical to, U+2261 ISOtech" },
1756{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1757{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1758{ 8834, "sub", "subset of, U+2282 ISOtech" },
1759{ 8835, "sup", "superset of, U+2283 ISOtech" },
1760{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1761{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1762{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1763{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1764{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1765{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1766{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1767{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1768{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1769{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1770{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1771{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1772{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1773{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1774
1775{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1776{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1777{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1778{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1779
1780};
1781
/************************************************************************
 *									*
 *		Convenience functions to handle entities		*
 *									*
 ************************************************************************/
1787
/*
 * growBuffer:
 * @buffer: name of an xmlChar * variable; a companion variable called
 *          <buffer>_size must be in scope and holds its allocated size
 *
 * Macro used to grow the current buffer: doubles <buffer>_size and
 * reallocates @buffer to that size with xmlRealloc().
 *
 * On allocation failure the error is reported with htmlErrMemory() on
 * the `ctxt` variable of the enclosing function, @buffer is freed and
 * the enclosing function returns NULL. The macro can therefore only be
 * used inside functions that have a `ctxt` in scope and return a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {						\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1802
1803/**
1804 * htmlEntityLookup:
1805 * @name: the entity name
1806 *
1807 * Lookup the given entity in EntitiesTable
1808 *
1809 * TODO: the linear scan is really ugly, an hash table is really needed.
1810 *
1811 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1812 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001813const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001814htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001815 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001816
1817 for (i = 0;i < (sizeof(html40EntitiesTable)/
1818 sizeof(html40EntitiesTable[0]));i++) {
1819 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001820 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001821 }
1822 }
1823 return(NULL);
1824}
1825
1826/**
1827 * htmlEntityValueLookup:
1828 * @value: the entity's unicode value
1829 *
1830 * Lookup the given entity in EntitiesTable
1831 *
1832 * TODO: the linear scan is really ugly, an hash table is really needed.
1833 *
1834 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1835 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001836const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001837htmlEntityValueLookup(unsigned int value) {
1838 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001839
1840 for (i = 0;i < (sizeof(html40EntitiesTable)/
1841 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001842 if (html40EntitiesTable[i].value >= value) {
1843 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001844 break;
William M. Brack78637da2003-07-31 14:47:38 +00001845 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001846 }
Owen Taylor3473f882001-02-23 17:55:21 +00001847 }
1848 return(NULL);
1849}
1850
1851/**
1852 * UTF8ToHtml:
1853 * @out: a pointer to an array of bytes to store the result
1854 * @outlen: the length of @out
1855 * @in: a pointer to an array of UTF-8 chars
1856 * @inlen: the length of @in
1857 *
1858 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1859 * plus HTML entities block of chars out.
1860 *
1861 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1862 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001863 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001864 * The value of @outlen after return is the number of octets consumed.
1865 */
1866int
1867UTF8ToHtml(unsigned char* out, int *outlen,
1868 const unsigned char* in, int *inlen) {
1869 const unsigned char* processed = in;
1870 const unsigned char* outend;
1871 const unsigned char* outstart = out;
1872 const unsigned char* instart = in;
1873 const unsigned char* inend;
1874 unsigned int c, d;
1875 int trailing;
1876
Daniel Veillardce682bc2004-11-05 17:22:25 +00001877 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001878 if (in == NULL) {
1879 /*
1880 * initialization nothing to do
1881 */
1882 *outlen = 0;
1883 *inlen = 0;
1884 return(0);
1885 }
1886 inend = in + (*inlen);
1887 outend = out + (*outlen);
1888 while (in < inend) {
1889 d = *in++;
1890 if (d < 0x80) { c= d; trailing= 0; }
1891 else if (d < 0xC0) {
1892 /* trailing byte in leading position */
1893 *outlen = out - outstart;
1894 *inlen = processed - instart;
1895 return(-2);
1896 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1897 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1898 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1899 else {
1900 /* no chance for this in Ascii */
1901 *outlen = out - outstart;
1902 *inlen = processed - instart;
1903 return(-2);
1904 }
1905
1906 if (inend - in < trailing) {
1907 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02001908 }
Owen Taylor3473f882001-02-23 17:55:21 +00001909
1910 for ( ; trailing; trailing--) {
1911 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1912 break;
1913 c <<= 6;
1914 c |= d & 0x3F;
1915 }
1916
1917 /* assertion: c is a single UTF-4 value */
1918 if (c < 0x80) {
1919 if (out + 1 >= outend)
1920 break;
1921 *out++ = c;
1922 } else {
1923 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001924 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00001925 const char *cp;
1926 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00001927
1928 /*
1929 * Try to lookup a predefined HTML entity for it
1930 */
1931
1932 ent = htmlEntityValueLookup(c);
1933 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00001934 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1935 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00001936 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00001937 else
1938 cp = ent->name;
1939 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00001940 if (out + 2 + len >= outend)
1941 break;
1942 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00001943 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00001944 out += len;
1945 *out++ = ';';
1946 }
1947 processed = in;
1948 }
1949 *outlen = out - outstart;
1950 *inlen = processed - instart;
1951 return(0);
1952}
1953
1954/**
1955 * htmlEncodeEntities:
1956 * @out: a pointer to an array of bytes to store the result
1957 * @outlen: the length of @out
1958 * @in: a pointer to an array of UTF-8 chars
1959 * @inlen: the length of @in
1960 * @quoteChar: the quote character to escape (' or ") or zero.
1961 *
1962 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1963 * plus HTML entities block of chars out.
1964 *
1965 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1966 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001967 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001968 * The value of @outlen after return is the number of octets consumed.
1969 */
1970int
1971htmlEncodeEntities(unsigned char* out, int *outlen,
1972 const unsigned char* in, int *inlen, int quoteChar) {
1973 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001974 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00001975 const unsigned char* outstart = out;
1976 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00001977 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00001978 unsigned int c, d;
1979 int trailing;
1980
Daniel Veillardce682bc2004-11-05 17:22:25 +00001981 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1982 return(-1);
1983 outend = out + (*outlen);
1984 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00001985 while (in < inend) {
1986 d = *in++;
1987 if (d < 0x80) { c= d; trailing= 0; }
1988 else if (d < 0xC0) {
1989 /* trailing byte in leading position */
1990 *outlen = out - outstart;
1991 *inlen = processed - instart;
1992 return(-2);
1993 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1994 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1995 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1996 else {
1997 /* no chance for this in Ascii */
1998 *outlen = out - outstart;
1999 *inlen = processed - instart;
2000 return(-2);
2001 }
2002
2003 if (inend - in < trailing)
2004 break;
2005
2006 while (trailing--) {
2007 if (((d= *in++) & 0xC0) != 0x80) {
2008 *outlen = out - outstart;
2009 *inlen = processed - instart;
2010 return(-2);
2011 }
2012 c <<= 6;
2013 c |= d & 0x3F;
2014 }
2015
2016 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002017 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2018 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002019 if (out >= outend)
2020 break;
2021 *out++ = c;
2022 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002023 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002024 const char *cp;
2025 char nbuf[16];
2026 int len;
2027
2028 /*
2029 * Try to lookup a predefined HTML entity for it
2030 */
2031 ent = htmlEntityValueLookup(c);
2032 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002033 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002034 cp = nbuf;
2035 }
2036 else
2037 cp = ent->name;
2038 len = strlen(cp);
2039 if (out + 2 + len > outend)
2040 break;
2041 *out++ = '&';
2042 memcpy(out, cp, len);
2043 out += len;
2044 *out++ = ';';
2045 }
2046 processed = in;
2047 }
2048 *outlen = out - outstart;
2049 *inlen = processed - instart;
2050 return(0);
2051}
2052
Owen Taylor3473f882001-02-23 17:55:21 +00002053/************************************************************************
2054 * *
2055 * Commodity functions to handle streams *
2056 * *
2057 ************************************************************************/
2058
2059/**
Owen Taylor3473f882001-02-23 17:55:21 +00002060 * htmlNewInputStream:
2061 * @ctxt: an HTML parser context
2062 *
2063 * Create a new input stream structure
2064 * Returns the new input stream or NULL
2065 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002066static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002067htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2068 htmlParserInputPtr input;
2069
2070 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2071 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002072 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002073 return(NULL);
2074 }
2075 memset(input, 0, sizeof(htmlParserInput));
2076 input->filename = NULL;
2077 input->directory = NULL;
2078 input->base = NULL;
2079 input->cur = NULL;
2080 input->buf = NULL;
2081 input->line = 1;
2082 input->col = 1;
2083 input->buf = NULL;
2084 input->free = NULL;
2085 input->version = NULL;
2086 input->consumed = 0;
2087 input->length = 0;
2088 return(input);
2089}
2090
2091
2092/************************************************************************
2093 * *
2094 * Commodity functions, cleanup needed ? *
2095 * *
2096 ************************************************************************/
/*
 * All tags allowing PCDATA from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 * The table is only read, so both the strings and the array are const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002111
2112/**
2113 * areBlanks:
2114 * @ctxt: an HTML parser context
2115 * @str: a xmlChar *
2116 * @len: the size of @str
2117 *
2118 * Is this a sequence of blank chars that one can ignore ?
2119 *
2120 * Returns 1 if ignorable 0 otherwise.
2121 */
2122
2123static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002124 unsigned int i;
2125 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002126 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002127 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002128
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002129 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002130 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002131
2132 if (CUR == 0) return(1);
2133 if (CUR != '<') return(0);
2134 if (ctxt->name == NULL)
2135 return(1);
2136 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2137 return(1);
2138 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2139 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002140
2141 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2142 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2143 dtd = xmlGetIntSubset(ctxt->myDoc);
2144 if (dtd != NULL && dtd->ExternalID != NULL) {
2145 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2146 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2147 return(1);
2148 }
2149 }
2150
Owen Taylor3473f882001-02-23 17:55:21 +00002151 if (ctxt->node == NULL) return(0);
2152 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002153 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2154 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002155 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002156 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2157 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002158 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002159 for all tags "b" allowing PCDATA */
2160 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2161 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2162 return(0);
2163 }
2164 }
Owen Taylor3473f882001-02-23 17:55:21 +00002165 } else if (xmlNodeIsText(lastChild)) {
2166 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002167 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002168 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002169 for all tags "p" allowing PCDATA */
2170 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2171 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2172 return(0);
2173 }
2174 }
Owen Taylor3473f882001-02-23 17:55:21 +00002175 }
2176 return(1);
2177}
2178
2179/**
Owen Taylor3473f882001-02-23 17:55:21 +00002180 * htmlNewDocNoDtD:
2181 * @URI: URI for the dtd, or NULL
2182 * @ExternalID: the external ID of the DTD, or NULL
2183 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002184 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2185 * are NULL
2186 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002187 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002188 */
2189htmlDocPtr
2190htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2191 xmlDocPtr cur;
2192
2193 /*
2194 * Allocate a new document and fill the fields.
2195 */
2196 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2197 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002198 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002199 return(NULL);
2200 }
2201 memset(cur, 0, sizeof(xmlDoc));
2202
2203 cur->type = XML_HTML_DOCUMENT_NODE;
2204 cur->version = NULL;
2205 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002206 cur->doc = cur;
2207 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002208 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002209 cur->extSubset = NULL;
2210 cur->oldNs = NULL;
2211 cur->encoding = NULL;
2212 cur->standalone = 1;
2213 cur->compression = 0;
2214 cur->ids = NULL;
2215 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002216 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002217 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002218 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002219 if ((ExternalID != NULL) ||
2220 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002221 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002222 return(cur);
2223}
2224
2225/**
2226 * htmlNewDoc:
2227 * @URI: URI for the dtd, or NULL
2228 * @ExternalID: the external ID of the DTD, or NULL
2229 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002230 * Creates a new HTML document
2231 *
Owen Taylor3473f882001-02-23 17:55:21 +00002232 * Returns a new document
2233 */
2234htmlDocPtr
2235htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2236 if ((URI == NULL) && (ExternalID == NULL))
2237 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002238 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2239 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002240
2241 return(htmlNewDocNoDtD(URI, ExternalID));
2242}
2243
2244
2245/************************************************************************
2246 * *
2247 * The parser itself *
2248 * Relates to http://www.w3.org/TR/html40 *
2249 * *
2250 ************************************************************************/
2251
2252/************************************************************************
2253 * *
2254 * The parser itself *
2255 * *
2256 ************************************************************************/
2257
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002258static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002259
Owen Taylor3473f882001-02-23 17:55:21 +00002260/**
2261 * htmlParseHTMLName:
2262 * @ctxt: an HTML parser context
2263 *
2264 * parse an HTML tag or attribute name, note that we convert it to lowercase
2265 * since HTML names are not case-sensitive.
2266 *
2267 * Returns the Tag Name parsed or NULL
2268 */
2269
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002270static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002271htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002272 int i = 0;
2273 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2274
William M. Brackd1757ab2004-10-02 22:07:48 +00002275 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002276 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002277
2278 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002279 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002280 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2281 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002282 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2283 else loc[i] = CUR;
2284 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002285
Owen Taylor3473f882001-02-23 17:55:21 +00002286 NEXT;
2287 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002288
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002289 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002290}
2291
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002292
2293/**
2294 * htmlParseHTMLName_nonInvasive:
2295 * @ctxt: an HTML parser context
2296 *
2297 * parse an HTML tag or attribute name, note that we convert it to lowercase
2298 * since HTML names are not case-sensitive, this doesn't consume the data
2299 * from the stream, it's a look-ahead
2300 *
2301 * Returns the Tag Name parsed or NULL
2302 */
2303
2304static const xmlChar *
2305htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2306 int i = 0;
2307 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2308
2309 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2310 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002311
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002312 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2313 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2314 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2315 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2316 else loc[i] = NXT(1+i);
2317 i++;
2318 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002319
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002320 return(xmlDictLookup(ctxt->dict, loc, i));
2321}
2322
2323
Owen Taylor3473f882001-02-23 17:55:21 +00002324/**
2325 * htmlParseName:
2326 * @ctxt: an HTML parser context
2327 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002328 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002329 *
2330 * Returns the Name parsed or NULL
2331 */
2332
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002333static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002334htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002335 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002336 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002337 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002338
2339 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002340
2341 /*
2342 * Accelerator for simple ASCII names
2343 */
2344 in = ctxt->input->cur;
2345 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2346 ((*in >= 0x41) && (*in <= 0x5A)) ||
2347 (*in == '_') || (*in == ':')) {
2348 in++;
2349 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2350 ((*in >= 0x41) && (*in <= 0x5A)) ||
2351 ((*in >= 0x30) && (*in <= 0x39)) ||
2352 (*in == '_') || (*in == '-') ||
2353 (*in == ':') || (*in == '.'))
2354 in++;
2355 if ((*in > 0) && (*in < 0x80)) {
2356 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002357 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002358 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002359 ctxt->nbChars += count;
2360 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002361 return(ret);
2362 }
2363 }
2364 return(htmlParseNameComplex(ctxt));
2365}
2366
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002367static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002368htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002369 int len = 0, l;
2370 int c;
2371 int count = 0;
2372
2373 /*
2374 * Handler for more complex cases
2375 */
2376 GROW;
2377 c = CUR_CHAR(l);
2378 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2379 (!IS_LETTER(c) && (c != '_') &&
2380 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002381 return(NULL);
2382 }
2383
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002384 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2385 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2386 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002387 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002388 (IS_COMBINING(c)) ||
2389 (IS_EXTENDER(c)))) {
2390 if (count++ > 100) {
2391 count = 0;
2392 GROW;
2393 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002394 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002395 NEXTL(l);
2396 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002397 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002398 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002399}
2400
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002401
Owen Taylor3473f882001-02-23 17:55:21 +00002402/**
2403 * htmlParseHTMLAttribute:
2404 * @ctxt: an HTML parser context
2405 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002406 *
Owen Taylor3473f882001-02-23 17:55:21 +00002407 * parse an HTML attribute value till the stop (quote), if
2408 * stop is 0 then it stops at the first space
2409 *
2410 * Returns the attribute parsed or NULL
2411 */
2412
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002413static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002414htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2415 xmlChar *buffer = NULL;
2416 int buffer_size = 0;
2417 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002418 const xmlChar *name = NULL;
2419 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002420 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002421
2422 /*
2423 * allocate a translation buffer.
2424 */
2425 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002426 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002427 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002428 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002429 return(NULL);
2430 }
2431 out = buffer;
2432
2433 /*
2434 * Ok loop until we reach one of the ending chars
2435 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002436 while ((CUR != 0) && (CUR != stop)) {
2437 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002438 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002439 if (CUR == '&') {
2440 if (NXT(1) == '#') {
2441 unsigned int c;
2442 int bits;
2443
2444 c = htmlParseCharRef(ctxt);
2445 if (c < 0x80)
2446 { *out++ = c; bits= -6; }
2447 else if (c < 0x800)
2448 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2449 else if (c < 0x10000)
2450 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002451 else
Owen Taylor3473f882001-02-23 17:55:21 +00002452 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002453
Owen Taylor3473f882001-02-23 17:55:21 +00002454 for ( ; bits >= 0; bits-= 6) {
2455 *out++ = ((c >> bits) & 0x3F) | 0x80;
2456 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002457
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002458 if (out - buffer > buffer_size - 100) {
2459 int indx = out - buffer;
2460
2461 growBuffer(buffer);
2462 out = &buffer[indx];
2463 }
Owen Taylor3473f882001-02-23 17:55:21 +00002464 } else {
2465 ent = htmlParseEntityRef(ctxt, &name);
2466 if (name == NULL) {
2467 *out++ = '&';
2468 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002469 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002470
2471 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002472 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002473 }
2474 } else if (ent == NULL) {
2475 *out++ = '&';
2476 cur = name;
2477 while (*cur != 0) {
2478 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002479 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002480
2481 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002482 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002483 }
2484 *out++ = *cur++;
2485 }
Owen Taylor3473f882001-02-23 17:55:21 +00002486 } else {
2487 unsigned int c;
2488 int bits;
2489
2490 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002491 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002492
2493 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002494 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002495 }
Daniel Veillard48519092006-10-17 15:56:35 +00002496 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002497 if (c < 0x80)
2498 { *out++ = c; bits= -6; }
2499 else if (c < 0x800)
2500 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2501 else if (c < 0x10000)
2502 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002503 else
Owen Taylor3473f882001-02-23 17:55:21 +00002504 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002505
Owen Taylor3473f882001-02-23 17:55:21 +00002506 for ( ; bits >= 0; bits-= 6) {
2507 *out++ = ((c >> bits) & 0x3F) | 0x80;
2508 }
Owen Taylor3473f882001-02-23 17:55:21 +00002509 }
2510 }
2511 } else {
2512 unsigned int c;
2513 int bits, l;
2514
2515 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002516 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002517
2518 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002519 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002520 }
2521 c = CUR_CHAR(l);
2522 if (c < 0x80)
2523 { *out++ = c; bits= -6; }
2524 else if (c < 0x800)
2525 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2526 else if (c < 0x10000)
2527 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002528 else
Owen Taylor3473f882001-02-23 17:55:21 +00002529 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002530
Owen Taylor3473f882001-02-23 17:55:21 +00002531 for ( ; bits >= 0; bits-= 6) {
2532 *out++ = ((c >> bits) & 0x3F) | 0x80;
2533 }
2534 NEXT;
2535 }
2536 }
2537 *out++ = 0;
2538 return(buffer);
2539}
2540
2541/**
Owen Taylor3473f882001-02-23 17:55:21 +00002542 * htmlParseEntityRef:
2543 * @ctxt: an HTML parser context
2544 * @str: location to store the entity name
2545 *
2546 * parse an HTML ENTITY references
2547 *
2548 * [68] EntityRef ::= '&' Name ';'
2549 *
2550 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2551 * if non-NULL *str will have to be freed by the caller.
2552 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002553const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002554htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2555 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002556 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002557
2558 if (str != NULL) *str = NULL;
2559 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002560
2561 if (CUR == '&') {
2562 NEXT;
2563 name = htmlParseName(ctxt);
2564 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002565 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2566 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002567 } else {
2568 GROW;
2569 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002570 if (str != NULL)
2571 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002572
2573 /*
2574 * Lookup the entity in the table.
2575 */
2576 ent = htmlEntityLookup(name);
2577 if (ent != NULL) /* OK that's ugly !!! */
2578 NEXT;
2579 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002580 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2581 "htmlParseEntityRef: expecting ';'\n",
2582 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002583 if (str != NULL)
2584 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002585 }
2586 }
2587 }
2588 return(ent);
2589}
2590
2591/**
2592 * htmlParseAttValue:
2593 * @ctxt: an HTML parser context
2594 *
2595 * parse a value for an attribute
2596 * Note: the parser won't do substitution of entities here, this
2597 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002598 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002599 *
2600 * Returns the AttValue parsed or NULL.
2601 */
2602
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002603static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002604htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2605 xmlChar *ret = NULL;
2606
2607 if (CUR == '"') {
2608 NEXT;
2609 ret = htmlParseHTMLAttribute(ctxt, '"');
2610 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002611 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2612 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002613 } else
2614 NEXT;
2615 } else if (CUR == '\'') {
2616 NEXT;
2617 ret = htmlParseHTMLAttribute(ctxt, '\'');
2618 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002619 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2620 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002621 } else
2622 NEXT;
2623 } else {
2624 /*
2625 * That's an HTMLism, the attribute value may not be quoted
2626 */
2627 ret = htmlParseHTMLAttribute(ctxt, 0);
2628 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002629 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2630 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002631 }
2632 }
2633 return(ret);
2634}
2635
2636/**
2637 * htmlParseSystemLiteral:
2638 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002639 *
Owen Taylor3473f882001-02-23 17:55:21 +00002640 * parse an HTML Literal
2641 *
2642 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2643 *
2644 * Returns the SystemLiteral parsed or NULL
2645 */
2646
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002647static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002648htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2649 const xmlChar *q;
2650 xmlChar *ret = NULL;
2651
2652 if (CUR == '"') {
2653 NEXT;
2654 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002655 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002656 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002657 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002658 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2659 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002660 } else {
2661 ret = xmlStrndup(q, CUR_PTR - q);
2662 NEXT;
2663 }
2664 } else if (CUR == '\'') {
2665 NEXT;
2666 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002667 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002668 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002669 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002670 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2671 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002672 } else {
2673 ret = xmlStrndup(q, CUR_PTR - q);
2674 NEXT;
2675 }
2676 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002677 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2678 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002679 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002680
Owen Taylor3473f882001-02-23 17:55:21 +00002681 return(ret);
2682}
2683
2684/**
2685 * htmlParsePubidLiteral:
2686 * @ctxt: an HTML parser context
2687 *
2688 * parse an HTML public literal
2689 *
2690 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2691 *
2692 * Returns the PubidLiteral parsed or NULL.
2693 */
2694
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002695static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002696htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2697 const xmlChar *q;
2698 xmlChar *ret = NULL;
2699 /*
2700 * Name ::= (Letter | '_') (NameChar)*
2701 */
2702 if (CUR == '"') {
2703 NEXT;
2704 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002705 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002706 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002707 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2708 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002709 } else {
2710 ret = xmlStrndup(q, CUR_PTR - q);
2711 NEXT;
2712 }
2713 } else if (CUR == '\'') {
2714 NEXT;
2715 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002716 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002717 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002718 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002719 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2720 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002721 } else {
2722 ret = xmlStrndup(q, CUR_PTR - q);
2723 NEXT;
2724 }
2725 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002726 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2727 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002728 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002729
Owen Taylor3473f882001-02-23 17:55:21 +00002730 return(ret);
2731}
2732
2733/**
2734 * htmlParseScript:
2735 * @ctxt: an HTML parser context
2736 *
2737 * parse the content of an HTML SCRIPT or STYLE element
2738 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2739 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2740 * http://www.w3.org/TR/html4/types.html#type-script
2741 * http://www.w3.org/TR/html4/types.html#h-6.15
2742 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2743 *
2744 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2745 * element and the value of intrinsic event attributes. User agents must
2746 * not evaluate script data as HTML markup but instead must pass it on as
2747 * data to a script engine.
2748 * NOTES:
2749 * - The content is passed like CDATA
2750 * - the attributes for style and scripting "onXXX" are also described
2751 * as CDATA but SGML allows entities references in attributes so their
2752 * processing is identical as other attributes
2753 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002754static void
Owen Taylor3473f882001-02-23 17:55:21 +00002755htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002756 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002757 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002758 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002759
2760 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002761 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002762 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002763 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002764 /*
2765 * One should break here, the specification is clear:
2766 * Authors should therefore escape "</" within the content.
2767 * Escape mechanisms are specific to each scripting or
2768 * style sheet language.
2769 *
2770 * In recovery mode, only break if end tag match the
2771 * current tag, effectively ignoring all tags inside the
2772 * script/style block and treating the entire block as
2773 * CDATA.
2774 */
2775 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002776 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2777 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002778 {
2779 break; /* while */
2780 } else {
2781 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002782 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002783 ctxt->name, NULL);
2784 }
2785 } else {
2786 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002787 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002788 {
2789 break; /* while */
2790 }
2791 }
Owen Taylor3473f882001-02-23 17:55:21 +00002792 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002793 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002794 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2795 if (ctxt->sax->cdataBlock!= NULL) {
2796 /*
2797 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2798 */
2799 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002800 } else if (ctxt->sax->characters != NULL) {
2801 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002802 }
2803 nbchar = 0;
2804 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002805 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002806 NEXTL(l);
2807 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002808 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002809
Daniel Veillard68716a72006-10-16 09:32:17 +00002810 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002811 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2812 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002813 NEXT;
2814 }
2815
2816 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2817 if (ctxt->sax->cdataBlock!= NULL) {
2818 /*
2819 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2820 */
2821 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002822 } else if (ctxt->sax->characters != NULL) {
2823 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002824 }
2825 }
2826}
2827
2828
2829/**
2830 * htmlParseCharData:
2831 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002832 *
2833 * parse a CharData section.
2834 * if we are within a CDATA section ']]>' marks an end of section.
2835 *
2836 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2837 */
2838
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002839static void
2840htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002841 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2842 int nbchar = 0;
2843 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002844 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002845
2846 SHRINK;
2847 cur = CUR_CHAR(l);
2848 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002849 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002850 (cur != 0)) {
2851 if (!(IS_CHAR(cur))) {
2852 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2853 "Invalid char in CDATA 0x%X\n", cur);
2854 } else {
2855 COPY_BUF(l,buf,nbchar,cur);
2856 }
Owen Taylor3473f882001-02-23 17:55:21 +00002857 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2858 /*
2859 * Ok the segment is to be consumed as chars.
2860 */
2861 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2862 if (areBlanks(ctxt, buf, nbchar)) {
2863 if (ctxt->sax->ignorableWhitespace != NULL)
2864 ctxt->sax->ignorableWhitespace(ctxt->userData,
2865 buf, nbchar);
2866 } else {
2867 htmlCheckParagraph(ctxt);
2868 if (ctxt->sax->characters != NULL)
2869 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2870 }
2871 }
2872 nbchar = 0;
2873 }
2874 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002875 chunk++;
2876 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2877 chunk = 0;
2878 SHRINK;
2879 GROW;
2880 }
Owen Taylor3473f882001-02-23 17:55:21 +00002881 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002882 if (cur == 0) {
2883 SHRINK;
2884 GROW;
2885 cur = CUR_CHAR(l);
2886 }
Owen Taylor3473f882001-02-23 17:55:21 +00002887 }
2888 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002889 buf[nbchar] = 0;
2890
Owen Taylor3473f882001-02-23 17:55:21 +00002891 /*
2892 * Ok the segment is to be consumed as chars.
2893 */
2894 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2895 if (areBlanks(ctxt, buf, nbchar)) {
2896 if (ctxt->sax->ignorableWhitespace != NULL)
2897 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2898 } else {
2899 htmlCheckParagraph(ctxt);
2900 if (ctxt->sax->characters != NULL)
2901 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2902 }
2903 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002904 } else {
2905 /*
2906 * Loop detection
2907 */
2908 if (cur == 0)
2909 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002910 }
2911}
2912
2913/**
2914 * htmlParseExternalID:
2915 * @ctxt: an HTML parser context
2916 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002917 *
2918 * Parse an External ID or a Public ID
2919 *
Owen Taylor3473f882001-02-23 17:55:21 +00002920 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2921 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2922 *
2923 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2924 *
2925 * Returns the function returns SystemLiteral and in the second
2926 * case publicID receives PubidLiteral, is strict is off
2927 * it is possible to return NULL and have publicID set.
2928 */
2929
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002930static xmlChar *
2931htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002932 xmlChar *URI = NULL;
2933
2934 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2935 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2936 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2937 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002938 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002939 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2940 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002941 }
2942 SKIP_BLANKS;
2943 URI = htmlParseSystemLiteral(ctxt);
2944 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002945 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2946 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002947 }
2948 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2949 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2950 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2951 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002952 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002953 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2954 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002955 }
2956 SKIP_BLANKS;
2957 *publicID = htmlParsePubidLiteral(ctxt);
2958 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002959 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2960 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2961 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002962 }
2963 SKIP_BLANKS;
2964 if ((CUR == '"') || (CUR == '\'')) {
2965 URI = htmlParseSystemLiteral(ctxt);
2966 }
2967 }
2968 return(URI);
2969}
2970
2971/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00002972 * xmlParsePI:
2973 * @ctxt: an XML parser context
2974 *
2975 * parse an XML Processing Instruction.
2976 *
2977 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2978 */
2979static void
2980htmlParsePI(htmlParserCtxtPtr ctxt) {
2981 xmlChar *buf = NULL;
2982 int len = 0;
2983 int size = HTML_PARSER_BUFFER_SIZE;
2984 int cur, l;
2985 const xmlChar *target;
2986 xmlParserInputState state;
2987 int count = 0;
2988
2989 if ((RAW == '<') && (NXT(1) == '?')) {
2990 state = ctxt->instate;
2991 ctxt->instate = XML_PARSER_PI;
2992 /*
2993 * this is a Processing Instruction.
2994 */
2995 SKIP(2);
2996 SHRINK;
2997
2998 /*
2999 * Parse the target name and check for special support like
3000 * namespace.
3001 */
3002 target = htmlParseName(ctxt);
3003 if (target != NULL) {
3004 if (RAW == '>') {
3005 SKIP(1);
3006
3007 /*
3008 * SAX: PI detected.
3009 */
3010 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3011 (ctxt->sax->processingInstruction != NULL))
3012 ctxt->sax->processingInstruction(ctxt->userData,
3013 target, NULL);
3014 ctxt->instate = state;
3015 return;
3016 }
3017 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3018 if (buf == NULL) {
3019 htmlErrMemory(ctxt, NULL);
3020 ctxt->instate = state;
3021 return;
3022 }
3023 cur = CUR;
3024 if (!IS_BLANK(cur)) {
3025 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3026 "ParsePI: PI %s space expected\n", target, NULL);
3027 }
3028 SKIP_BLANKS;
3029 cur = CUR_CHAR(l);
3030 while (IS_CHAR(cur) && (cur != '>')) {
3031 if (len + 5 >= size) {
3032 xmlChar *tmp;
3033
3034 size *= 2;
3035 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3036 if (tmp == NULL) {
3037 htmlErrMemory(ctxt, NULL);
3038 xmlFree(buf);
3039 ctxt->instate = state;
3040 return;
3041 }
3042 buf = tmp;
3043 }
3044 count++;
3045 if (count > 50) {
3046 GROW;
3047 count = 0;
3048 }
3049 COPY_BUF(l,buf,len,cur);
3050 NEXTL(l);
3051 cur = CUR_CHAR(l);
3052 if (cur == 0) {
3053 SHRINK;
3054 GROW;
3055 cur = CUR_CHAR(l);
3056 }
3057 }
3058 buf[len] = 0;
3059 if (cur != '>') {
3060 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3061 "ParsePI: PI %s never end ...\n", target, NULL);
3062 } else {
3063 SKIP(1);
3064
3065 /*
3066 * SAX: PI detected.
3067 */
3068 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3069 (ctxt->sax->processingInstruction != NULL))
3070 ctxt->sax->processingInstruction(ctxt->userData,
3071 target, buf);
3072 }
3073 xmlFree(buf);
3074 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003075 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003076 "PI is not started correctly", NULL, NULL);
3077 }
3078 ctxt->instate = state;
3079 }
3080}
3081
3082/**
Owen Taylor3473f882001-02-23 17:55:21 +00003083 * htmlParseComment:
3084 * @ctxt: an HTML parser context
3085 *
3086 * Parse an XML (SGML) comment <!-- .... -->
3087 *
3088 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3089 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003090static void
Owen Taylor3473f882001-02-23 17:55:21 +00003091htmlParseComment(htmlParserCtxtPtr ctxt) {
3092 xmlChar *buf = NULL;
3093 int len;
3094 int size = HTML_PARSER_BUFFER_SIZE;
3095 int q, ql;
3096 int r, rl;
3097 int cur, l;
3098 xmlParserInputState state;
3099
3100 /*
3101 * Check that there is a comment right here.
3102 */
3103 if ((RAW != '<') || (NXT(1) != '!') ||
3104 (NXT(2) != '-') || (NXT(3) != '-')) return;
3105
3106 state = ctxt->instate;
3107 ctxt->instate = XML_PARSER_COMMENT;
3108 SHRINK;
3109 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003110 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003111 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003112 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003113 ctxt->instate = state;
3114 return;
3115 }
3116 q = CUR_CHAR(ql);
3117 NEXTL(ql);
3118 r = CUR_CHAR(rl);
3119 NEXTL(rl);
3120 cur = CUR_CHAR(l);
3121 len = 0;
3122 while (IS_CHAR(cur) &&
3123 ((cur != '>') ||
3124 (r != '-') || (q != '-'))) {
3125 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003126 xmlChar *tmp;
3127
Owen Taylor3473f882001-02-23 17:55:21 +00003128 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003129 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3130 if (tmp == NULL) {
3131 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003132 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003133 ctxt->instate = state;
3134 return;
3135 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003136 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003137 }
3138 COPY_BUF(ql,buf,len,q);
3139 q = r;
3140 ql = rl;
3141 r = cur;
3142 rl = l;
3143 NEXTL(l);
3144 cur = CUR_CHAR(l);
3145 if (cur == 0) {
3146 SHRINK;
3147 GROW;
3148 cur = CUR_CHAR(l);
3149 }
3150 }
3151 buf[len] = 0;
3152 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003153 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3154 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003155 xmlFree(buf);
3156 } else {
3157 NEXT;
3158 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3159 (!ctxt->disableSAX))
3160 ctxt->sax->comment(ctxt->userData, buf);
3161 xmlFree(buf);
3162 }
3163 ctxt->instate = state;
3164}
3165
3166/**
3167 * htmlParseCharRef:
3168 * @ctxt: an HTML parser context
3169 *
3170 * parse Reference declarations
3171 *
3172 * [66] CharRef ::= '&#' [0-9]+ ';' |
3173 * '&#x' [0-9a-fA-F]+ ';'
3174 *
3175 * Returns the value parsed (as an int)
3176 */
3177int
3178htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3179 int val = 0;
3180
Daniel Veillarda03e3652004-11-02 18:45:30 +00003181 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3182 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3183 "htmlParseCharRef: context error\n",
3184 NULL, NULL);
3185 return(0);
3186 }
Owen Taylor3473f882001-02-23 17:55:21 +00003187 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003188 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003189 SKIP(3);
3190 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003191 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003192 val = val * 16 + (CUR - '0');
3193 else if ((CUR >= 'a') && (CUR <= 'f'))
3194 val = val * 16 + (CUR - 'a') + 10;
3195 else if ((CUR >= 'A') && (CUR <= 'F'))
3196 val = val * 16 + (CUR - 'A') + 10;
3197 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003198 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003199 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003200 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003201 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003202 }
3203 NEXT;
3204 }
3205 if (CUR == ';')
3206 NEXT;
3207 } else if ((CUR == '&') && (NXT(1) == '#')) {
3208 SKIP(2);
3209 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003210 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003211 val = val * 10 + (CUR - '0');
3212 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003213 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Daniel Veillard36de63e2008-04-03 09:05:05 +00003214 "htmlParseCharRef: missing semicolumn\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003215 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003216 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003217 }
3218 NEXT;
3219 }
3220 if (CUR == ';')
3221 NEXT;
3222 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003223 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3224 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003225 }
3226 /*
3227 * Check the value IS_CHAR ...
3228 */
3229 if (IS_CHAR(val)) {
3230 return(val);
3231 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003232 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3233 "htmlParseCharRef: invalid xmlChar value %d\n",
3234 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003235 }
3236 return(0);
3237}
3238
3239
3240/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003241 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003242 * @ctxt: an HTML parser context
3243 *
3244 * parse a DOCTYPE declaration
3245 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003246 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003247 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3248 */
3249
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003250static void
Owen Taylor3473f882001-02-23 17:55:21 +00003251htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003252 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003253 xmlChar *ExternalID = NULL;
3254 xmlChar *URI = NULL;
3255
3256 /*
3257 * We know that '<!DOCTYPE' has been detected.
3258 */
3259 SKIP(9);
3260
3261 SKIP_BLANKS;
3262
3263 /*
3264 * Parse the DOCTYPE name.
3265 */
3266 name = htmlParseName(ctxt);
3267 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003268 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3269 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3270 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003271 }
3272 /*
3273 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3274 */
3275
3276 SKIP_BLANKS;
3277
3278 /*
3279 * Check for SystemID and ExternalID
3280 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003281 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003282 SKIP_BLANKS;
3283
3284 /*
3285 * We should be at the end of the DOCTYPE declaration.
3286 */
3287 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003288 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3289 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003290 /* We shouldn't try to resynchronize ... */
3291 }
3292 NEXT;
3293
3294 /*
3295 * Create or update the document accordingly to the DOCTYPE
3296 */
3297 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3298 (!ctxt->disableSAX))
3299 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3300
3301 /*
3302 * Cleanup, since we don't use all those identifiers
3303 */
3304 if (URI != NULL) xmlFree(URI);
3305 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003306}
3307
3308/**
3309 * htmlParseAttribute:
3310 * @ctxt: an HTML parser context
3311 * @value: a xmlChar ** used to store the value of the attribute
3312 *
3313 * parse an attribute
3314 *
3315 * [41] Attribute ::= Name Eq AttValue
3316 *
3317 * [25] Eq ::= S? '=' S?
3318 *
3319 * With namespace:
3320 *
3321 * [NS 11] Attribute ::= QName Eq AttValue
3322 *
3323 * Also the case QName == xmlns:??? is handled independently as a namespace
3324 * definition.
3325 *
3326 * Returns the attribute name, and the value in *value.
3327 */
3328
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003329static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003330htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003331 const xmlChar *name;
3332 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003333
3334 *value = NULL;
3335 name = htmlParseHTMLName(ctxt);
3336 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003337 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3338 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003339 return(NULL);
3340 }
3341
3342 /*
3343 * read the value
3344 */
3345 SKIP_BLANKS;
3346 if (CUR == '=') {
3347 NEXT;
3348 SKIP_BLANKS;
3349 val = htmlParseAttValue(ctxt);
Daniel Veillardc47d2632006-10-17 16:13:27 +00003350 } else if (htmlIsBooleanAttr(name)) {
3351 /*
3352 * assume a minimized attribute
3353 */
3354 val = xmlStrdup(name);
Owen Taylor3473f882001-02-23 17:55:21 +00003355 }
3356
3357 *value = val;
3358 return(name);
3359}
3360
3361/**
3362 * htmlCheckEncoding:
3363 * @ctxt: an HTML parser context
3364 * @attvalue: the attribute value
3365 *
3366 * Checks an http-equiv attribute from a Meta tag to detect
3367 * the encoding
3368 * If a new encoding is detected the parser is switched to decode
3369 * it and pass UTF8
3370 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003371static void
Owen Taylor3473f882001-02-23 17:55:21 +00003372htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3373 const xmlChar *encoding;
3374
3375 if ((ctxt == NULL) || (attvalue == NULL))
3376 return;
3377
Daniel Veillarde77db162009-08-22 11:32:38 +02003378 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003379 if (ctxt->input->encoding != NULL)
3380 return;
3381
3382 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3383 if (encoding != NULL) {
3384 encoding += 8;
3385 } else {
3386 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3387 if (encoding != NULL)
3388 encoding += 9;
3389 }
3390 if (encoding != NULL) {
3391 xmlCharEncoding enc;
3392 xmlCharEncodingHandlerPtr handler;
3393
3394 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3395
3396 if (ctxt->input->encoding != NULL)
3397 xmlFree((xmlChar *) ctxt->input->encoding);
3398 ctxt->input->encoding = xmlStrdup(encoding);
3399
3400 enc = xmlParseCharEncoding((const char *) encoding);
3401 /*
3402 * registered set of known encodings
3403 */
3404 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003405 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003406 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3407 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3408 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3409 (ctxt->input->buf != NULL) &&
3410 (ctxt->input->buf->encoder == NULL)) {
3411 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3412 "htmlCheckEncoding: wrong encoding meta\n",
3413 NULL, NULL);
3414 } else {
3415 xmlSwitchEncoding(ctxt, enc);
3416 }
Owen Taylor3473f882001-02-23 17:55:21 +00003417 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3418 } else {
3419 /*
3420 * fallback for unknown encodings
3421 */
3422 handler = xmlFindCharEncodingHandler((const char *) encoding);
3423 if (handler != NULL) {
3424 xmlSwitchToEncoding(ctxt, handler);
3425 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3426 } else {
3427 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3428 }
3429 }
3430
3431 if ((ctxt->input->buf != NULL) &&
3432 (ctxt->input->buf->encoder != NULL) &&
3433 (ctxt->input->buf->raw != NULL) &&
3434 (ctxt->input->buf->buffer != NULL)) {
3435 int nbchars;
3436 int processed;
3437
3438 /*
3439 * convert as much as possible to the parser reading buffer.
3440 */
3441 processed = ctxt->input->cur - ctxt->input->base;
3442 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3443 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3444 ctxt->input->buf->buffer,
3445 ctxt->input->buf->raw);
3446 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003447 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3448 "htmlCheckEncoding: encoder error\n",
3449 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003450 }
3451 ctxt->input->base =
3452 ctxt->input->cur = ctxt->input->buf->buffer->content;
3453 }
3454 }
3455}
3456
3457/**
3458 * htmlCheckMeta:
3459 * @ctxt: an HTML parser context
3460 * @atts: the attributes values
3461 *
3462 * Checks an attributes from a Meta tag
3463 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003464static void
Owen Taylor3473f882001-02-23 17:55:21 +00003465htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3466 int i;
3467 const xmlChar *att, *value;
3468 int http = 0;
3469 const xmlChar *content = NULL;
3470
3471 if ((ctxt == NULL) || (atts == NULL))
3472 return;
3473
3474 i = 0;
3475 att = atts[i++];
3476 while (att != NULL) {
3477 value = atts[i++];
3478 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3479 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3480 http = 1;
3481 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3482 content = value;
3483 att = atts[i++];
3484 }
3485 if ((http) && (content != NULL))
3486 htmlCheckEncoding(ctxt, content);
3487
3488}
3489
3490/**
3491 * htmlParseStartTag:
3492 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003493 *
Owen Taylor3473f882001-02-23 17:55:21 +00003494 * parse a start of tag either for rule element or
3495 * EmptyElement. In both case we don't parse the tag closing chars.
3496 *
3497 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3498 *
3499 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3500 *
3501 * With namespace:
3502 *
3503 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3504 *
3505 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3506 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003507 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003508 */
3509
Daniel Veillard597f1c12005-07-03 23:00:18 +00003510static int
Owen Taylor3473f882001-02-23 17:55:21 +00003511htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003512 const xmlChar *name;
3513 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003514 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003515 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003516 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003517 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003518 int meta = 0;
3519 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003520 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003521
Daniel Veillarde77db162009-08-22 11:32:38 +02003522 if (ctxt->instate == XML_PARSER_EOF)
3523 return(-1);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003524 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3525 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3526 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003527 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003528 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003529 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003530 NEXT;
3531
Daniel Veillard30e76072006-03-09 14:13:55 +00003532 atts = ctxt->atts;
3533 maxatts = ctxt->maxatts;
3534
Owen Taylor3473f882001-02-23 17:55:21 +00003535 GROW;
3536 name = htmlParseHTMLName(ctxt);
3537 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003538 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3539 "htmlParseStartTag: invalid element name\n",
3540 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003541 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003542 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3543 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003544 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003545 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003546 }
3547 if (xmlStrEqual(name, BAD_CAST"meta"))
3548 meta = 1;
3549
3550 /*
3551 * Check for auto-closure of HTML elements.
3552 */
3553 htmlAutoClose(ctxt, name);
3554
3555 /*
3556 * Check for implied HTML elements.
3557 */
3558 htmlCheckImplied(ctxt, name);
3559
3560 /*
3561 * Avoid html at any level > 0, head at any level != 1
3562 * or any attempt to recurse body
3563 */
3564 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003565 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3566 "htmlParseStartTag: misplaced <html> tag\n",
3567 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003568 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003569 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003570 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003571 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003572 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003573 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3574 "htmlParseStartTag: misplaced <head> tag\n",
3575 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003576 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003577 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003578 }
3579 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003580 int indx;
3581 for (indx = 0;indx < ctxt->nameNr;indx++) {
3582 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003583 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3584 "htmlParseStartTag: misplaced <body> tag\n",
3585 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003586 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003587 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003588 }
3589 }
3590 }
3591
3592 /*
3593 * Now parse the attributes, it ends up with the ending
3594 *
3595 * (S Attribute)* S?
3596 */
3597 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003598 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003599 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003600 ((CUR != '/') || (NXT(1) != '>'))) {
3601 long cons = ctxt->nbChars;
3602
3603 GROW;
3604 attname = htmlParseAttribute(ctxt, &attvalue);
3605 if (attname != NULL) {
3606
3607 /*
3608 * Well formedness requires at most one declaration of an attribute
3609 */
3610 for (i = 0; i < nbatts;i += 2) {
3611 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003612 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3613 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003614 if (attvalue != NULL)
3615 xmlFree(attvalue);
3616 goto failed;
3617 }
3618 }
3619
3620 /*
3621 * Add the pair to atts
3622 */
3623 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003624 maxatts = 22; /* allow for 10 attrs by default */
3625 atts = (const xmlChar **)
3626 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003627 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003628 htmlErrMemory(ctxt, NULL);
3629 if (attvalue != NULL)
3630 xmlFree(attvalue);
3631 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003632 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003633 ctxt->atts = atts;
3634 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003635 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003636 const xmlChar **n;
3637
Owen Taylor3473f882001-02-23 17:55:21 +00003638 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003639 n = (const xmlChar **) xmlRealloc((void *) atts,
3640 maxatts * sizeof(const xmlChar *));
3641 if (n == NULL) {
3642 htmlErrMemory(ctxt, NULL);
3643 if (attvalue != NULL)
3644 xmlFree(attvalue);
3645 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003646 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003647 atts = n;
3648 ctxt->atts = atts;
3649 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003650 }
3651 atts[nbatts++] = attname;
3652 atts[nbatts++] = attvalue;
3653 atts[nbatts] = NULL;
3654 atts[nbatts + 1] = NULL;
3655 }
3656 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003657 if (attvalue != NULL)
3658 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003659 /* Dump the bogus attribute string up to the next blank or
3660 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003661 while ((IS_CHAR_CH(CUR)) &&
3662 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003663 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003664 NEXT;
3665 }
3666
3667failed:
3668 SKIP_BLANKS;
3669 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003670 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3671 "htmlParseStartTag: problem parsing attributes\n",
3672 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003673 break;
3674 }
3675 }
3676
3677 /*
3678 * Handle specific association to the META tag
3679 */
William M. Bracke978ae22007-03-21 06:16:02 +00003680 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003681 htmlCheckMeta(ctxt, atts);
3682
3683 /*
3684 * SAX: Start of Element !
3685 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003686 if (!discardtag) {
3687 htmlnamePush(ctxt, name);
3688 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3689 if (nbatts != 0)
3690 ctxt->sax->startElement(ctxt->userData, name, atts);
3691 else
3692 ctxt->sax->startElement(ctxt->userData, name, NULL);
3693 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003694 }
Owen Taylor3473f882001-02-23 17:55:21 +00003695
3696 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003697 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003698 if (atts[i] != NULL)
3699 xmlFree((xmlChar *) atts[i]);
3700 }
Owen Taylor3473f882001-02-23 17:55:21 +00003701 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003702
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003703 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003704}
3705
3706/**
3707 * htmlParseEndTag:
3708 * @ctxt: an HTML parser context
3709 *
3710 * parse an end of tag
3711 *
3712 * [42] ETag ::= '</' Name S? '>'
3713 *
3714 * With namespace
3715 *
3716 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003717 *
3718 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003719 */
3720
Daniel Veillardf420ac52001-07-04 16:04:09 +00003721static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003722htmlParseEndTag(htmlParserCtxtPtr ctxt)
3723{
3724 const xmlChar *name;
3725 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003726 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003727
3728 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003729 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3730 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003731 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003732 }
3733 SKIP(2);
3734
3735 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003736 if (name == NULL)
3737 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003738 /*
3739 * We should definitely be at the ending "S? '>'" part
3740 */
3741 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003742 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003743 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3744 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003745 if (ctxt->recovery) {
3746 /*
3747 * We're not at the ending > !!
3748 * Error, unless in recover mode where we search forwards
3749 * until we find a >
3750 */
3751 while (CUR != '\0' && CUR != '>') NEXT;
3752 NEXT;
3753 }
Owen Taylor3473f882001-02-23 17:55:21 +00003754 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003755 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003756
3757 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003758 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3759 * out now.
3760 */
3761 if ((ctxt->depth > 0) &&
3762 (xmlStrEqual(name, BAD_CAST "html") ||
3763 xmlStrEqual(name, BAD_CAST "body") ||
3764 xmlStrEqual(name, BAD_CAST "head"))) {
3765 ctxt->depth--;
3766 return (0);
3767 }
3768
3769 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003770 * If the name read is not one of the element in the parsing stack
3771 * then return, it's just an error.
3772 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003773 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3774 if (xmlStrEqual(name, ctxt->nameTab[i]))
3775 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003776 }
3777 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003778 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3779 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003780 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003781 }
3782
3783
3784 /*
3785 * Check for auto-closure of HTML elements.
3786 */
3787
3788 htmlAutoCloseOnClose(ctxt, name);
3789
3790 /*
3791 * Well formedness constraints, opening and closing must match.
3792 * With the exception that the autoclose may have popped stuff out
3793 * of the stack.
3794 */
3795 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003796 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003797 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3798 "Opening and ending tag mismatch: %s and %s\n",
3799 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003800 }
3801 }
3802
3803 /*
3804 * SAX: End of Tag
3805 */
3806 oldname = ctxt->name;
3807 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003808 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3809 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003810 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003811 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003812 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003813 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003814 }
3815
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003816 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003817}
3818
3819
3820/**
3821 * htmlParseReference:
3822 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003823 *
Owen Taylor3473f882001-02-23 17:55:21 +00003824 * parse and handle entity references in content,
3825 * this will end-up in a call to character() since this is either a
3826 * CharRef, or a predefined entity.
3827 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003828static void
Owen Taylor3473f882001-02-23 17:55:21 +00003829htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003830 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003831 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003832 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003833 if (CUR != '&') return;
3834
3835 if (NXT(1) == '#') {
3836 unsigned int c;
3837 int bits, i = 0;
3838
3839 c = htmlParseCharRef(ctxt);
3840 if (c == 0)
3841 return;
3842
3843 if (c < 0x80) { out[i++]= c; bits= -6; }
3844 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3845 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3846 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003847
Owen Taylor3473f882001-02-23 17:55:21 +00003848 for ( ; bits >= 0; bits-= 6) {
3849 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3850 }
3851 out[i] = 0;
3852
3853 htmlCheckParagraph(ctxt);
3854 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3855 ctxt->sax->characters(ctxt->userData, out, i);
3856 } else {
3857 ent = htmlParseEntityRef(ctxt, &name);
3858 if (name == NULL) {
3859 htmlCheckParagraph(ctxt);
3860 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3861 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3862 return;
3863 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003864 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003865 htmlCheckParagraph(ctxt);
3866 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3867 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3868 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3869 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3870 }
3871 } else {
3872 unsigned int c;
3873 int bits, i = 0;
3874
3875 c = ent->value;
3876 if (c < 0x80)
3877 { out[i++]= c; bits= -6; }
3878 else if (c < 0x800)
3879 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3880 else if (c < 0x10000)
3881 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003882 else
Owen Taylor3473f882001-02-23 17:55:21 +00003883 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003884
Owen Taylor3473f882001-02-23 17:55:21 +00003885 for ( ; bits >= 0; bits-= 6) {
3886 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3887 }
3888 out[i] = 0;
3889
3890 htmlCheckParagraph(ctxt);
3891 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3892 ctxt->sax->characters(ctxt->userData, out, i);
3893 }
Owen Taylor3473f882001-02-23 17:55:21 +00003894 }
3895}
3896
3897/**
3898 * htmlParseContent:
3899 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003900 *
3901 * Parse a content: comment, sub-element, reference or text.
Owen Taylor3473f882001-02-23 17:55:21 +00003902 */
3903
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003904static void
Owen Taylor3473f882001-02-23 17:55:21 +00003905htmlParseContent(htmlParserCtxtPtr ctxt) {
3906 xmlChar *currentNode;
3907 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003908 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003909
3910 currentNode = xmlStrdup(ctxt->name);
3911 depth = ctxt->nameNr;
3912 while (1) {
3913 long cons = ctxt->nbChars;
3914
3915 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02003916
3917 if (ctxt->instate == XML_PARSER_EOF)
3918 break;
3919
Owen Taylor3473f882001-02-23 17:55:21 +00003920 /*
3921 * Our tag or one of it's parent or children is ending.
3922 */
3923 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003924 if (htmlParseEndTag(ctxt) &&
3925 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3926 if (currentNode != NULL)
3927 xmlFree(currentNode);
3928 return;
3929 }
3930 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003931 }
3932
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003933 else if ((CUR == '<') &&
3934 ((IS_ASCII_LETTER(NXT(1))) ||
3935 (NXT(1) == '_') || (NXT(1) == ':'))) {
3936 name = htmlParseHTMLName_nonInvasive(ctxt);
3937 if (name == NULL) {
3938 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3939 "htmlParseStartTag: invalid element name\n",
3940 NULL, NULL);
3941 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003942 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003943 NEXT;
3944
3945 if (currentNode != NULL)
3946 xmlFree(currentNode);
3947 return;
3948 }
3949
3950 if (ctxt->name != NULL) {
3951 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3952 htmlAutoClose(ctxt, name);
3953 continue;
3954 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003955 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003956 }
3957
Owen Taylor3473f882001-02-23 17:55:21 +00003958 /*
3959 * Has this node been popped out during parsing of
3960 * the next element
3961 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003962 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3963 (!xmlStrEqual(currentNode, ctxt->name)))
3964 {
Owen Taylor3473f882001-02-23 17:55:21 +00003965 if (currentNode != NULL) xmlFree(currentNode);
3966 return;
3967 }
3968
Daniel Veillardf9533d12001-03-03 10:04:57 +00003969 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3970 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003971 /*
3972 * Handle SCRIPT/STYLE separately
3973 */
3974 htmlParseScript(ctxt);
3975 } else {
3976 /*
3977 * Sometimes DOCTYPE arrives in the middle of the document
3978 */
3979 if ((CUR == '<') && (NXT(1) == '!') &&
3980 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3981 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3982 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3983 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003984 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3985 "Misplaced DOCTYPE declaration\n",
3986 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003987 htmlParseDocTypeDecl(ctxt);
3988 }
3989
3990 /*
3991 * First case : a comment
3992 */
3993 if ((CUR == '<') && (NXT(1) == '!') &&
3994 (NXT(2) == '-') && (NXT(3) == '-')) {
3995 htmlParseComment(ctxt);
3996 }
3997
3998 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003999 * Second case : a Processing Instruction.
4000 */
4001 else if ((CUR == '<') && (NXT(1) == '?')) {
4002 htmlParsePI(ctxt);
4003 }
4004
4005 /*
4006 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004007 */
4008 else if (CUR == '<') {
4009 htmlParseElement(ctxt);
4010 }
4011
4012 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004013 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004014 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004015 */
4016 else if (CUR == '&') {
4017 htmlParseReference(ctxt);
4018 }
4019
4020 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004021 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004022 */
4023 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004024 htmlAutoCloseOnEnd(ctxt);
4025 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004026 }
4027
4028 /*
4029 * Last case, text. Note that References are handled directly.
4030 */
4031 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004032 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004033 }
4034
4035 if (cons == ctxt->nbChars) {
4036 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004037 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4038 "detected an error in element content\n",
4039 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004040 }
4041 break;
4042 }
4043 }
4044 GROW;
4045 }
4046 if (currentNode != NULL) xmlFree(currentNode);
4047}
4048
4049/**
Daniel Veillard499cc922006-01-18 17:22:35 +00004050 * htmlParseContent:
4051 * @ctxt: an HTML parser context
4052 *
4053 * Parse a content: comment, sub-element, reference or text.
4054 */
4055
4056void
4057__htmlParseContent(void *ctxt) {
4058 if (ctxt != NULL)
4059 htmlParseContent((htmlParserCtxtPtr) ctxt);
4060}
4061
4062/**
Owen Taylor3473f882001-02-23 17:55:21 +00004063 * htmlParseElement:
4064 * @ctxt: an HTML parser context
4065 *
4066 * parse an HTML element, this is highly recursive
4067 *
4068 * [39] element ::= EmptyElemTag | STag content ETag
4069 *
4070 * [41] Attribute ::= Name Eq AttValue
4071 */
4072
4073void
4074htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004075 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004076 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004077 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004078 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004079 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004080 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004081 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004082
Daniel Veillarda03e3652004-11-02 18:45:30 +00004083 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4084 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004085 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004086 return;
4087 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004088
4089 if (ctxt->instate == XML_PARSER_EOF)
4090 return;
4091
Owen Taylor3473f882001-02-23 17:55:21 +00004092 /* Capture start position */
4093 if (ctxt->record_info) {
4094 node_info.begin_pos = ctxt->input->consumed +
4095 (CUR_PTR - ctxt->input->base);
4096 node_info.begin_line = ctxt->input->line;
4097 }
4098
Daniel Veillard597f1c12005-07-03 23:00:18 +00004099 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004100 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004101 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004102 if (CUR == '>')
4103 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004104 return;
4105 }
Owen Taylor3473f882001-02-23 17:55:21 +00004106
4107 /*
4108 * Lookup the info for that element.
4109 */
4110 info = htmlTagLookup(name);
4111 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004112 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4113 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004114 }
4115
4116 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004117 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004118 */
4119 if ((CUR == '/') && (NXT(1) == '>')) {
4120 SKIP(2);
4121 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4122 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004123 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004124 return;
4125 }
4126
4127 if (CUR == '>') {
4128 NEXT;
4129 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004130 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4131 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004132
4133 /*
4134 * end of parsing of this node.
4135 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004136 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004137 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004138 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004139 }
Owen Taylor3473f882001-02-23 17:55:21 +00004140
4141 /*
4142 * Capture end position and add node
4143 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004144 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004145 node_info.end_pos = ctxt->input->consumed +
4146 (CUR_PTR - ctxt->input->base);
4147 node_info.end_line = ctxt->input->line;
4148 node_info.node = ctxt->node;
4149 xmlParserAddNodeInfo(ctxt, &node_info);
4150 }
4151 return;
4152 }
4153
4154 /*
4155 * Check for an Empty Element from DTD definition
4156 */
4157 if ((info != NULL) && (info->empty)) {
4158 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4159 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004160 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004161 return;
4162 }
4163
4164 /*
4165 * Parse the content of the element:
4166 */
4167 currentNode = xmlStrdup(ctxt->name);
4168 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004169 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004170 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004171 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004172 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004173 if (ctxt->nameNr < depth) break;
4174 }
Owen Taylor3473f882001-02-23 17:55:21 +00004175
Owen Taylor3473f882001-02-23 17:55:21 +00004176 /*
4177 * Capture end position and add node
4178 */
4179 if ( currentNode != NULL && ctxt->record_info ) {
4180 node_info.end_pos = ctxt->input->consumed +
4181 (CUR_PTR - ctxt->input->base);
4182 node_info.end_line = ctxt->input->line;
4183 node_info.node = ctxt->node;
4184 xmlParserAddNodeInfo(ctxt, &node_info);
4185 }
William M. Brack76e95df2003-10-18 16:20:14 +00004186 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004187 htmlAutoCloseOnEnd(ctxt);
4188 }
4189
Owen Taylor3473f882001-02-23 17:55:21 +00004190 if (currentNode != NULL)
4191 xmlFree(currentNode);
4192}
4193
4194/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004195 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004196 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004197 *
Owen Taylor3473f882001-02-23 17:55:21 +00004198 * parse an HTML document (and build a tree if using the standard SAX
4199 * interface).
4200 *
4201 * Returns 0, -1 in case of error. the parser context is augmented
4202 * as a result of the parsing.
4203 */
4204
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004205int
Owen Taylor3473f882001-02-23 17:55:21 +00004206htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004207 xmlChar start[4];
4208 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004209 xmlDtdPtr dtd;
4210
Daniel Veillardd0463562001-10-13 09:15:48 +00004211 xmlInitParser();
4212
Owen Taylor3473f882001-02-23 17:55:21 +00004213 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004214
Daniel Veillarda03e3652004-11-02 18:45:30 +00004215 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4216 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4217 "htmlParseDocument: context error\n", NULL, NULL);
4218 return(XML_ERR_INTERNAL_ERROR);
4219 }
4220 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004221 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004222 GROW;
4223 /*
4224 * SAX: beginning of the document processing.
4225 */
4226 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4227 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4228
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004229 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4230 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4231 /*
4232 * Get the 4 first bytes and decode the charset
4233 * if enc != XML_CHAR_ENCODING_NONE
4234 * plug some encoding conversion routines.
4235 */
4236 start[0] = RAW;
4237 start[1] = NXT(1);
4238 start[2] = NXT(2);
4239 start[3] = NXT(3);
4240 enc = xmlDetectCharEncoding(&start[0], 4);
4241 if (enc != XML_CHAR_ENCODING_NONE) {
4242 xmlSwitchEncoding(ctxt, enc);
4243 }
4244 }
4245
Owen Taylor3473f882001-02-23 17:55:21 +00004246 /*
4247 * Wipe out everything which is before the first '<'
4248 */
4249 SKIP_BLANKS;
4250 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004251 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004252 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004253 }
4254
4255 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4256 ctxt->sax->startDocument(ctxt->userData);
4257
4258
4259 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004260 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004261 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004262 while (((CUR == '<') && (NXT(1) == '!') &&
4263 (NXT(2) == '-') && (NXT(3) == '-')) ||
4264 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004265 htmlParseComment(ctxt);
4266 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004267 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004268 }
Owen Taylor3473f882001-02-23 17:55:21 +00004269
4270
4271 /*
4272 * Then possibly doc type declaration(s) and more Misc
4273 * (doctypedecl Misc*)?
4274 */
4275 if ((CUR == '<') && (NXT(1) == '!') &&
4276 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4277 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4278 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4279 (UPP(8) == 'E')) {
4280 htmlParseDocTypeDecl(ctxt);
4281 }
4282 SKIP_BLANKS;
4283
4284 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004285 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004286 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004287 while (((CUR == '<') && (NXT(1) == '!') &&
4288 (NXT(2) == '-') && (NXT(3) == '-')) ||
4289 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004290 htmlParseComment(ctxt);
4291 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004292 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004293 }
Owen Taylor3473f882001-02-23 17:55:21 +00004294
4295 /*
4296 * Time to start parsing the tree itself
4297 */
4298 htmlParseContent(ctxt);
4299
4300 /*
4301 * autoclose
4302 */
4303 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004304 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004305
4306
4307 /*
4308 * SAX: end of the document processing.
4309 */
4310 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4311 ctxt->sax->endDocument(ctxt->userData);
4312
4313 if (ctxt->myDoc != NULL) {
4314 dtd = xmlGetIntSubset(ctxt->myDoc);
4315 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004316 ctxt->myDoc->intSubset =
4317 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004318 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4319 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4320 }
4321 if (! ctxt->wellFormed) return(-1);
4322 return(0);
4323}
4324
4325
4326/************************************************************************
4327 * *
4328 * Parser contexts handling *
4329 * *
4330 ************************************************************************/
4331
4332/**
William M. Brackedb65a72004-02-06 07:36:04 +00004333 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004334 * @ctxt: an HTML parser context
4335 *
4336 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004337 *
4338 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004339 */
4340
Daniel Veillardf403d292003-10-05 13:51:35 +00004341static int
Owen Taylor3473f882001-02-23 17:55:21 +00004342htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4343{
4344 htmlSAXHandler *sax;
4345
Daniel Veillardf403d292003-10-05 13:51:35 +00004346 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004347 memset(ctxt, 0, sizeof(htmlParserCtxt));
4348
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004349 ctxt->dict = xmlDictCreate();
4350 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004351 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4352 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004353 }
Owen Taylor3473f882001-02-23 17:55:21 +00004354 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4355 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004356 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4357 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004358 }
4359 else
4360 memset(sax, 0, sizeof(htmlSAXHandler));
4361
4362 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004363 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004364 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4365 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004366 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004367 ctxt->inputNr = 0;
4368 ctxt->inputMax = 0;
4369 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004370 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004371 }
4372 ctxt->inputNr = 0;
4373 ctxt->inputMax = 5;
4374 ctxt->input = NULL;
4375 ctxt->version = NULL;
4376 ctxt->encoding = NULL;
4377 ctxt->standalone = -1;
4378 ctxt->instate = XML_PARSER_START;
4379
4380 /* Allocate the Node stack */
4381 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4382 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004383 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004384 ctxt->nodeNr = 0;
4385 ctxt->nodeMax = 0;
4386 ctxt->node = NULL;
4387 ctxt->inputNr = 0;
4388 ctxt->inputMax = 0;
4389 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004390 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004391 }
4392 ctxt->nodeNr = 0;
4393 ctxt->nodeMax = 10;
4394 ctxt->node = NULL;
4395
4396 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004397 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004398 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004399 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004400 ctxt->nameNr = 0;
4401 ctxt->nameMax = 10;
4402 ctxt->name = NULL;
4403 ctxt->nodeNr = 0;
4404 ctxt->nodeMax = 0;
4405 ctxt->node = NULL;
4406 ctxt->inputNr = 0;
4407 ctxt->inputMax = 0;
4408 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004409 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004410 }
4411 ctxt->nameNr = 0;
4412 ctxt->nameMax = 10;
4413 ctxt->name = NULL;
4414
Daniel Veillard092643b2003-09-25 14:29:29 +00004415 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004416 else {
4417 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004418 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004419 }
4420 ctxt->userData = ctxt;
4421 ctxt->myDoc = NULL;
4422 ctxt->wellFormed = 1;
4423 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004424 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004425 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004426 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004427 ctxt->vctxt.userData = ctxt;
4428 ctxt->vctxt.error = xmlParserValidityError;
4429 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004430 ctxt->record_info = 0;
4431 ctxt->validate = 0;
4432 ctxt->nbChars = 0;
4433 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004434 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004435 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004436 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004437}
4438
4439/**
4440 * htmlFreeParserCtxt:
4441 * @ctxt: an HTML parser context
4442 *
4443 * Free all the memory used by a parser context. However the parsed
4444 * document in ctxt->myDoc is not freed.
4445 */
4446
4447void
4448htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4449{
4450 xmlFreeParserCtxt(ctxt);
4451}
4452
4453/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004454 * htmlNewParserCtxt:
4455 *
4456 * Allocate and initialize a new parser context.
4457 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004458 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004459 */
4460
Daniel Veillard34c647c2006-09-21 06:53:59 +00004461htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004462htmlNewParserCtxt(void)
4463{
4464 xmlParserCtxtPtr ctxt;
4465
4466 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4467 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004468 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004469 return(NULL);
4470 }
4471 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004472 if (htmlInitParserCtxt(ctxt) < 0) {
4473 htmlFreeParserCtxt(ctxt);
4474 return(NULL);
4475 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004476 return(ctxt);
4477}
4478
4479/**
4480 * htmlCreateMemoryParserCtxt:
4481 * @buffer: a pointer to a char array
4482 * @size: the size of the array
4483 *
4484 * Create a parser context for an HTML in-memory document.
4485 *
4486 * Returns the new parser context or NULL
4487 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004488htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004489htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4490 xmlParserCtxtPtr ctxt;
4491 xmlParserInputPtr input;
4492 xmlParserInputBufferPtr buf;
4493
4494 if (buffer == NULL)
4495 return(NULL);
4496 if (size <= 0)
4497 return(NULL);
4498
4499 ctxt = htmlNewParserCtxt();
4500 if (ctxt == NULL)
4501 return(NULL);
4502
4503 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4504 if (buf == NULL) return(NULL);
4505
4506 input = xmlNewInputStream(ctxt);
4507 if (input == NULL) {
4508 xmlFreeParserCtxt(ctxt);
4509 return(NULL);
4510 }
4511
4512 input->filename = NULL;
4513 input->buf = buf;
4514 input->base = input->buf->buffer->content;
4515 input->cur = input->buf->buffer->content;
4516 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4517
4518 inputPush(ctxt, input);
4519 return(ctxt);
4520}
4521
4522/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004523 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004524 * @cur: a pointer to an array of xmlChar
4525 * @encoding: a free form C string describing the HTML document encoding, or NULL
4526 *
4527 * Create a parser context for an HTML document.
4528 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004529 * TODO: check the need to add encoding handling there
4530 *
Owen Taylor3473f882001-02-23 17:55:21 +00004531 * Returns the new parser context or NULL
4532 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004533static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004534htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004535 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004536 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004537
Daniel Veillard1d995272002-07-22 16:43:32 +00004538 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004539 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004540 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004541 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004542 if (ctxt == NULL)
4543 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004544
4545 if (encoding != NULL) {
4546 xmlCharEncoding enc;
4547 xmlCharEncodingHandlerPtr handler;
4548
4549 if (ctxt->input->encoding != NULL)
4550 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004551 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004552
4553 enc = xmlParseCharEncoding(encoding);
4554 /*
4555 * registered set of known encodings
4556 */
4557 if (enc != XML_CHAR_ENCODING_ERROR) {
4558 xmlSwitchEncoding(ctxt, enc);
4559 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004560 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02004561 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00004562 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004563 }
4564 } else {
4565 /*
4566 * fallback for unknown encodings
4567 */
4568 handler = xmlFindCharEncodingHandler((const char *) encoding);
4569 if (handler != NULL) {
4570 xmlSwitchToEncoding(ctxt, handler);
4571 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004572 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4573 "Unsupported encoding %s\n",
4574 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004575 }
4576 }
4577 }
4578 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004579}
4580
Daniel Veillard73b013f2003-09-30 12:36:01 +00004581#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004582/************************************************************************
4583 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02004584 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00004585 * *
4586 ************************************************************************/
4587
4588/**
4589 * htmlParseLookupSequence:
4590 * @ctxt: an HTML parser context
4591 * @first: the first char to lookup
4592 * @next: the next char to lookup or zero
4593 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004594 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004595 *
4596 * Try to find if a sequence (first, next, third) or just (first next) or
4597 * (first) is available in the input stream.
4598 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4599 * to avoid rescanning sequences of bytes, it DOES change the state of the
4600 * parser, do not use liberally.
4601 * This is basically similar to xmlParseLookupSequence()
4602 *
4603 * Returns the index to the current parsing point if the full sequence
4604 * is available, -1 otherwise.
4605 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004606static int
Owen Taylor3473f882001-02-23 17:55:21 +00004607htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02004608 xmlChar next, xmlChar third, int iscomment,
4609 int ignoreattrval) {
Owen Taylor3473f882001-02-23 17:55:21 +00004610 int base, len;
4611 htmlParserInputPtr in;
4612 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004613 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02004614 int invalue = 0;
4615 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00004616
4617 in = ctxt->input;
4618 if (in == NULL) return(-1);
4619 base = in->cur - in->base;
4620 if (base < 0) return(-1);
4621 if (ctxt->checkIndex > base)
4622 base = ctxt->checkIndex;
4623 if (in->buf == NULL) {
4624 buf = in->base;
4625 len = in->length;
4626 } else {
4627 buf = in->buf->buffer->content;
4628 len = in->buf->buffer->use;
4629 }
4630 /* take into account the sequence length */
4631 if (third) len -= 2;
4632 else if (next) len --;
4633 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004634 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004635 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4636 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4637 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004638 /* do not increment past <! - some people use <!--> */
4639 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004640 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004641 }
Jiri Netolicky446e1262009-08-07 17:05:36 +02004642 if (ignoreattrval) {
4643 if (buf[base] == '"' || buf[base] == '\'') {
4644 if (invalue) {
4645 if (buf[base] == valdellim) {
4646 invalue = 0;
4647 continue;
4648 }
4649 } else {
4650 valdellim = buf[base];
4651 invalue = 1;
4652 continue;
4653 }
4654 } else if (invalue) {
4655 continue;
4656 }
4657 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004658 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004659 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004660 return(-1);
4661 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4662 (buf[base + 2] == '>')) {
4663 incomment = 0;
4664 base += 2;
4665 }
4666 continue;
4667 }
Owen Taylor3473f882001-02-23 17:55:21 +00004668 if (buf[base] == first) {
4669 if (third != 0) {
4670 if ((buf[base + 1] != next) ||
4671 (buf[base + 2] != third)) continue;
4672 } else if (next != 0) {
4673 if (buf[base + 1] != next) continue;
4674 }
4675 ctxt->checkIndex = 0;
4676#ifdef DEBUG_PUSH
4677 if (next == 0)
4678 xmlGenericError(xmlGenericErrorContext,
4679 "HPP: lookup '%c' found at %d\n",
4680 first, base);
4681 else if (third == 0)
4682 xmlGenericError(xmlGenericErrorContext,
4683 "HPP: lookup '%c%c' found at %d\n",
4684 first, next, base);
Daniel Veillarde77db162009-08-22 11:32:38 +02004685 else
Owen Taylor3473f882001-02-23 17:55:21 +00004686 xmlGenericError(xmlGenericErrorContext,
4687 "HPP: lookup '%c%c%c' found at %d\n",
4688 first, next, third, base);
4689#endif
4690 return(base - (in->cur - in->base));
4691 }
4692 }
4693 ctxt->checkIndex = base;
4694#ifdef DEBUG_PUSH
4695 if (next == 0)
4696 xmlGenericError(xmlGenericErrorContext,
4697 "HPP: lookup '%c' failed\n", first);
4698 else if (third == 0)
4699 xmlGenericError(xmlGenericErrorContext,
4700 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02004701 else
Owen Taylor3473f882001-02-23 17:55:21 +00004702 xmlGenericError(xmlGenericErrorContext,
4703 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4704#endif
4705 return(-1);
4706}
4707
4708/**
4709 * htmlParseTryOrFinish:
4710 * @ctxt: an HTML parser context
4711 * @terminate: last chunk indicator
4712 *
4713 * Try to progress on parsing
4714 *
4715 * Returns zero if no parsing was possible
4716 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004717static int
Owen Taylor3473f882001-02-23 17:55:21 +00004718htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4719 int ret = 0;
4720 htmlParserInputPtr in;
4721 int avail = 0;
4722 xmlChar cur, next;
4723
4724#ifdef DEBUG_PUSH
4725 switch (ctxt->instate) {
4726 case XML_PARSER_EOF:
4727 xmlGenericError(xmlGenericErrorContext,
4728 "HPP: try EOF\n"); break;
4729 case XML_PARSER_START:
4730 xmlGenericError(xmlGenericErrorContext,
4731 "HPP: try START\n"); break;
4732 case XML_PARSER_MISC:
4733 xmlGenericError(xmlGenericErrorContext,
4734 "HPP: try MISC\n");break;
4735 case XML_PARSER_COMMENT:
4736 xmlGenericError(xmlGenericErrorContext,
4737 "HPP: try COMMENT\n");break;
4738 case XML_PARSER_PROLOG:
4739 xmlGenericError(xmlGenericErrorContext,
4740 "HPP: try PROLOG\n");break;
4741 case XML_PARSER_START_TAG:
4742 xmlGenericError(xmlGenericErrorContext,
4743 "HPP: try START_TAG\n");break;
4744 case XML_PARSER_CONTENT:
4745 xmlGenericError(xmlGenericErrorContext,
4746 "HPP: try CONTENT\n");break;
4747 case XML_PARSER_CDATA_SECTION:
4748 xmlGenericError(xmlGenericErrorContext,
4749 "HPP: try CDATA_SECTION\n");break;
4750 case XML_PARSER_END_TAG:
4751 xmlGenericError(xmlGenericErrorContext,
4752 "HPP: try END_TAG\n");break;
4753 case XML_PARSER_ENTITY_DECL:
4754 xmlGenericError(xmlGenericErrorContext,
4755 "HPP: try ENTITY_DECL\n");break;
4756 case XML_PARSER_ENTITY_VALUE:
4757 xmlGenericError(xmlGenericErrorContext,
4758 "HPP: try ENTITY_VALUE\n");break;
4759 case XML_PARSER_ATTRIBUTE_VALUE:
4760 xmlGenericError(xmlGenericErrorContext,
4761 "HPP: try ATTRIBUTE_VALUE\n");break;
4762 case XML_PARSER_DTD:
4763 xmlGenericError(xmlGenericErrorContext,
4764 "HPP: try DTD\n");break;
4765 case XML_PARSER_EPILOG:
4766 xmlGenericError(xmlGenericErrorContext,
4767 "HPP: try EPILOG\n");break;
4768 case XML_PARSER_PI:
4769 xmlGenericError(xmlGenericErrorContext,
4770 "HPP: try PI\n");break;
4771 case XML_PARSER_SYSTEM_LITERAL:
4772 xmlGenericError(xmlGenericErrorContext,
4773 "HPP: try SYSTEM_LITERAL\n");break;
4774 }
4775#endif
4776
4777 while (1) {
4778
4779 in = ctxt->input;
4780 if (in == NULL) break;
4781 if (in->buf == NULL)
4782 avail = in->length - (in->cur - in->base);
4783 else
4784 avail = in->buf->buffer->use - (in->cur - in->base);
4785 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004786 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004787 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004788 /*
4789 * SAX: end of the document processing.
4790 */
4791 ctxt->instate = XML_PARSER_EOF;
4792 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4793 ctxt->sax->endDocument(ctxt->userData);
4794 }
4795 }
4796 if (avail < 1)
4797 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004798 cur = in->cur[0];
4799 if (cur == 0) {
4800 SKIP(1);
4801 continue;
4802 }
4803
Owen Taylor3473f882001-02-23 17:55:21 +00004804 switch (ctxt->instate) {
4805 case XML_PARSER_EOF:
4806 /*
4807 * Document parsing is done !
4808 */
4809 goto done;
4810 case XML_PARSER_START:
4811 /*
4812 * Very first chars read from the document flow.
4813 */
4814 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004815 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004816 SKIP_BLANKS;
4817 if (in->buf == NULL)
4818 avail = in->length - (in->cur - in->base);
4819 else
4820 avail = in->buf->buffer->use - (in->cur - in->base);
4821 }
4822 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4823 ctxt->sax->setDocumentLocator(ctxt->userData,
4824 &xmlDefaultSAXLocator);
4825 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4826 (!ctxt->disableSAX))
4827 ctxt->sax->startDocument(ctxt->userData);
4828
4829 cur = in->cur[0];
4830 next = in->cur[1];
4831 if ((cur == '<') && (next == '!') &&
4832 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4833 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4834 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4835 (UPP(8) == 'E')) {
4836 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004837 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004838 goto done;
4839#ifdef DEBUG_PUSH
4840 xmlGenericError(xmlGenericErrorContext,
4841 "HPP: Parsing internal subset\n");
4842#endif
4843 htmlParseDocTypeDecl(ctxt);
4844 ctxt->instate = XML_PARSER_PROLOG;
4845#ifdef DEBUG_PUSH
4846 xmlGenericError(xmlGenericErrorContext,
4847 "HPP: entering PROLOG\n");
4848#endif
4849 } else {
4850 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004851#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00004852 xmlGenericError(xmlGenericErrorContext,
4853 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004854#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00004855 }
Owen Taylor3473f882001-02-23 17:55:21 +00004856 break;
4857 case XML_PARSER_MISC:
4858 SKIP_BLANKS;
4859 if (in->buf == NULL)
4860 avail = in->length - (in->cur - in->base);
4861 else
4862 avail = in->buf->buffer->use - (in->cur - in->base);
4863 if (avail < 2)
4864 goto done;
4865 cur = in->cur[0];
4866 next = in->cur[1];
4867 if ((cur == '<') && (next == '!') &&
4868 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4869 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004870 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004871 goto done;
4872#ifdef DEBUG_PUSH
4873 xmlGenericError(xmlGenericErrorContext,
4874 "HPP: Parsing Comment\n");
4875#endif
4876 htmlParseComment(ctxt);
4877 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004878 } else if ((cur == '<') && (next == '?')) {
4879 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004880 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004881 goto done;
4882#ifdef DEBUG_PUSH
4883 xmlGenericError(xmlGenericErrorContext,
4884 "HPP: Parsing PI\n");
4885#endif
4886 htmlParsePI(ctxt);
4887 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00004888 } else if ((cur == '<') && (next == '!') &&
4889 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4890 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4891 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4892 (UPP(8) == 'E')) {
4893 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004894 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004895 goto done;
4896#ifdef DEBUG_PUSH
4897 xmlGenericError(xmlGenericErrorContext,
4898 "HPP: Parsing internal subset\n");
4899#endif
4900 htmlParseDocTypeDecl(ctxt);
4901 ctxt->instate = XML_PARSER_PROLOG;
4902#ifdef DEBUG_PUSH
4903 xmlGenericError(xmlGenericErrorContext,
4904 "HPP: entering PROLOG\n");
4905#endif
4906 } else if ((cur == '<') && (next == '!') &&
4907 (avail < 9)) {
4908 goto done;
4909 } else {
4910 ctxt->instate = XML_PARSER_START_TAG;
4911#ifdef DEBUG_PUSH
4912 xmlGenericError(xmlGenericErrorContext,
4913 "HPP: entering START_TAG\n");
4914#endif
4915 }
4916 break;
4917 case XML_PARSER_PROLOG:
4918 SKIP_BLANKS;
4919 if (in->buf == NULL)
4920 avail = in->length - (in->cur - in->base);
4921 else
4922 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02004923 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00004924 goto done;
4925 cur = in->cur[0];
4926 next = in->cur[1];
4927 if ((cur == '<') && (next == '!') &&
4928 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4929 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004930 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004931 goto done;
4932#ifdef DEBUG_PUSH
4933 xmlGenericError(xmlGenericErrorContext,
4934 "HPP: Parsing Comment\n");
4935#endif
4936 htmlParseComment(ctxt);
4937 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004938 } else if ((cur == '<') && (next == '?')) {
4939 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004940 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004941 goto done;
4942#ifdef DEBUG_PUSH
4943 xmlGenericError(xmlGenericErrorContext,
4944 "HPP: Parsing PI\n");
4945#endif
4946 htmlParsePI(ctxt);
4947 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004948 } else if ((cur == '<') && (next == '!') &&
4949 (avail < 4)) {
4950 goto done;
4951 } else {
4952 ctxt->instate = XML_PARSER_START_TAG;
4953#ifdef DEBUG_PUSH
4954 xmlGenericError(xmlGenericErrorContext,
4955 "HPP: entering START_TAG\n");
4956#endif
4957 }
4958 break;
4959 case XML_PARSER_EPILOG:
4960 if (in->buf == NULL)
4961 avail = in->length - (in->cur - in->base);
4962 else
4963 avail = in->buf->buffer->use - (in->cur - in->base);
4964 if (avail < 1)
4965 goto done;
4966 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004967 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004968 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004969 goto done;
4970 }
4971 if (avail < 2)
4972 goto done;
4973 next = in->cur[1];
4974 if ((cur == '<') && (next == '!') &&
4975 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4976 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004977 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004978 goto done;
4979#ifdef DEBUG_PUSH
4980 xmlGenericError(xmlGenericErrorContext,
4981 "HPP: Parsing Comment\n");
4982#endif
4983 htmlParseComment(ctxt);
4984 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004985 } else if ((cur == '<') && (next == '?')) {
4986 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02004987 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004988 goto done;
4989#ifdef DEBUG_PUSH
4990 xmlGenericError(xmlGenericErrorContext,
4991 "HPP: Parsing PI\n");
4992#endif
4993 htmlParsePI(ctxt);
4994 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00004995 } else if ((cur == '<') && (next == '!') &&
4996 (avail < 4)) {
4997 goto done;
4998 } else {
4999 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005000 ctxt->wellFormed = 0;
5001 ctxt->instate = XML_PARSER_EOF;
5002#ifdef DEBUG_PUSH
5003 xmlGenericError(xmlGenericErrorContext,
5004 "HPP: entering EOF\n");
5005#endif
5006 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5007 ctxt->sax->endDocument(ctxt->userData);
5008 goto done;
5009 }
5010 break;
5011 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005012 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005013 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005014 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005015
5016 if (avail < 2)
5017 goto done;
5018 cur = in->cur[0];
5019 if (cur != '<') {
5020 ctxt->instate = XML_PARSER_CONTENT;
5021#ifdef DEBUG_PUSH
5022 xmlGenericError(xmlGenericErrorContext,
5023 "HPP: entering CONTENT\n");
5024#endif
5025 break;
5026 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005027 if (in->cur[1] == '/') {
5028 ctxt->instate = XML_PARSER_END_TAG;
5029 ctxt->checkIndex = 0;
5030#ifdef DEBUG_PUSH
5031 xmlGenericError(xmlGenericErrorContext,
5032 "HPP: entering END_TAG\n");
5033#endif
5034 break;
5035 }
Owen Taylor3473f882001-02-23 17:55:21 +00005036 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005037 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005038 goto done;
5039
Daniel Veillard597f1c12005-07-03 23:00:18 +00005040 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005041 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005042 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005043 (name == NULL)) {
5044 if (CUR == '>')
5045 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005046 break;
5047 }
Owen Taylor3473f882001-02-23 17:55:21 +00005048
5049 /*
5050 * Lookup the info for that element.
5051 */
5052 info = htmlTagLookup(name);
5053 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005054 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5055 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005056 }
5057
5058 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005059 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005060 */
5061 if ((CUR == '/') && (NXT(1) == '>')) {
5062 SKIP(2);
5063 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5064 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005065 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005066 ctxt->instate = XML_PARSER_CONTENT;
5067#ifdef DEBUG_PUSH
5068 xmlGenericError(xmlGenericErrorContext,
5069 "HPP: entering CONTENT\n");
5070#endif
5071 break;
5072 }
5073
5074 if (CUR == '>') {
5075 NEXT;
5076 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005077 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5078 "Couldn't find end of Start Tag %s\n",
5079 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005080
5081 /*
5082 * end of parsing of this node.
5083 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005084 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005085 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005086 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005087 }
Owen Taylor3473f882001-02-23 17:55:21 +00005088
5089 ctxt->instate = XML_PARSER_CONTENT;
5090#ifdef DEBUG_PUSH
5091 xmlGenericError(xmlGenericErrorContext,
5092 "HPP: entering CONTENT\n");
5093#endif
5094 break;
5095 }
5096
5097 /*
5098 * Check for an Empty Element from DTD definition
5099 */
5100 if ((info != NULL) && (info->empty)) {
5101 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5102 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005103 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005104 }
5105 ctxt->instate = XML_PARSER_CONTENT;
5106#ifdef DEBUG_PUSH
5107 xmlGenericError(xmlGenericErrorContext,
5108 "HPP: entering CONTENT\n");
5109#endif
5110 break;
5111 }
5112 case XML_PARSER_CONTENT: {
5113 long cons;
5114 /*
5115 * Handle preparsed entities and charRef
5116 */
5117 if (ctxt->token != 0) {
5118 xmlChar chr[2] = { 0 , 0 } ;
5119
5120 chr[0] = (xmlChar) ctxt->token;
5121 htmlCheckParagraph(ctxt);
5122 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5123 ctxt->sax->characters(ctxt->userData, chr, 1);
5124 ctxt->token = 0;
5125 ctxt->checkIndex = 0;
5126 }
5127 if ((avail == 1) && (terminate)) {
5128 cur = in->cur[0];
5129 if ((cur != '<') && (cur != '&')) {
5130 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005131 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005132 if (ctxt->sax->ignorableWhitespace != NULL)
5133 ctxt->sax->ignorableWhitespace(
5134 ctxt->userData, &cur, 1);
5135 } else {
5136 htmlCheckParagraph(ctxt);
5137 if (ctxt->sax->characters != NULL)
5138 ctxt->sax->characters(
5139 ctxt->userData, &cur, 1);
5140 }
5141 }
5142 ctxt->token = 0;
5143 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005144 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005145 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005146 }
Owen Taylor3473f882001-02-23 17:55:21 +00005147 }
5148 if (avail < 2)
5149 goto done;
5150 cur = in->cur[0];
5151 next = in->cur[1];
5152 cons = ctxt->nbChars;
5153 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5154 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5155 /*
5156 * Handle SCRIPT/STYLE separately
5157 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005158 if (!terminate) {
5159 int idx;
5160 xmlChar val;
5161
Jiri Netolicky446e1262009-08-07 17:05:36 +02005162 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
Daniel Veillard68716a72006-10-16 09:32:17 +00005163 if (idx < 0)
5164 goto done;
5165 val = in->cur[idx + 2];
5166 if (val == 0) /* bad cut of input */
5167 goto done;
5168 }
Owen Taylor3473f882001-02-23 17:55:21 +00005169 htmlParseScript(ctxt);
5170 if ((cur == '<') && (next == '/')) {
5171 ctxt->instate = XML_PARSER_END_TAG;
5172 ctxt->checkIndex = 0;
5173#ifdef DEBUG_PUSH
5174 xmlGenericError(xmlGenericErrorContext,
5175 "HPP: entering END_TAG\n");
5176#endif
5177 break;
5178 }
5179 } else {
5180 /*
5181 * Sometimes DOCTYPE arrives in the middle of the document
5182 */
5183 if ((cur == '<') && (next == '!') &&
5184 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5185 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5186 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5187 (UPP(8) == 'E')) {
5188 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005189 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005190 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005191 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5192 "Misplaced DOCTYPE declaration\n",
5193 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005194 htmlParseDocTypeDecl(ctxt);
5195 } else if ((cur == '<') && (next == '!') &&
5196 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5197 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005198 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005199 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005200 goto done;
5201#ifdef DEBUG_PUSH
5202 xmlGenericError(xmlGenericErrorContext,
5203 "HPP: Parsing Comment\n");
5204#endif
5205 htmlParseComment(ctxt);
5206 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005207 } else if ((cur == '<') && (next == '?')) {
5208 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005209 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005210 goto done;
5211#ifdef DEBUG_PUSH
5212 xmlGenericError(xmlGenericErrorContext,
5213 "HPP: Parsing PI\n");
5214#endif
5215 htmlParsePI(ctxt);
5216 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005217 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5218 goto done;
5219 } else if ((cur == '<') && (next == '/')) {
5220 ctxt->instate = XML_PARSER_END_TAG;
5221 ctxt->checkIndex = 0;
5222#ifdef DEBUG_PUSH
5223 xmlGenericError(xmlGenericErrorContext,
5224 "HPP: entering END_TAG\n");
5225#endif
5226 break;
5227 } else if (cur == '<') {
5228 ctxt->instate = XML_PARSER_START_TAG;
5229 ctxt->checkIndex = 0;
5230#ifdef DEBUG_PUSH
5231 xmlGenericError(xmlGenericErrorContext,
5232 "HPP: entering START_TAG\n");
5233#endif
5234 break;
5235 } else if (cur == '&') {
5236 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005237 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005238 goto done;
5239#ifdef DEBUG_PUSH
5240 xmlGenericError(xmlGenericErrorContext,
5241 "HPP: Parsing Reference\n");
5242#endif
5243 /* TODO: check generation of subtrees if noent !!! */
5244 htmlParseReference(ctxt);
5245 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005246 /*
5247 * check that the text sequence is complete
5248 * before handing out the data to the parser
5249 * to avoid problems with erroneous end of
5250 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005251 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005252 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005253 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0, 1) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005254 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005255 ctxt->checkIndex = 0;
5256#ifdef DEBUG_PUSH
5257 xmlGenericError(xmlGenericErrorContext,
5258 "HPP: Parsing char data\n");
5259#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005260 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005261 }
5262 }
5263 if (cons == ctxt->nbChars) {
5264 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005265 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5266 "detected an error in element content\n",
5267 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005268 }
5269 NEXT;
5270 break;
5271 }
5272
5273 break;
5274 }
5275 case XML_PARSER_END_TAG:
5276 if (avail < 2)
5277 goto done;
5278 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005279 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005280 goto done;
5281 htmlParseEndTag(ctxt);
5282 if (ctxt->nameNr == 0) {
5283 ctxt->instate = XML_PARSER_EPILOG;
5284 } else {
5285 ctxt->instate = XML_PARSER_CONTENT;
5286 }
5287 ctxt->checkIndex = 0;
5288#ifdef DEBUG_PUSH
5289 xmlGenericError(xmlGenericErrorContext,
5290 "HPP: entering CONTENT\n");
5291#endif
5292 break;
5293 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005294 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5295 "HPP: internal error, state == CDATA\n",
5296 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005297 ctxt->instate = XML_PARSER_CONTENT;
5298 ctxt->checkIndex = 0;
5299#ifdef DEBUG_PUSH
5300 xmlGenericError(xmlGenericErrorContext,
5301 "HPP: entering CONTENT\n");
5302#endif
5303 break;
5304 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005305 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5306 "HPP: internal error, state == DTD\n",
5307 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005308 ctxt->instate = XML_PARSER_CONTENT;
5309 ctxt->checkIndex = 0;
5310#ifdef DEBUG_PUSH
5311 xmlGenericError(xmlGenericErrorContext,
5312 "HPP: entering CONTENT\n");
5313#endif
5314 break;
5315 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005316 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5317 "HPP: internal error, state == COMMENT\n",
5318 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005319 ctxt->instate = XML_PARSER_CONTENT;
5320 ctxt->checkIndex = 0;
5321#ifdef DEBUG_PUSH
5322 xmlGenericError(xmlGenericErrorContext,
5323 "HPP: entering CONTENT\n");
5324#endif
5325 break;
5326 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005327 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5328 "HPP: internal error, state == PI\n",
5329 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005330 ctxt->instate = XML_PARSER_CONTENT;
5331 ctxt->checkIndex = 0;
5332#ifdef DEBUG_PUSH
5333 xmlGenericError(xmlGenericErrorContext,
5334 "HPP: entering CONTENT\n");
5335#endif
5336 break;
5337 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005338 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5339 "HPP: internal error, state == ENTITY_DECL\n",
5340 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005341 ctxt->instate = XML_PARSER_CONTENT;
5342 ctxt->checkIndex = 0;
5343#ifdef DEBUG_PUSH
5344 xmlGenericError(xmlGenericErrorContext,
5345 "HPP: entering CONTENT\n");
5346#endif
5347 break;
5348 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005349 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5350 "HPP: internal error, state == ENTITY_VALUE\n",
5351 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005352 ctxt->instate = XML_PARSER_CONTENT;
5353 ctxt->checkIndex = 0;
5354#ifdef DEBUG_PUSH
5355 xmlGenericError(xmlGenericErrorContext,
5356 "HPP: entering DTD\n");
5357#endif
5358 break;
5359 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005360 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5361 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5362 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005363 ctxt->instate = XML_PARSER_START_TAG;
5364 ctxt->checkIndex = 0;
5365#ifdef DEBUG_PUSH
5366 xmlGenericError(xmlGenericErrorContext,
5367 "HPP: entering START_TAG\n");
5368#endif
5369 break;
5370 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005371 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5372 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5373 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005374 ctxt->instate = XML_PARSER_CONTENT;
5375 ctxt->checkIndex = 0;
5376#ifdef DEBUG_PUSH
5377 xmlGenericError(xmlGenericErrorContext,
5378 "HPP: entering CONTENT\n");
5379#endif
5380 break;
5381 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005382 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5383 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5384 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005385 ctxt->instate = XML_PARSER_CONTENT;
5386 ctxt->checkIndex = 0;
5387#ifdef DEBUG_PUSH
5388 xmlGenericError(xmlGenericErrorContext,
5389 "HPP: entering CONTENT\n");
5390#endif
5391 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005392 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005393 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5394 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5395 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005396 ctxt->instate = XML_PARSER_CONTENT;
5397 ctxt->checkIndex = 0;
5398#ifdef DEBUG_PUSH
5399 xmlGenericError(xmlGenericErrorContext,
5400 "HPP: entering CONTENT\n");
5401#endif
5402 break;
5403
Owen Taylor3473f882001-02-23 17:55:21 +00005404 }
5405 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005406done:
Owen Taylor3473f882001-02-23 17:55:21 +00005407 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005408 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005409 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005410 /*
5411 * SAX: end of the document processing.
5412 */
5413 ctxt->instate = XML_PARSER_EOF;
5414 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5415 ctxt->sax->endDocument(ctxt->userData);
5416 }
5417 }
5418 if ((ctxt->myDoc != NULL) &&
5419 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5420 (ctxt->instate == XML_PARSER_EPILOG))) {
5421 xmlDtdPtr dtd;
5422 dtd = xmlGetIntSubset(ctxt->myDoc);
5423 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02005424 ctxt->myDoc->intSubset =
5425 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005426 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5427 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5428 }
5429#ifdef DEBUG_PUSH
5430 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5431#endif
5432 return(ret);
5433}
5434
5435/**
Owen Taylor3473f882001-02-23 17:55:21 +00005436 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005437 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005438 * @chunk: an char array
5439 * @size: the size in byte of the chunk
5440 * @terminate: last chunk indicator
5441 *
5442 * Parse a Chunk of memory
5443 *
5444 * Returns zero if no error, the xmlParserErrors otherwise.
5445 */
5446int
5447htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5448 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005449 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5450 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5451 "htmlParseChunk: context error\n", NULL, NULL);
5452 return(XML_ERR_INTERNAL_ERROR);
5453 }
Owen Taylor3473f882001-02-23 17:55:21 +00005454 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5455 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5456 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5457 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005458 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02005459
5460 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00005461 if (res < 0) {
5462 ctxt->errNo = XML_PARSER_EOF;
5463 ctxt->disableSAX = 1;
5464 return (XML_PARSER_EOF);
5465 }
Owen Taylor3473f882001-02-23 17:55:21 +00005466 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5467 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005468 ctxt->input->end =
5469 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005470#ifdef DEBUG_PUSH
5471 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5472#endif
5473
Daniel Veillard14f752c2003-08-09 11:44:50 +00005474#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005475 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5476 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005477#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005478 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005479 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5480 xmlParserInputBufferPtr in = ctxt->input->buf;
5481 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5482 (in->raw != NULL)) {
5483 int nbchars;
Daniel Veillarde77db162009-08-22 11:32:38 +02005484
Daniel Veillard14f752c2003-08-09 11:44:50 +00005485 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5486 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005487 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5488 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005489 return(XML_ERR_INVALID_ENCODING);
5490 }
5491 }
5492 }
Owen Taylor3473f882001-02-23 17:55:21 +00005493 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005494 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005495 if (terminate) {
5496 if ((ctxt->instate != XML_PARSER_EOF) &&
5497 (ctxt->instate != XML_PARSER_EPILOG) &&
5498 (ctxt->instate != XML_PARSER_MISC)) {
5499 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005500 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02005501 }
Owen Taylor3473f882001-02-23 17:55:21 +00005502 if (ctxt->instate != XML_PARSER_EOF) {
5503 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5504 ctxt->sax->endDocument(ctxt->userData);
5505 }
5506 ctxt->instate = XML_PARSER_EOF;
5507 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005508 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00005509}
5510
5511/************************************************************************
5512 * *
5513 * User entry points *
5514 * *
5515 ************************************************************************/
5516
5517/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005518 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005519 * @sax: a SAX handler
5520 * @user_data: The user data returned on SAX callbacks
5521 * @chunk: a pointer to an array of chars
5522 * @size: number of chars in the array
5523 * @filename: an optional file name or URI
5524 * @enc: an optional encoding
5525 *
5526 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005527 * The value of @filename is used for fetching external entities
5528 * and error/warning reports.
5529 *
5530 * Returns the new parser context or NULL
5531 */
5532htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005533htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00005534 const char *chunk, int size, const char *filename,
5535 xmlCharEncoding enc) {
5536 htmlParserCtxtPtr ctxt;
5537 htmlParserInputPtr inputStream;
5538 xmlParserInputBufferPtr buf;
5539
Daniel Veillardd0463562001-10-13 09:15:48 +00005540 xmlInitParser();
5541
Owen Taylor3473f882001-02-23 17:55:21 +00005542 buf = xmlAllocParserInputBuffer(enc);
5543 if (buf == NULL) return(NULL);
5544
Daniel Veillardf403d292003-10-05 13:51:35 +00005545 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005546 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005547 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005548 return(NULL);
5549 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005550 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5551 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005552 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005553 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005554 xmlFree(ctxt->sax);
5555 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5556 if (ctxt->sax == NULL) {
5557 xmlFree(buf);
5558 xmlFree(ctxt);
5559 return(NULL);
5560 }
5561 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5562 if (user_data != NULL)
5563 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02005564 }
Owen Taylor3473f882001-02-23 17:55:21 +00005565 if (filename == NULL) {
5566 ctxt->directory = NULL;
5567 } else {
5568 ctxt->directory = xmlParserGetDirectory(filename);
5569 }
5570
5571 inputStream = htmlNewInputStream(ctxt);
5572 if (inputStream == NULL) {
5573 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005574 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005575 return(NULL);
5576 }
5577
5578 if (filename == NULL)
5579 inputStream->filename = NULL;
5580 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005581 inputStream->filename = (char *)
5582 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005583 inputStream->buf = buf;
5584 inputStream->base = inputStream->buf->buffer->content;
5585 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillarde77db162009-08-22 11:32:38 +02005586 inputStream->end =
Daniel Veillard5f704af2003-03-05 10:01:43 +00005587 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005588
5589 inputPush(ctxt, inputStream);
5590
5591 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02005592 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005593 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5594 int cur = ctxt->input->cur - ctxt->input->base;
5595
Daniel Veillarde77db162009-08-22 11:32:38 +02005596 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005597
5598 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5599 ctxt->input->cur = ctxt->input->base + cur;
5600 ctxt->input->end =
5601 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005602#ifdef DEBUG_PUSH
5603 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5604#endif
5605 }
Daniel Veillard68716a72006-10-16 09:32:17 +00005606 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00005607
5608 return(ctxt);
5609}
William M. Brack21e4ef22005-01-02 09:53:13 +00005610#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00005611
5612/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005613 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005614 * @cur: a pointer to an array of xmlChar
5615 * @encoding: a free form C string describing the HTML document encoding, or NULL
5616 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005617 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005618 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005619 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5620 * to handle parse events. If sax is NULL, fallback to the default DOM
5621 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005622 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005623 * Returns the resulting document tree unless SAX is NULL or the document is
5624 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005625 */
5626
5627htmlDocPtr
5628htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5629 htmlDocPtr ret;
5630 htmlParserCtxtPtr ctxt;
5631
Daniel Veillardd0463562001-10-13 09:15:48 +00005632 xmlInitParser();
5633
Owen Taylor3473f882001-02-23 17:55:21 +00005634 if (cur == NULL) return(NULL);
5635
5636
5637 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5638 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02005639 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005640 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005641 ctxt->sax = sax;
5642 ctxt->userData = userData;
5643 }
5644
5645 htmlParseDocument(ctxt);
5646 ret = ctxt->myDoc;
5647 if (sax != NULL) {
5648 ctxt->sax = NULL;
5649 ctxt->userData = NULL;
5650 }
5651 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005652
Owen Taylor3473f882001-02-23 17:55:21 +00005653 return(ret);
5654}
5655
5656/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005657 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005658 * @cur: a pointer to an array of xmlChar
5659 * @encoding: a free form C string describing the HTML document encoding, or NULL
5660 *
5661 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02005662 *
Owen Taylor3473f882001-02-23 17:55:21 +00005663 * Returns the resulting document tree
5664 */
5665
5666htmlDocPtr
5667htmlParseDoc(xmlChar *cur, const char *encoding) {
5668 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5669}
5670
5671
5672/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005673 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005674 * @filename: the filename
5675 * @encoding: a free form C string describing the HTML document encoding, or NULL
5676 *
Daniel Veillarde77db162009-08-22 11:32:38 +02005677 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00005678 * Automatic support for ZLIB/Compress compressed document is provided
5679 * by default if found at compile-time.
5680 *
5681 * Returns the new parser context or NULL
5682 */
5683htmlParserCtxtPtr
5684htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5685{
5686 htmlParserCtxtPtr ctxt;
5687 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005688 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005689 /* htmlCharEncoding enc; */
5690 xmlChar *content, *content_line = (xmlChar *) "charset=";
5691
Daniel Veillarda03e3652004-11-02 18:45:30 +00005692 if (filename == NULL)
5693 return(NULL);
5694
Daniel Veillardf403d292003-10-05 13:51:35 +00005695 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005696 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005697 return(NULL);
5698 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005699 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5700 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005701#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005702 if (xmlDefaultSAXHandler.error != NULL) {
5703 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5704 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005705#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005706 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005707 return(NULL);
5708 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005709
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005710 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5711 xmlFree(canonicFilename);
5712 if (inputStream == NULL) {
5713 xmlFreeParserCtxt(ctxt);
5714 return(NULL);
5715 }
Owen Taylor3473f882001-02-23 17:55:21 +00005716
5717 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005718
Owen Taylor3473f882001-02-23 17:55:21 +00005719 /* set encoding */
5720 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005721 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Daniel Veillarde77db162009-08-22 11:32:38 +02005722 if (content) {
Owen Taylor3473f882001-02-23 17:55:21 +00005723 strcpy ((char *)content, (char *)content_line);
5724 strcat ((char *)content, (char *)encoding);
5725 htmlCheckEncoding (ctxt, content);
5726 xmlFree (content);
5727 }
5728 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005729
Owen Taylor3473f882001-02-23 17:55:21 +00005730 return(ctxt);
5731}
5732
5733/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005734 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005735 * @filename: the filename
5736 * @encoding: a free form C string describing the HTML document encoding, or NULL
5737 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02005738 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00005739 *
5740 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5741 * compressed document is provided by default if found at compile-time.
5742 * It use the given SAX function block to handle the parsing callback.
5743 * If sax is NULL, fallback to the default DOM tree building routines.
5744 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005745 * Returns the resulting document tree unless SAX is NULL or the document is
5746 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005747 */
5748
5749htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005750htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00005751 void *userData) {
5752 htmlDocPtr ret;
5753 htmlParserCtxtPtr ctxt;
5754 htmlSAXHandlerPtr oldsax = NULL;
5755
Daniel Veillardd0463562001-10-13 09:15:48 +00005756 xmlInitParser();
5757
Owen Taylor3473f882001-02-23 17:55:21 +00005758 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5759 if (ctxt == NULL) return(NULL);
5760 if (sax != NULL) {
5761 oldsax = ctxt->sax;
5762 ctxt->sax = sax;
5763 ctxt->userData = userData;
5764 }
5765
5766 htmlParseDocument(ctxt);
5767
5768 ret = ctxt->myDoc;
5769 if (sax != NULL) {
5770 ctxt->sax = oldsax;
5771 ctxt->userData = NULL;
5772 }
5773 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005774
Owen Taylor3473f882001-02-23 17:55:21 +00005775 return(ret);
5776}
5777
5778/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005779 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005780 * @filename: the filename
5781 * @encoding: a free form C string describing the HTML document encoding, or NULL
5782 *
5783 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5784 * compressed document is provided by default if found at compile-time.
5785 *
5786 * Returns the resulting document tree
5787 */
5788
5789htmlDocPtr
5790htmlParseFile(const char *filename, const char *encoding) {
5791 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5792}
5793
5794/**
5795 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02005796 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00005797 *
5798 * Set and return the previous value for handling HTML omitted tags.
5799 *
5800 * Returns the last value for 0 for no handling, 1 for auto insertion.
5801 */
5802
5803int
5804htmlHandleOmittedElem(int val) {
5805 int old = htmlOmittedDefaultValue;
5806
5807 htmlOmittedDefaultValue = val;
5808 return(old);
5809}
5810
Daniel Veillard930dfb62003-02-05 10:17:38 +00005811/**
5812 * htmlElementAllowedHere:
5813 * @parent: HTML parent element
5814 * @elt: HTML element
5815 *
5816 * Checks whether an HTML element may be a direct child of a parent element.
5817 * Note - doesn't check for deprecated elements
5818 *
5819 * Returns 1 if allowed; 0 otherwise.
5820 */
5821int
5822htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5823 const char** p ;
5824
5825 if ( ! elt || ! parent || ! parent->subelts )
5826 return 0 ;
5827
5828 for ( p = parent->subelts; *p; ++p )
5829 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5830 return 1 ;
5831
5832 return 0 ;
5833}
5834/**
5835 * htmlElementStatusHere:
5836 * @parent: HTML parent element
5837 * @elt: HTML element
5838 *
5839 * Checks whether an HTML element may be a direct child of a parent element.
5840 * and if so whether it is valid or deprecated.
5841 *
5842 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5843 */
5844htmlStatus
5845htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5846 if ( ! parent || ! elt )
5847 return HTML_INVALID ;
5848 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5849 return HTML_INVALID ;
5850
5851 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5852}
5853/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005854 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005855 * @elt: HTML element
5856 * @attr: HTML attribute
5857 * @legacy: whether to allow deprecated attributes
5858 *
5859 * Checks whether an attribute is valid for an element
5860 * Has full knowledge of Required and Deprecated attributes
5861 *
5862 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5863 */
5864htmlStatus
5865htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5866 const char** p ;
5867
5868 if ( !elt || ! attr )
5869 return HTML_INVALID ;
5870
5871 if ( elt->attrs_req )
5872 for ( p = elt->attrs_req; *p; ++p)
5873 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5874 return HTML_REQUIRED ;
5875
5876 if ( elt->attrs_opt )
5877 for ( p = elt->attrs_opt; *p; ++p)
5878 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5879 return HTML_VALID ;
5880
5881 if ( legacy && elt->attrs_depr )
5882 for ( p = elt->attrs_depr; *p; ++p)
5883 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5884 return HTML_DEPRECATED ;
5885
5886 return HTML_INVALID ;
5887}
5888/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005889 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005890 * @node: an htmlNodePtr in a tree
5891 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005892 * for Element nodes)
5893 *
5894 * Checks whether the tree node is valid. Experimental (the author
5895 * only uses the HTML enhancements in a SAX parser)
5896 *
5897 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5898 * legacy allowed) or htmlElementStatusHere (otherwise).
5899 * for Attribute nodes, a return from htmlAttrAllowed
5900 * for other nodes, HTML_NA (no checks performed)
5901 */
5902htmlStatus
5903htmlNodeStatus(const htmlNodePtr node, int legacy) {
5904 if ( ! node )
5905 return HTML_INVALID ;
5906
5907 switch ( node->type ) {
5908 case XML_ELEMENT_NODE:
5909 return legacy
5910 ? ( htmlElementAllowedHere (
5911 htmlTagLookup(node->parent->name) , node->name
5912 ) ? HTML_VALID : HTML_INVALID )
5913 : htmlElementStatusHere(
5914 htmlTagLookup(node->parent->name) ,
5915 htmlTagLookup(node->name) )
5916 ;
5917 case XML_ATTRIBUTE_NODE:
5918 return htmlAttrAllowed(
5919 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5920 default: return HTML_NA ;
5921 }
5922}
Daniel Veillard9475a352003-09-26 12:47:50 +00005923/************************************************************************
5924 * *
5925 * New set (2.6.0) of simpler and more flexible APIs *
5926 * *
5927 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. Expands to a single statement; use it followed by a
 * semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5939
5940/**
5941 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005942 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005943 *
5944 * Reset a parser context
5945 */
5946void
5947htmlCtxtReset(htmlParserCtxtPtr ctxt)
5948{
5949 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00005950 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02005951
Daniel Veillarda03e3652004-11-02 18:45:30 +00005952 if (ctxt == NULL)
5953 return;
5954
Daniel Veillardf1a27c62006-10-13 22:33:03 +00005955 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00005956 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00005957
5958 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5959 xmlFreeInputStream(input);
5960 }
5961 ctxt->inputNr = 0;
5962 ctxt->input = NULL;
5963
5964 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00005965 if (ctxt->spaceTab != NULL) {
5966 ctxt->spaceTab[0] = -1;
5967 ctxt->space = &ctxt->spaceTab[0];
5968 } else {
5969 ctxt->space = NULL;
5970 }
Daniel Veillard9475a352003-09-26 12:47:50 +00005971
5972
5973 ctxt->nodeNr = 0;
5974 ctxt->node = NULL;
5975
5976 ctxt->nameNr = 0;
5977 ctxt->name = NULL;
5978
5979 DICT_FREE(ctxt->version);
5980 ctxt->version = NULL;
5981 DICT_FREE(ctxt->encoding);
5982 ctxt->encoding = NULL;
5983 DICT_FREE(ctxt->directory);
5984 ctxt->directory = NULL;
5985 DICT_FREE(ctxt->extSubURI);
5986 ctxt->extSubURI = NULL;
5987 DICT_FREE(ctxt->extSubSystem);
5988 ctxt->extSubSystem = NULL;
5989 if (ctxt->myDoc != NULL)
5990 xmlFreeDoc(ctxt->myDoc);
5991 ctxt->myDoc = NULL;
5992
5993 ctxt->standalone = -1;
5994 ctxt->hasExternalSubset = 0;
5995 ctxt->hasPErefs = 0;
5996 ctxt->html = 1;
5997 ctxt->external = 0;
5998 ctxt->instate = XML_PARSER_START;
5999 ctxt->token = 0;
6000
6001 ctxt->wellFormed = 1;
6002 ctxt->nsWellFormed = 1;
6003 ctxt->valid = 1;
6004 ctxt->vctxt.userData = ctxt;
6005 ctxt->vctxt.error = xmlParserValidityError;
6006 ctxt->vctxt.warning = xmlParserValidityWarning;
6007 ctxt->record_info = 0;
6008 ctxt->nbChars = 0;
6009 ctxt->checkIndex = 0;
6010 ctxt->inSubset = 0;
6011 ctxt->errNo = XML_ERR_OK;
6012 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006013 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006014 ctxt->catalogs = NULL;
6015 xmlInitNodeInfoSeq(&ctxt->node_seq);
6016
6017 if (ctxt->attsDefault != NULL) {
6018 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6019 ctxt->attsDefault = NULL;
6020 }
6021 if (ctxt->attsSpecial != NULL) {
6022 xmlHashFree(ctxt->attsSpecial, NULL);
6023 ctxt->attsSpecial = NULL;
6024 }
6025}
6026
6027/**
6028 * htmlCtxtUseOptions:
6029 * @ctxt: an HTML parser context
6030 * @options: a combination of htmlParserOption(s)
6031 *
6032 * Applies the options to the parser context
6033 *
6034 * Returns 0 in case of success, the set of unknown or unimplemented options
6035 * in case of error.
6036 */
6037int
6038htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6039{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006040 if (ctxt == NULL)
6041 return(-1);
6042
Daniel Veillard9475a352003-09-26 12:47:50 +00006043 if (options & HTML_PARSE_NOWARNING) {
6044 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006045 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006046 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006047 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006048 }
6049 if (options & HTML_PARSE_NOERROR) {
6050 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006051 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006052 ctxt->sax->fatalError = NULL;
6053 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006054 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006055 }
6056 if (options & HTML_PARSE_PEDANTIC) {
6057 ctxt->pedantic = 1;
6058 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006059 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006060 } else
6061 ctxt->pedantic = 0;
6062 if (options & XML_PARSE_NOBLANKS) {
6063 ctxt->keepBlanks = 0;
6064 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6065 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006066 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006067 } else
6068 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006069 if (options & HTML_PARSE_RECOVER) {
6070 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006071 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006072 } else
6073 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006074 if (options & HTML_PARSE_COMPACT) {
6075 ctxt->options |= HTML_PARSE_COMPACT;
6076 options -= HTML_PARSE_COMPACT;
6077 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006078 if (options & XML_PARSE_HUGE) {
6079 ctxt->options |= XML_PARSE_HUGE;
6080 options -= XML_PARSE_HUGE;
6081 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006082 ctxt->dictNames = 0;
6083 return (options);
6084}
6085
6086/**
6087 * htmlDoRead:
6088 * @ctxt: an HTML parser context
6089 * @URL: the base URL to use for the document
6090 * @encoding: the document encoding, or NULL
6091 * @options: a combination of htmlParserOption(s)
6092 * @reuse: keep the context for reuse
6093 *
6094 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006095 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006096 * Returns the resulting document tree or NULL
6097 */
6098static htmlDocPtr
6099htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6100 int options, int reuse)
6101{
6102 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006103
Daniel Veillard9475a352003-09-26 12:47:50 +00006104 htmlCtxtUseOptions(ctxt, options);
6105 ctxt->html = 1;
6106 if (encoding != NULL) {
6107 xmlCharEncodingHandlerPtr hdlr;
6108
6109 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006110 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006111 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006112 if (ctxt->input->encoding != NULL)
6113 xmlFree((xmlChar *) ctxt->input->encoding);
6114 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6115 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006116 }
6117 if ((URL != NULL) && (ctxt->input != NULL) &&
6118 (ctxt->input->filename == NULL))
6119 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6120 htmlParseDocument(ctxt);
6121 ret = ctxt->myDoc;
6122 ctxt->myDoc = NULL;
6123 if (!reuse) {
6124 if ((ctxt->dictNames) &&
6125 (ret != NULL) &&
6126 (ret->dict == ctxt->dict))
6127 ctxt->dict = NULL;
6128 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006129 }
6130 return (ret);
6131}
6132
6133/**
6134 * htmlReadDoc:
6135 * @cur: a pointer to a zero terminated string
6136 * @URL: the base URL to use for the document
6137 * @encoding: the document encoding, or NULL
6138 * @options: a combination of htmlParserOption(s)
6139 *
6140 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006141 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006142 * Returns the resulting document tree
6143 */
6144htmlDocPtr
6145htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6146{
6147 htmlParserCtxtPtr ctxt;
6148
6149 if (cur == NULL)
6150 return (NULL);
6151
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006152 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006153 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006154 if (ctxt == NULL)
6155 return (NULL);
6156 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6157}
6158
6159/**
6160 * htmlReadFile:
6161 * @filename: a file or URL
6162 * @encoding: the document encoding, or NULL
6163 * @options: a combination of htmlParserOption(s)
6164 *
6165 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006166 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006167 * Returns the resulting document tree
6168 */
6169htmlDocPtr
6170htmlReadFile(const char *filename, const char *encoding, int options)
6171{
6172 htmlParserCtxtPtr ctxt;
6173
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006174 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006175 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6176 if (ctxt == NULL)
6177 return (NULL);
6178 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6179}
6180
6181/**
6182 * htmlReadMemory:
6183 * @buffer: a pointer to a char array
6184 * @size: the size of the array
6185 * @URL: the base URL to use for the document
6186 * @encoding: the document encoding, or NULL
6187 * @options: a combination of htmlParserOption(s)
6188 *
6189 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006190 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006191 * Returns the resulting document tree
6192 */
6193htmlDocPtr
6194htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6195{
6196 htmlParserCtxtPtr ctxt;
6197
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006198 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006199 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6200 if (ctxt == NULL)
6201 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006202 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006203 if (ctxt->sax != NULL)
6204 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006205 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6206}
6207
6208/**
6209 * htmlReadFd:
6210 * @fd: an open file descriptor
6211 * @URL: the base URL to use for the document
6212 * @encoding: the document encoding, or NULL
6213 * @options: a combination of htmlParserOption(s)
6214 *
6215 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006216 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006217 * Returns the resulting document tree
6218 */
6219htmlDocPtr
6220htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6221{
6222 htmlParserCtxtPtr ctxt;
6223 xmlParserInputBufferPtr input;
6224 xmlParserInputPtr stream;
6225
6226 if (fd < 0)
6227 return (NULL);
6228
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006229 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006230 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6231 if (input == NULL)
6232 return (NULL);
6233 ctxt = xmlNewParserCtxt();
6234 if (ctxt == NULL) {
6235 xmlFreeParserInputBuffer(input);
6236 return (NULL);
6237 }
6238 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6239 if (stream == NULL) {
6240 xmlFreeParserInputBuffer(input);
6241 xmlFreeParserCtxt(ctxt);
6242 return (NULL);
6243 }
6244 inputPush(ctxt, stream);
6245 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6246}
6247
6248/**
6249 * htmlReadIO:
6250 * @ioread: an I/O read function
6251 * @ioclose: an I/O close function
6252 * @ioctx: an I/O handler
6253 * @URL: the base URL to use for the document
6254 * @encoding: the document encoding, or NULL
6255 * @options: a combination of htmlParserOption(s)
6256 *
6257 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006258 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006259 * Returns the resulting document tree
6260 */
6261htmlDocPtr
6262htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6263 void *ioctx, const char *URL, const char *encoding, int options)
6264{
6265 htmlParserCtxtPtr ctxt;
6266 xmlParserInputBufferPtr input;
6267 xmlParserInputPtr stream;
6268
6269 if (ioread == NULL)
6270 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006271 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006272
6273 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6274 XML_CHAR_ENCODING_NONE);
6275 if (input == NULL)
6276 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006277 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006278 if (ctxt == NULL) {
6279 xmlFreeParserInputBuffer(input);
6280 return (NULL);
6281 }
6282 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6283 if (stream == NULL) {
6284 xmlFreeParserInputBuffer(input);
6285 xmlFreeParserCtxt(ctxt);
6286 return (NULL);
6287 }
6288 inputPush(ctxt, stream);
6289 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6290}
6291
6292/**
6293 * htmlCtxtReadDoc:
6294 * @ctxt: an HTML parser context
6295 * @cur: a pointer to a zero terminated string
6296 * @URL: the base URL to use for the document
6297 * @encoding: the document encoding, or NULL
6298 * @options: a combination of htmlParserOption(s)
6299 *
6300 * parse an XML in-memory document and build a tree.
6301 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006302 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006303 * Returns the resulting document tree
6304 */
6305htmlDocPtr
6306htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6307 const char *URL, const char *encoding, int options)
6308{
6309 xmlParserInputPtr stream;
6310
6311 if (cur == NULL)
6312 return (NULL);
6313 if (ctxt == NULL)
6314 return (NULL);
6315
6316 htmlCtxtReset(ctxt);
6317
6318 stream = xmlNewStringInputStream(ctxt, cur);
6319 if (stream == NULL) {
6320 return (NULL);
6321 }
6322 inputPush(ctxt, stream);
6323 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6324}
6325
6326/**
6327 * htmlCtxtReadFile:
6328 * @ctxt: an HTML parser context
6329 * @filename: a file or URL
6330 * @encoding: the document encoding, or NULL
6331 * @options: a combination of htmlParserOption(s)
6332 *
6333 * parse an XML file from the filesystem or the network.
6334 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006335 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006336 * Returns the resulting document tree
6337 */
6338htmlDocPtr
6339htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6340 const char *encoding, int options)
6341{
6342 xmlParserInputPtr stream;
6343
6344 if (filename == NULL)
6345 return (NULL);
6346 if (ctxt == NULL)
6347 return (NULL);
6348
6349 htmlCtxtReset(ctxt);
6350
Daniel Veillard29614c72004-11-26 10:47:26 +00006351 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006352 if (stream == NULL) {
6353 return (NULL);
6354 }
6355 inputPush(ctxt, stream);
6356 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6357}
6358
6359/**
6360 * htmlCtxtReadMemory:
6361 * @ctxt: an HTML parser context
6362 * @buffer: a pointer to a char array
6363 * @size: the size of the array
6364 * @URL: the base URL to use for the document
6365 * @encoding: the document encoding, or NULL
6366 * @options: a combination of htmlParserOption(s)
6367 *
6368 * parse an XML in-memory document and build a tree.
6369 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006370 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006371 * Returns the resulting document tree
6372 */
6373htmlDocPtr
6374htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6375 const char *URL, const char *encoding, int options)
6376{
6377 xmlParserInputBufferPtr input;
6378 xmlParserInputPtr stream;
6379
6380 if (ctxt == NULL)
6381 return (NULL);
6382 if (buffer == NULL)
6383 return (NULL);
6384
6385 htmlCtxtReset(ctxt);
6386
6387 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6388 if (input == NULL) {
6389 return(NULL);
6390 }
6391
6392 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6393 if (stream == NULL) {
6394 xmlFreeParserInputBuffer(input);
6395 return(NULL);
6396 }
6397
6398 inputPush(ctxt, stream);
6399 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6400}
6401
6402/**
6403 * htmlCtxtReadFd:
6404 * @ctxt: an HTML parser context
6405 * @fd: an open file descriptor
6406 * @URL: the base URL to use for the document
6407 * @encoding: the document encoding, or NULL
6408 * @options: a combination of htmlParserOption(s)
6409 *
6410 * parse an XML from a file descriptor and build a tree.
6411 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006412 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006413 * Returns the resulting document tree
6414 */
6415htmlDocPtr
6416htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6417 const char *URL, const char *encoding, int options)
6418{
6419 xmlParserInputBufferPtr input;
6420 xmlParserInputPtr stream;
6421
6422 if (fd < 0)
6423 return (NULL);
6424 if (ctxt == NULL)
6425 return (NULL);
6426
6427 htmlCtxtReset(ctxt);
6428
6429
6430 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6431 if (input == NULL)
6432 return (NULL);
6433 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6434 if (stream == NULL) {
6435 xmlFreeParserInputBuffer(input);
6436 return (NULL);
6437 }
6438 inputPush(ctxt, stream);
6439 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6440}
6441
6442/**
6443 * htmlCtxtReadIO:
6444 * @ctxt: an HTML parser context
6445 * @ioread: an I/O read function
6446 * @ioclose: an I/O close function
6447 * @ioctx: an I/O handler
6448 * @URL: the base URL to use for the document
6449 * @encoding: the document encoding, or NULL
6450 * @options: a combination of htmlParserOption(s)
6451 *
6452 * parse an HTML document from I/O functions and source and build a tree.
6453 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006454 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006455 * Returns the resulting document tree
6456 */
6457htmlDocPtr
6458htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6459 xmlInputCloseCallback ioclose, void *ioctx,
6460 const char *URL,
6461 const char *encoding, int options)
6462{
6463 xmlParserInputBufferPtr input;
6464 xmlParserInputPtr stream;
6465
6466 if (ioread == NULL)
6467 return (NULL);
6468 if (ctxt == NULL)
6469 return (NULL);
6470
6471 htmlCtxtReset(ctxt);
6472
6473 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6474 XML_CHAR_ENCODING_NONE);
6475 if (input == NULL)
6476 return (NULL);
6477 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6478 if (stream == NULL) {
6479 xmlFreeParserInputBuffer(input);
6480 return (NULL);
6481 }
6482 inputPush(ctxt, stream);
6483 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6484}
6485
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006486#define bottom_HTMLparser
6487#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006488#endif /* LIBXML_HTML_ENABLED */