blob: 04bfbd4e403a3e10168c726b7a274a098e61544b [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
/*
 * Size limits used by the parser. Their users are outside this part of the
 * file (presumably name and text accumulation buffers -- confirm).
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Debug switches, disabled by default. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/* File-local flag, initialised to 1; its readers are not visible here. */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations (definitions are not part of this excerpt). */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
                             xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020062 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000063 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200150 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167 ctxt->html = 3;
168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 if (ctxt->nameNr >= ctxt->nameMax) {
171 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000172 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000173 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 ctxt->nameMax *
175 sizeof(ctxt->nameTab[0]));
176 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000177 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000178 return (0);
179 }
180 }
181 ctxt->nameTab[ctxt->nameNr] = value;
182 ctxt->name = value;
183 return (ctxt->nameNr++);
184}
185/**
186 * htmlnamePop:
187 * @ctxt: an HTML parser context
188 *
189 * Pops the top element name from the name stack
190 *
191 * Returns the name just removed
192 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000193static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194htmlnamePop(htmlParserCtxtPtr ctxt)
195{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000197
Daniel Veillard1c732d22002-11-30 11:22:59 +0000198 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000199 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000200 ctxt->nameNr--;
201 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 if (ctxt->nameNr > 0)
204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205 else
206 ctxt->name = NULL;
207 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000208 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000209 return (ret);
210}
Owen Taylor3473f882001-02-23 17:55:21 +0000211
Eugene Pimenov615904f2010-03-15 15:16:02 +0100212/**
213 * htmlNodeInfoPush:
214 * @ctxt: an HTML parser context
215 * @value: the node info
216 *
217 * Pushes a new element name on top of the node info stack
218 *
219 * Returns 0 in case of error, the index in the stack otherwise
220 */
221static int
222htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
223{
224 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
225 if (ctxt->nodeInfoMax == 0)
226 ctxt->nodeInfoMax = 5;
227 ctxt->nodeInfoMax *= 2;
228 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
229 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
230 ctxt->nodeInfoMax *
231 sizeof(ctxt->nodeInfoTab[0]));
232 if (ctxt->nodeInfoTab == NULL) {
233 htmlErrMemory(ctxt, NULL);
234 return (0);
235 }
236 }
237 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
238 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
239 return (ctxt->nodeInfoNr++);
240}
241
242/**
243 * htmlNodeInfoPop:
244 * @ctxt: an HTML parser context
245 *
246 * Pops the top element name from the node info stack
247 *
248 * Returns 0 in case of error, the pointer to NodeInfo otherwise
249 */
250static htmlParserNodeInfo *
251htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
252{
253 if (ctxt->nodeInfoNr <= 0)
254 return (NULL);
255 ctxt->nodeInfoNr--;
256 if (ctxt->nodeInfoNr < 0)
257 return (NULL);
258 if (ctxt->nodeInfoNr > 0)
259 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
260 else
261 ctxt->nodeInfo = NULL;
262 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
263}
264
Owen Taylor3473f882001-02-23 17:55:21 +0000265/*
266 * Macros for accessing the content. Those should be used only by the parser,
267 * and not exported.
268 *
269 * Dirty macros, i.e. one need to make assumption on the context to use them
270 *
271 * CUR_PTR return the current pointer to the xmlChar to be parsed.
272 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
273 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
274 * in UNICODE mode. This should be used internally by the parser
275 * only to compare to ASCII values otherwise it would break when
276 * running with UTF-8 encoding.
277 * NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only
278 * to compare on ASCII based substring.
279 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
280 * it should be used only to compare on ASCII based substring.
281 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
Daniel Veillard77a90a72003-03-22 00:04:05 +0000282 * strings without newlines within the parser.
Owen Taylor3473f882001-02-23 17:55:21 +0000283 *
284 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
285 *
286 * CURRENT Returns the current char value, with the full decoding of
287 * UTF-8 if we are using this mode. It returns an int.
288 * NEXT Skip to the next character, this does the proper decoding
289 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard77a90a72003-03-22 00:04:05 +0000290 * NEXTL(l) Skip the current unicode character of l xmlChars long.
Owen Taylor3473f882001-02-23 17:55:21 +0000291 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
292 */
293
/*
 * All macros below expect a variable named `ctxt` (the parser context) to
 * be in scope at the point of use.
 */

/* Current input byte converted with toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance the input by val bytes, keeping nbChars and the column counter in
 * step. Expands to an unparenthesised comma expression: use it only as a
 * full statement.
 */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Byte at offset val from the current position (no bounds check). */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), converted with toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer itself. */
#define CUR_PTR ctxt->input->cur

/*
 * Shrink the input buffer once more than 2 * INPUT_CHUNK bytes lie before
 * the cursor and fewer than 2 * INPUT_CHUNK bytes remain after it.
 * NOTE: expands to a bare `if` statement; an `else` following the macro
 * would bind to it.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input)

/*
 * Ask for more input when fewer than INPUT_CHUNK bytes remain, but only
 * when ctxt->progressive is 0.
 * NOTE: expands to a bare `if` statement (same caveat as SHRINK).
 */
#define GROW if ((ctxt->progressive == 0) && \
                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int (no multi-byte decoding is done here). */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* -1 while a token is pending in ctxt->token, the current byte otherwise. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Step over one character that is l bytes long: line/col are updated from
 * the byte under the cursor, the pending token is cleared, the cursor moves
 * by l and nbChars is incremented by one.
 */
#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Decode the current character; l must be an int lvalue receiving its length. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append character v (l bytes long once encoded) to buffer b at index i and
 * advance i accordingly.
 * NOTE: expands to a bare if/else and evaluates its arguments more than
 * once; use it only as a full statement with side-effect-free arguments.
 */
#define COPY_BUF(l,b,i,v) \
    if (l == 1) b[i++] = (xmlChar) v; \
    else i += xmlCopyChar(l,&b[i],v)
344
345/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200346 * htmlFindEncoding:
347 * @the HTML parser context
348 *
349 * Ty to find and encoding in the current data available in the input
350 * buffer this is needed to try to switch to the proper encoding when
351 * one face a character error.
352 * That's an heuristic, since it's operating outside of parsing it could
353 * try to use a meta which had been commented out, that's the reason it
354 * should only be used in case of error, not as a default.
355 *
356 * Returns an encoding string or NULL if not found, the string need to
357 * be freed
358 */
359static xmlChar *
360htmlFindEncoding(xmlParserCtxtPtr ctxt) {
361 const xmlChar *start, *cur, *end;
362
363 if ((ctxt == NULL) || (ctxt->input == NULL) ||
364 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
365 (ctxt->input->buf->encoder != NULL))
366 return(NULL);
367 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
368 return(NULL);
369
370 start = ctxt->input->cur;
371 end = ctxt->input->end;
372 /* we also expect the input buffer to be zero terminated */
373 if (*end != 0)
374 return(NULL);
375
376 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
377 if (cur == NULL)
378 return(NULL);
379 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
380 if (cur == NULL)
381 return(NULL);
382 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
383 if (cur == NULL)
384 return(NULL);
385 cur += 8;
386 start = cur;
387 while (((*cur >= 'A') && (*cur <= 'Z')) ||
388 ((*cur >= 'a') && (*cur <= 'z')) ||
389 ((*cur >= '0') && (*cur <= '9')) ||
390 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
391 cur++;
392 if (cur == start)
393 return(NULL);
394 return(xmlStrndup(start, cur - start));
395}
396
397/**
Owen Taylor3473f882001-02-23 17:55:21 +0000398 * htmlCurrentChar:
399 * @ctxt: the HTML parser context
400 * @len: pointer to the length of the char read
401 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000402 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000403 * bytes in the input buffer. Implement the end of line normalization:
404 * 2.11 End-of-Line Handling
405 * If the encoding is unspecified, in the case we find an ISO-Latin-1
406 * char, then the encoding converter is plugged in automatically.
407 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000408 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000409 */
410
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000411static int
Owen Taylor3473f882001-02-23 17:55:21 +0000412htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
413 if (ctxt->instate == XML_PARSER_EOF)
414 return(0);
415
416 if (ctxt->token != 0) {
417 *len = 0;
418 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200419 }
Owen Taylor3473f882001-02-23 17:55:21 +0000420 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
421 /*
422 * We are supposed to handle UTF8, check it's valid
423 * From rfc2044: encoding of the Unicode values on UTF-8:
424 *
425 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
426 * 0000 0000-0000 007F 0xxxxxxx
427 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200428 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000429 *
430 * Check for the 0x110000 limit too
431 */
432 const unsigned char *cur = ctxt->input->cur;
433 unsigned char c;
434 unsigned int val;
435
436 c = *cur;
437 if (c & 0x80) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200438 if (cur[1] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000439 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200440 cur = ctxt->input->cur;
441 }
Owen Taylor3473f882001-02-23 17:55:21 +0000442 if ((cur[1] & 0xc0) != 0x80)
443 goto encoding_error;
444 if ((c & 0xe0) == 0xe0) {
445
Adiel Mittmann8a103792009-08-25 11:27:13 +0200446 if (cur[2] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000447 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200448 cur = ctxt->input->cur;
449 }
Owen Taylor3473f882001-02-23 17:55:21 +0000450 if ((cur[2] & 0xc0) != 0x80)
451 goto encoding_error;
452 if ((c & 0xf0) == 0xf0) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200453 if (cur[3] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000454 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200455 cur = ctxt->input->cur;
456 }
Owen Taylor3473f882001-02-23 17:55:21 +0000457 if (((c & 0xf8) != 0xf0) ||
458 ((cur[3] & 0xc0) != 0x80))
459 goto encoding_error;
460 /* 4-byte code */
461 *len = 4;
462 val = (cur[0] & 0x7) << 18;
463 val |= (cur[1] & 0x3f) << 12;
464 val |= (cur[2] & 0x3f) << 6;
465 val |= cur[3] & 0x3f;
466 } else {
467 /* 3-byte code */
468 *len = 3;
469 val = (cur[0] & 0xf) << 12;
470 val |= (cur[1] & 0x3f) << 6;
471 val |= cur[2] & 0x3f;
472 }
473 } else {
474 /* 2-byte code */
475 *len = 2;
476 val = (cur[0] & 0x1f) << 6;
477 val |= cur[1] & 0x3f;
478 }
479 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000480 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
481 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200482 }
Owen Taylor3473f882001-02-23 17:55:21 +0000483 return(val);
484 } else {
Daniel Veillard856c6682009-08-24 18:16:56 +0200485 if ((*ctxt->input->cur == 0) &&
486 (ctxt->input->cur < ctxt->input->end)) {
487 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
488 "Char 0x%X out of allowed range\n", 0);
489 *len = 1;
490 return(' ');
491 }
Owen Taylor3473f882001-02-23 17:55:21 +0000492 /* 1-byte code */
493 *len = 1;
494 return((int) *ctxt->input->cur);
495 }
496 }
497 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000498 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000499 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000500 * XML constructs only use < 128 chars
501 */
502 *len = 1;
503 if ((int) *ctxt->input->cur < 0x80)
504 return((int) *ctxt->input->cur);
505
506 /*
507 * Humm this is bad, do an automatic flow conversion
508 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200509 {
510 xmlChar * guess;
511 xmlCharEncodingHandlerPtr handler;
512
513 guess = htmlFindEncoding(ctxt);
514 if (guess == NULL) {
515 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
516 } else {
517 if (ctxt->input->encoding != NULL)
518 xmlFree((xmlChar *) ctxt->input->encoding);
519 ctxt->input->encoding = guess;
520 handler = xmlFindCharEncodingHandler((const char *) guess);
521 if (handler != NULL) {
522 xmlSwitchToEncoding(ctxt, handler);
523 } else {
524 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
525 "Unsupported encoding %s", guess, NULL);
526 }
527 }
528 ctxt->charset = XML_CHAR_ENCODING_UTF8;
529 }
530
Owen Taylor3473f882001-02-23 17:55:21 +0000531 return(xmlCurrentChar(ctxt, len));
532
533encoding_error:
534 /*
535 * If we detect an UTF8 error that probably mean that the
536 * input encoding didn't get properly advertized in the
537 * declaration header. Report the error and switch the encoding
538 * to ISO-Latin-1 (if you don't like this policy, just declare the
539 * encoding !)
540 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000541 {
542 char buffer[150];
543
Daniel Veillard861101d2007-06-12 08:38:57 +0000544 if (ctxt->input->end - ctxt->input->cur >= 4) {
545 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
546 ctxt->input->cur[0], ctxt->input->cur[1],
547 ctxt->input->cur[2], ctxt->input->cur[3]);
548 } else {
549 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
550 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000551 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
552 "Input is not proper UTF-8, indicate encoding !\n",
553 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000554 }
555
Daniel Veillarde77db162009-08-22 11:32:38 +0200556 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000557 *len = 1;
558 return((int) *ctxt->input->cur);
559}
560
561/**
Owen Taylor3473f882001-02-23 17:55:21 +0000562 * htmlSkipBlankChars:
563 * @ctxt: the HTML parser context
564 *
565 * skip all blanks character found at that point in the input streams.
566 *
567 * Returns the number of space chars skipped
568 */
569
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000570static int
Owen Taylor3473f882001-02-23 17:55:21 +0000571htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
572 int res = 0;
573
William M. Brack76e95df2003-10-18 16:20:14 +0000574 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000575 if ((*ctxt->input->cur == 0) &&
576 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
577 xmlPopInput(ctxt);
578 } else {
579 if (*(ctxt->input->cur) == '\n') {
580 ctxt->input->line++; ctxt->input->col = 1;
581 } else ctxt->input->col++;
582 ctxt->input->cur++;
583 ctxt->nbChars++;
584 if (*ctxt->input->cur == 0)
585 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
586 }
587 res++;
588 }
589 return(res);
590}
591
592
593
594/************************************************************************
595 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200596 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000597 * *
598 ************************************************************************/
599
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name, Start Tag, End Tag, Save End, Empty, Deprecated, DTD, inline,
 * Description, subElements, impliedsubelt, Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000612
613/* Definitions and a couple of vars for HTML Elements */
614
/*
 * Element name groups and their sizes. Each NB_xxx constant is the number
 * of names the matching list macro expands to. The computed ones are
 * parenthesised so they can be used safely inside larger expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL
638
639
/* NULL-terminated lists of the element names in the FLOW and INLINE groups. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
/* html_cdata is an alias for the same empty list */
#define html_cdata html_pcdata
646
647
648/* ... and for HTML Attributes */
649
/*
 * Attribute name groups and their sizes. Each NB_xxx constant is the number
 * of names the matching list macro expands to.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
Daniel Veillard930dfb62003-02-05 10:17:38 +0000662
/*
 * NULL-terminated lists of attribute names (xxx_attr / xxx_attrs / xxx_depr)
 * and of element names (xxx_contents / xxx_elt). The code that refers to
 * them is outside this part of the file.
 */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
        "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
        "tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
        "archive", "alt", "name", "height", "width", "align",
        "hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
        "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
        { "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
        "link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
        "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
705static const char* const fieldset_contents[] = { FLOW, "legend" } ;
/*
 * More NULL-terminated lists of attribute names (xxx_attr / xxx_attrs /
 * xxx_depr) and of element names (xxx_contents / xxx_content).
 */
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
727static const char* const name_attr[] = { "name", NULL } ;
728static const char* const action_attr[] = { "action", NULL } ;
729static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
730static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
731static const char* const content_attr[] = { "content", NULL } ;
732static const char* const type_attr[] = { "type", NULL } ;
733static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
734static const char* const object_contents[] = { FLOW, "param", NULL } ;
735static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
736static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
737static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
738static const char* const option_elt[] = { "option", NULL } ;
739static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
740static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
741static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
742static const char* const width_attr[] = { "width", NULL } ;
743static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
744static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
745static const char* const language_attr[] = { "language", NULL } ;
746static const char* const select_content[] = { "optgroup", "option", NULL } ;
747static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
748static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200749static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000750static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
751static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
752static const char* const tr_elt[] = { "tr", NULL } ;
753static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
754static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
755static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
756static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
757static const char* const tr_contents[] = { "th", "td", NULL } ;
758static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
759static const char* const li_elt[] = { "li", NULL } ;
760static const char* const ul_depr[] = { "type", "compact", NULL} ;
761static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000762
/* Cast prefix applied to the const name lists when they are stored in
 * html40ElementTable: it drops the inner const qualifier. */
#define DECL (const char **)
764
Daniel Veillard22090732001-07-16 00:06:07 +0000765static const htmlElemDesc
766html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000767{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
768 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
769},
770{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
771 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
772},
773{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
774 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
775},
776{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
777 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
778},
779{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
780 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
781},
782{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
783 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
784},
785{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
786 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
787},
788{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
789 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
790},
791{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
792 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
793},
794{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
795 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
796},
797{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
798 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
799},
800{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
801 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
802},
803{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
804 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
805},
806{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
807 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
808},
809{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
810 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
811},
812{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
813 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
814},
815{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
816 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
817},
818{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
819 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
820},
821{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
822 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
823},
824{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
825 EMPTY , NULL , DECL col_attrs , NULL, NULL
826},
827{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
828 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
829},
830{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
831 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
832},
833{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
834 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
835},
836{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
837 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
838},
839{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
840 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
841},
842{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
843 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
844},
845{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000846 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000847},
848{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
849 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
850},
851{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
852 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
853},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000854{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000855 EMPTY, NULL, DECL embed_attrs, NULL, NULL
856},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000857{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
858 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
859},
860{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
861 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
862},
863{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
864 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
865},
866{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
867 EMPTY, NULL, NULL, DECL frame_attrs, NULL
868},
869{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
870 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
871},
872{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
873 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
874},
875{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
876 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
877},
878{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
879 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
880},
881{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
882 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
883},
884{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
885 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
886},
887{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
888 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
889},
890{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
891 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
892},
893{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
894 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
895},
896{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
897 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
898},
899{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
900 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
901},
902{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
903 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
904},
905{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000906 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000907},
908{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
909 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
910},
911{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
912 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
913},
914{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
915 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
916},
917{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
918 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
919},
920{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
921 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
922},
923{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
924 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
925},
926{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
927 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
928},
929{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
930 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
931},
932{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000933 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000934},
935{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
936 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
937},
938{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
939 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
940},
941{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
942 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
943},
944{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
945 DECL html_flow, "div", DECL html_attrs, NULL, NULL
946},
947{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
948 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
949},
950{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
951 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
952},
953{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000954 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000955},
956{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
957 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
958},
959{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
960 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
961},
962{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000963 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000964},
965{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
966 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
967},
968{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
969 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
970},
971{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
972 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
973},
974{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
975 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
976},
977{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
978 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
979},
980{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
981 DECL select_content, NULL, DECL select_attrs, NULL, NULL
982},
983{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
984 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
985},
986{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
987 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
988},
989{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
990 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
991},
992{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
993 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
994},
995{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
996 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
997},
998{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
999 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1000},
1001{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1002 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1003},
1004{ "table", 0, 0, 0, 0, 0, 0, 0, "",
1005 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1006},
1007{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1008 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1009},
1010{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1011 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1012},
1013{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1014 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1015},
1016{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1017 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1018},
1019{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1020 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1021},
1022{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1023 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1024},
1025{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1026 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1027},
1028{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1029 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1030},
1031{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1032 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033},
1034{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1035 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1036},
1037{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1038 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1039},
1040{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1041 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1042}
Owen Taylor3473f882001-02-23 17:55:21 +00001043};
1044
1045/*
Owen Taylor3473f882001-02-23 17:55:21 +00001046 * start tags that imply the end of current element
1047 */
Daniel Veillard065abe82006-07-03 08:55:04 +00001048static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001049"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1050 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1051 "listing", "xmp", "head", NULL,
1052"head", "p", NULL,
1053"title", "p", NULL,
1054"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +00001055"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001056"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1057 "pre", "listing", "xmp", "head", "li", NULL,
1058"hr", "p", "head", NULL,
1059"h1", "p", "head", NULL,
1060"h2", "p", "head", NULL,
1061"h3", "p", "head", NULL,
1062"h4", "p", "head", NULL,
1063"h5", "p", "head", NULL,
1064"h6", "p", "head", NULL,
1065"dir", "p", "head", NULL,
1066"address", "p", "head", "ul", NULL,
1067"pre", "p", "head", "ul", NULL,
1068"listing", "p", "head", NULL,
1069"xmp", "p", "head", NULL,
1070"blockquote", "p", "head", NULL,
1071"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1072 "xmp", "head", NULL,
1073"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1074 "head", "dd", NULL,
1075"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1076 "head", "dt", NULL,
1077"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1078 "listing", "xmp", NULL,
1079"ol", "p", "head", "ul", NULL,
1080"menu", "p", "head", "ul", NULL,
Daniel Veillard6339c1a2009-08-24 11:59:51 +02001081"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001082"div", "p", "head", NULL,
1083"noscript", "p", "head", NULL,
1084"center", "font", "b", "i", "p", "head", NULL,
1085"a", "a", NULL,
1086"caption", "p", NULL,
1087"colgroup", "caption", "colgroup", "col", "p", NULL,
1088"col", "caption", "col", "p", NULL,
1089"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1090 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001091"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Daniel Veillarde77db162009-08-22 11:32:38 +02001092"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001093"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1094"thead", "caption", "col", "colgroup", NULL,
1095"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1096 "tbody", "p", NULL,
1097"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1098 "tfoot", "tbody", "p", NULL,
1099"optgroup", "option", NULL,
1100"option", "option", NULL,
1101"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1102 "pre", "listing", "xmp", "a", NULL,
1103NULL
1104};
1105
/*
 * HTML elements which are supposed not to have CDATA content and
 * where a p element will be implied.  The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1118
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no NULL terminator, so its length must be taken
 * with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",    /* was misspelled "onrest"; form_attrs uses "onreset" */
    "onchange",
    "onselect"
};
1144
Daniel Veillarda2bc3682001-05-03 08:27:20 +00001145/*
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001146 * This table is used by the htmlparser to know what to do with
1147 * broken html pages. By assigning different priorities to different
1148 * elements the parser can decide how to handle extra endtags.
1149 * Endtags are only allowed to close elements with lower or equal
1150 * priority.
Daniel Veillarde77db162009-08-22 11:32:38 +02001151 */
Daniel Veillarda2bc3682001-05-03 08:27:20 +00001152
/* One row of the end-tag priority table: an element name and its rank. */
typedef struct {
    const char *name;   /* element name; NULL marks the default row */
    int priority;       /* rank compared by htmlAutoCloseOnClose() */
} elementPriority;
1157
Daniel Veillard22090732001-07-16 00:06:07 +00001158static const elementPriority htmlEndPriority[] = {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001159 {"div", 150},
1160 {"td", 160},
1161 {"th", 160},
1162 {"tr", 170},
1163 {"thead", 180},
1164 {"tbody", 180},
1165 {"tfoot", 180},
1166 {"table", 190},
1167 {"head", 200},
1168 {"body", 200},
1169 {"html", 220},
1170 {NULL, 100} /* Default priority */
1171};
Owen Taylor3473f882001-02-23 17:55:21 +00001172
/* Pointers to the first entry of each htmlStartClose group; filled in
 * by htmlInitAutoClose(). */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlInitAutoClose() has built the index. */
static int htmlStartCloseIndexinitialized = 0;
1175
1176/************************************************************************
1177 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001178 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001179 * *
1180 ************************************************************************/
1181
1182/**
1183 * htmlInitAutoClose:
1184 *
1185 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1186 * This is not reentrant. Call xmlInitParser() once before processing in
1187 * case of use in multithreaded programs.
1188 */
1189void
1190htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001191 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001192
1193 if (htmlStartCloseIndexinitialized) return;
1194
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001195 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1196 indx = 0;
1197 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001198 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001199 while (htmlStartClose[i] != NULL) i++;
1200 i++;
1201 }
1202 htmlStartCloseIndexinitialized = 1;
1203}
1204
1205/**
1206 * htmlTagLookup:
1207 * @tag: The tag name in lowercase
1208 *
1209 * Lookup the HTML tag in the ElementTable
1210 *
1211 * Returns the related htmlElemDescPtr or NULL if not found.
1212 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001213const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001214htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001215 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001216
1217 for (i = 0; i < (sizeof(html40ElementTable) /
1218 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001219 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001220 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001221 }
1222 return(NULL);
1223}
1224
1225/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001226 * htmlGetEndPriority:
1227 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001228 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001229 * Return value: The "endtag" priority.
1230 **/
1231static int
1232htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001233 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001234
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001235 while ((htmlEndPriority[i].name != NULL) &&
1236 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1237 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001238
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001239 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001240}
1241
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001242
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001243/**
Owen Taylor3473f882001-02-23 17:55:21 +00001244 * htmlCheckAutoClose:
1245 * @newtag: The new tag name
1246 * @oldtag: The old tag name
1247 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001248 * Checks whether the new tag is one of the registered valid tags for
1249 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001250 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1251 *
1252 * Returns 0 if no, 1 if yes.
1253 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001254static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001255htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1256{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001257 int i, indx;
1258 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001259
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001260 if (htmlStartCloseIndexinitialized == 0)
1261 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001262
1263 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001264 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001265 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001266 if (closed == NULL)
1267 return (0);
1268 if (xmlStrEqual(BAD_CAST * closed, newtag))
1269 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001270 }
1271
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001272 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001273 i++;
1274 while (htmlStartClose[i] != NULL) {
1275 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001276 return (1);
1277 }
1278 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001279 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001280 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001281}
1282
1283/**
1284 * htmlAutoCloseOnClose:
1285 * @ctxt: an HTML parser context
1286 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001287 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001288 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001289 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001290 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001291static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001292htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1293{
1294 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001295 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001296
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001297 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001298
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001299 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001300
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001301 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1302 break;
1303 /*
1304 * A missplaced endtag can only close elements with lower
1305 * or equal priority, so if we find an element with higher
1306 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001307 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001308 */
1309 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1310 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001311 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001312 if (i < 0)
1313 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001314
1315 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001316 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001317 if ((info != NULL) && (info->endTag == 3)) {
1318 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1319 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001320 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001321 }
1322 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1323 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001324 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001325 }
1326}
1327
1328/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001329 * htmlAutoCloseOnEnd:
1330 * @ctxt: an HTML parser context
1331 *
1332 * Close all remaining tags at the end of the stream
1333 */
1334static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001335htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1336{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001337 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001338
William M. Brack899e64a2003-09-26 18:03:42 +00001339 if (ctxt->nameNr == 0)
1340 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001341 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001342 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1343 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001344 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001345 }
1346}
1347
1348/**
Owen Taylor3473f882001-02-23 17:55:21 +00001349 * htmlAutoClose:
1350 * @ctxt: an HTML parser context
1351 * @newtag: The new tag name or NULL
1352 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001353 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001354 * The list is kept in htmlStartClose array. This function is
1355 * called when a new tag has been detected and generates the
1356 * appropriates closes if possible/needed.
1357 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001358 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001359 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001360static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001361htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1362{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001363 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001364 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001365 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1366 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001367 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001368 }
1369 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001370 htmlAutoCloseOnEnd(ctxt);
1371 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001372 }
1373 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001374 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1375 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1376 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001377 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1378 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001379 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001380 }
Owen Taylor3473f882001-02-23 17:55:21 +00001381}
1382
1383/**
1384 * htmlAutoCloseTag:
1385 * @doc: the HTML document
1386 * @name: The tag name
1387 * @elem: the HTML element
1388 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001389 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001390 * The list is kept in htmlStartClose array. This function checks
1391 * if the element or one of it's children would autoclose the
1392 * given tag.
1393 *
1394 * Returns 1 if autoclose, 0 otherwise
1395 */
1396int
1397htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1398 htmlNodePtr child;
1399
1400 if (elem == NULL) return(1);
1401 if (xmlStrEqual(name, elem->name)) return(0);
1402 if (htmlCheckAutoClose(elem->name, name)) return(1);
1403 child = elem->children;
1404 while (child != NULL) {
1405 if (htmlAutoCloseTag(doc, name, child)) return(1);
1406 child = child->next;
1407 }
1408 return(0);
1409}
1410
1411/**
1412 * htmlIsAutoClosed:
1413 * @doc: the HTML document
1414 * @elem: the HTML element
1415 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001416 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001417 * The list is kept in htmlStartClose array. This function checks
1418 * if a tag is autoclosed by one of it's child
1419 *
1420 * Returns 1 if autoclosed, 0 otherwise
1421 */
1422int
1423htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1424 htmlNodePtr child;
1425
1426 if (elem == NULL) return(1);
1427 child = elem->children;
1428 while (child != NULL) {
1429 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1430 child = child->next;
1431 }
1432 return(0);
1433}
1434
1435/**
1436 * htmlCheckImplied:
1437 * @ctxt: an HTML parser context
1438 * @newtag: The new tag name
1439 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001440 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001441 * called when a new tag has been detected and generates the
1442 * appropriates implicit tags if missing
1443 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001444static void
Owen Taylor3473f882001-02-23 17:55:21 +00001445htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardb468f742009-08-24 18:45:33 +02001446 int i;
1447
Daniel Veillarde20fb5a2010-01-29 20:47:08 +01001448 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1449 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001450 if (!htmlOmittedDefaultValue)
1451 return;
1452 if (xmlStrEqual(newtag, BAD_CAST"html"))
1453 return;
1454 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001455 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001456 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1458 }
1459 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1460 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001461 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001462 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1463 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1464 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1465 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1466 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1467 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001468 if (ctxt->html >= 3) {
1469 /* we already saw or generated an <head> before */
1470 return;
1471 }
1472 /*
1473 * dropped OBJECT ... i you put it first BODY will be
1474 * assumed !
1475 */
1476 htmlnamePush(ctxt, BAD_CAST"head");
1477 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1478 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001479 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1480 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1481 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001482 if (ctxt->html >= 10) {
1483 /* we already saw or generated a <body> before */
1484 return;
1485 }
Owen Taylor3473f882001-02-23 17:55:21 +00001486 for (i = 0;i < ctxt->nameNr;i++) {
1487 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1488 return;
1489 }
1490 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1491 return;
1492 }
1493 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001494
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001495 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001496 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1498 }
1499}
1500
1501/**
1502 * htmlCheckParagraph
1503 * @ctxt: an HTML parser context
1504 *
1505 * Check whether a p element need to be implied before inserting
1506 * characters in the current element.
1507 *
1508 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1509 * in case of error.
1510 */
1511
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001512static int
Owen Taylor3473f882001-02-23 17:55:21 +00001513htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1514 const xmlChar *tag;
1515 int i;
1516
1517 if (ctxt == NULL)
1518 return(-1);
1519 tag = ctxt->name;
1520 if (tag == NULL) {
1521 htmlAutoClose(ctxt, BAD_CAST"p");
1522 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001523 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001524 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1525 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1526 return(1);
1527 }
1528 if (!htmlOmittedDefaultValue)
1529 return(0);
1530 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1531 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001532 htmlAutoClose(ctxt, BAD_CAST"p");
1533 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001534 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001535 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1536 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1537 return(1);
1538 }
1539 }
1540 return(0);
1541}
1542
1543/**
1544 * htmlIsScriptAttribute:
1545 * @name: an attribute name
1546 *
1547 * Check if an attribute is of content type Script
1548 *
1549 * Returns 1 is the attribute is a script 0 otherwise
1550 */
1551int
1552htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001553 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001554
1555 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001556 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001557 /*
1558 * all script attributes start with 'on'
1559 */
1560 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001561 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001562 for (i = 0;
1563 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1564 i++) {
1565 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1566 return(1);
1567 }
1568 return(0);
1569}
1570
1571/************************************************************************
1572 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001573 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001574 * *
1575 ************************************************************************/
1576
1577
Daniel Veillard22090732001-07-16 00:06:07 +00001578static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001579/*
1580 * the 4 absolute ones, plus apostrophe.
1581 */
1582{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1583{ 38, "amp", "ampersand, U+0026 ISOnum" },
1584{ 39, "apos", "single quote" },
1585{ 60, "lt", "less-than sign, U+003C ISOnum" },
1586{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1587
1588/*
1589 * A bunch still in the 128-255 range
1590 * Replacing them depend really on the charset used.
1591 */
1592{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1593{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1594{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1595{ 163, "pound","pound sign, U+00A3 ISOnum" },
1596{ 164, "curren","currency sign, U+00A4 ISOnum" },
1597{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1598{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1599{ 167, "sect", "section sign, U+00A7 ISOnum" },
1600{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1601{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1602{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1603{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1604{ 172, "not", "not sign, U+00AC ISOnum" },
1605{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1606{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1607{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1608{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1609{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1610{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1611{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1612{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1613{ 181, "micro","micro sign, U+00B5 ISOnum" },
1614{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1615{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1616{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1617{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1618{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1619{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1620{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1621{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1622{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1623{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1624{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1625{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1626{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1627{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1628{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1629{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1630{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1631{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1632{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1633{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1634{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1635{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1636{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1637{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1638{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1639{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1640{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1641{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1642{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1643{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1644{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1645{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1646{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1647{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1648{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1649{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1650{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1651{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1652{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1653{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1654{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1655{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1656{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1657{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1658{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1659{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1660{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1661{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1662{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1663{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1664{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1665{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1666{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1667{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1668{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1669{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1670{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1671{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1672{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1673{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1674{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1675{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1676{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1677{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1678{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1679{ 247, "divide","division sign, U+00F7 ISOnum" },
1680{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1681{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1682{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1683{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1684{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1685{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1686{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1687{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1688
1689{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1690{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1691{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1692{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1693{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1694
1695/*
1696 * Anything below should really be kept as entities references
1697 */
1698{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1699
1700{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1701{ 732, "tilde","small tilde, U+02DC ISOdia" },
1702
1703{ 913, "Alpha","greek capital letter alpha, U+0391" },
1704{ 914, "Beta", "greek capital letter beta, U+0392" },
1705{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1706{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1707{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1708{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1709{ 919, "Eta", "greek capital letter eta, U+0397" },
1710{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1711{ 921, "Iota", "greek capital letter iota, U+0399" },
1712{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001713{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001714{ 924, "Mu", "greek capital letter mu, U+039C" },
1715{ 925, "Nu", "greek capital letter nu, U+039D" },
1716{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1717{ 927, "Omicron","greek capital letter omicron, U+039F" },
1718{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1719{ 929, "Rho", "greek capital letter rho, U+03A1" },
1720{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1721{ 932, "Tau", "greek capital letter tau, U+03A4" },
1722{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1723{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1724{ 935, "Chi", "greek capital letter chi, U+03A7" },
1725{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1726{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1727
1728{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1729{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1730{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1731{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1732{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1733{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1734{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1735{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1736{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1737{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1738{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1739{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1740{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1741{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1742{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1743{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1744{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1745{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1746{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1747{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1748{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1749{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1750{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1751{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1752{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1753{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1754{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1755{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1756
1757{ 8194, "ensp", "en space, U+2002 ISOpub" },
1758{ 8195, "emsp", "em space, U+2003 ISOpub" },
1759{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1760{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1761{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1762{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1763{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1764{ 8211, "ndash","en dash, U+2013 ISOpub" },
1765{ 8212, "mdash","em dash, U+2014 ISOpub" },
1766{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1767{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1768{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1769{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1770{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1771{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1772{ 8224, "dagger","dagger, U+2020 ISOpub" },
1773{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1774
1775{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1776{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1777
1778{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1779
1780{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1781{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1782
1783{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1784{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1785
1786{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1787{ 8260, "frasl","fraction slash, U+2044 NEW" },
1788
1789{ 8364, "euro", "euro sign, U+20AC NEW" },
1790
1791{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1792{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1793{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1794{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1795{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1796{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1797{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1798{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1799{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1800{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1801{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1802{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1803{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1804{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1805{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1806{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1807
1808{ 8704, "forall","for all, U+2200 ISOtech" },
1809{ 8706, "part", "partial differential, U+2202 ISOtech" },
1810{ 8707, "exist","there exists, U+2203 ISOtech" },
1811{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1812{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1813{ 8712, "isin", "element of, U+2208 ISOtech" },
1814{ 8713, "notin","not an element of, U+2209 ISOtech" },
1815{ 8715, "ni", "contains as member, U+220B ISOtech" },
1816{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001817{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001818{ 8722, "minus","minus sign, U+2212 ISOtech" },
1819{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1820{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1821{ 8733, "prop", "proportional to, U+221D ISOtech" },
1822{ 8734, "infin","infinity, U+221E ISOtech" },
1823{ 8736, "ang", "angle, U+2220 ISOamso" },
1824{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1825{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1826{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1827{ 8746, "cup", "union = cup, U+222A ISOtech" },
1828{ 8747, "int", "integral, U+222B ISOtech" },
1829{ 8756, "there4","therefore, U+2234 ISOtech" },
1830{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1831{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1832{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1833{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1834{ 8801, "equiv","identical to, U+2261 ISOtech" },
1835{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1836{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1837{ 8834, "sub", "subset of, U+2282 ISOtech" },
1838{ 8835, "sup", "superset of, U+2283 ISOtech" },
1839{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1840{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1841{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1842{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1843{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1844{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1845{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1846{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1847{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1848{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1849{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1850{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1851{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1852{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1853
1854{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1855{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1856{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1857{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1858
1859};
1860
1861/************************************************************************
1862 * *
1863 * Commodity functions to handle entities *
1864 * *
1865 ************************************************************************/
1866
/*
 * Macro used to grow the current buffer: <buffer>_size is doubled and
 * <buffer> reallocated to that size.
 * The macro expects a variable named ctxt to be in scope. When the
 * reallocation fails the error is reported, the old buffer is freed and
 * the enclosing function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {							\
	buffer = tmp;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1881
1882/**
1883 * htmlEntityLookup:
1884 * @name: the entity name
1885 *
1886 * Lookup the given entity in EntitiesTable
1887 *
1888 * TODO: the linear scan is really ugly, an hash table is really needed.
1889 *
1890 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1891 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001892const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001893htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001894 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001895
1896 for (i = 0;i < (sizeof(html40EntitiesTable)/
1897 sizeof(html40EntitiesTable[0]));i++) {
1898 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001899 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001900 }
1901 }
1902 return(NULL);
1903}
1904
1905/**
1906 * htmlEntityValueLookup:
1907 * @value: the entity's unicode value
1908 *
1909 * Lookup the given entity in EntitiesTable
1910 *
1911 * TODO: the linear scan is really ugly, an hash table is really needed.
1912 *
1913 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1914 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001915const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001916htmlEntityValueLookup(unsigned int value) {
1917 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001918
1919 for (i = 0;i < (sizeof(html40EntitiesTable)/
1920 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001921 if (html40EntitiesTable[i].value >= value) {
1922 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001923 break;
William M. Brack78637da2003-07-31 14:47:38 +00001924 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001925 }
Owen Taylor3473f882001-02-23 17:55:21 +00001926 }
1927 return(NULL);
1928}
1929
1930/**
1931 * UTF8ToHtml:
1932 * @out: a pointer to an array of bytes to store the result
1933 * @outlen: the length of @out
1934 * @in: a pointer to an array of UTF-8 chars
1935 * @inlen: the length of @in
1936 *
1937 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1938 * plus HTML entities block of chars out.
1939 *
1940 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1941 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001942 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001943 * The value of @outlen after return is the number of octets consumed.
1944 */
1945int
1946UTF8ToHtml(unsigned char* out, int *outlen,
1947 const unsigned char* in, int *inlen) {
1948 const unsigned char* processed = in;
1949 const unsigned char* outend;
1950 const unsigned char* outstart = out;
1951 const unsigned char* instart = in;
1952 const unsigned char* inend;
1953 unsigned int c, d;
1954 int trailing;
1955
Daniel Veillardce682bc2004-11-05 17:22:25 +00001956 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001957 if (in == NULL) {
1958 /*
1959 * initialization nothing to do
1960 */
1961 *outlen = 0;
1962 *inlen = 0;
1963 return(0);
1964 }
1965 inend = in + (*inlen);
1966 outend = out + (*outlen);
1967 while (in < inend) {
1968 d = *in++;
1969 if (d < 0x80) { c= d; trailing= 0; }
1970 else if (d < 0xC0) {
1971 /* trailing byte in leading position */
1972 *outlen = out - outstart;
1973 *inlen = processed - instart;
1974 return(-2);
1975 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1976 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1977 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1978 else {
1979 /* no chance for this in Ascii */
1980 *outlen = out - outstart;
1981 *inlen = processed - instart;
1982 return(-2);
1983 }
1984
1985 if (inend - in < trailing) {
1986 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02001987 }
Owen Taylor3473f882001-02-23 17:55:21 +00001988
1989 for ( ; trailing; trailing--) {
1990 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1991 break;
1992 c <<= 6;
1993 c |= d & 0x3F;
1994 }
1995
1996 /* assertion: c is a single UTF-4 value */
1997 if (c < 0x80) {
1998 if (out + 1 >= outend)
1999 break;
2000 *out++ = c;
2001 } else {
2002 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00002003 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00002004 const char *cp;
2005 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00002006
2007 /*
2008 * Try to lookup a predefined HTML entity for it
2009 */
2010
2011 ent = htmlEntityValueLookup(c);
2012 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00002013 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2014 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00002015 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00002016 else
2017 cp = ent->name;
2018 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00002019 if (out + 2 + len >= outend)
2020 break;
2021 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00002022 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00002023 out += len;
2024 *out++ = ';';
2025 }
2026 processed = in;
2027 }
2028 *outlen = out - outstart;
2029 *inlen = processed - instart;
2030 return(0);
2031}
2032
2033/**
2034 * htmlEncodeEntities:
2035 * @out: a pointer to an array of bytes to store the result
2036 * @outlen: the length of @out
2037 * @in: a pointer to an array of UTF-8 chars
2038 * @inlen: the length of @in
2039 * @quoteChar: the quote character to escape (' or ") or zero.
2040 *
2041 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2042 * plus HTML entities block of chars out.
2043 *
2044 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2045 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002046 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00002047 * The value of @outlen after return is the number of octets consumed.
2048 */
2049int
2050htmlEncodeEntities(unsigned char* out, int *outlen,
2051 const unsigned char* in, int *inlen, int quoteChar) {
2052 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002053 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00002054 const unsigned char* outstart = out;
2055 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002056 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00002057 unsigned int c, d;
2058 int trailing;
2059
Daniel Veillardce682bc2004-11-05 17:22:25 +00002060 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2061 return(-1);
2062 outend = out + (*outlen);
2063 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00002064 while (in < inend) {
2065 d = *in++;
2066 if (d < 0x80) { c= d; trailing= 0; }
2067 else if (d < 0xC0) {
2068 /* trailing byte in leading position */
2069 *outlen = out - outstart;
2070 *inlen = processed - instart;
2071 return(-2);
2072 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2073 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2074 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2075 else {
2076 /* no chance for this in Ascii */
2077 *outlen = out - outstart;
2078 *inlen = processed - instart;
2079 return(-2);
2080 }
2081
2082 if (inend - in < trailing)
2083 break;
2084
2085 while (trailing--) {
2086 if (((d= *in++) & 0xC0) != 0x80) {
2087 *outlen = out - outstart;
2088 *inlen = processed - instart;
2089 return(-2);
2090 }
2091 c <<= 6;
2092 c |= d & 0x3F;
2093 }
2094
2095 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002096 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2097 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002098 if (out >= outend)
2099 break;
2100 *out++ = c;
2101 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002102 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002103 const char *cp;
2104 char nbuf[16];
2105 int len;
2106
2107 /*
2108 * Try to lookup a predefined HTML entity for it
2109 */
2110 ent = htmlEntityValueLookup(c);
2111 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002112 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002113 cp = nbuf;
2114 }
2115 else
2116 cp = ent->name;
2117 len = strlen(cp);
2118 if (out + 2 + len > outend)
2119 break;
2120 *out++ = '&';
2121 memcpy(out, cp, len);
2122 out += len;
2123 *out++ = ';';
2124 }
2125 processed = in;
2126 }
2127 *outlen = out - outstart;
2128 *inlen = processed - instart;
2129 return(0);
2130}
2131
Owen Taylor3473f882001-02-23 17:55:21 +00002132/************************************************************************
2133 * *
2134 * Commodity functions to handle streams *
2135 * *
2136 ************************************************************************/
2137
2138/**
Owen Taylor3473f882001-02-23 17:55:21 +00002139 * htmlNewInputStream:
2140 * @ctxt: an HTML parser context
2141 *
2142 * Create a new input stream structure
2143 * Returns the new input stream or NULL
2144 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002145static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002146htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2147 htmlParserInputPtr input;
2148
2149 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2150 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002151 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002152 return(NULL);
2153 }
2154 memset(input, 0, sizeof(htmlParserInput));
2155 input->filename = NULL;
2156 input->directory = NULL;
2157 input->base = NULL;
2158 input->cur = NULL;
2159 input->buf = NULL;
2160 input->line = 1;
2161 input->col = 1;
2162 input->buf = NULL;
2163 input->free = NULL;
2164 input->version = NULL;
2165 input->consumed = 0;
2166 input->length = 0;
2167 return(input);
2168}
2169
2170
2171/************************************************************************
2172 * *
2173 * Commodity functions, cleanup needed ? *
2174 * *
2175 ************************************************************************/
/*
 * All tags allowing PCDATA in the HTML 4.01 loose DTD, in alphabetical
 * order.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002190
2191/**
2192 * areBlanks:
2193 * @ctxt: an HTML parser context
2194 * @str: a xmlChar *
2195 * @len: the size of @str
2196 *
2197 * Is this a sequence of blank chars that one can ignore ?
2198 *
2199 * Returns 1 if ignorable 0 otherwise.
2200 */
2201
2202static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002203 unsigned int i;
2204 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002205 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002206 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002207
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002208 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002209 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002210
2211 if (CUR == 0) return(1);
2212 if (CUR != '<') return(0);
2213 if (ctxt->name == NULL)
2214 return(1);
2215 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2216 return(1);
2217 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2218 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002219
2220 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2221 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2222 dtd = xmlGetIntSubset(ctxt->myDoc);
2223 if (dtd != NULL && dtd->ExternalID != NULL) {
2224 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2225 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2226 return(1);
2227 }
2228 }
2229
Owen Taylor3473f882001-02-23 17:55:21 +00002230 if (ctxt->node == NULL) return(0);
2231 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002232 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2233 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002234 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002235 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2236 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002237 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002238 for all tags "b" allowing PCDATA */
2239 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2240 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2241 return(0);
2242 }
2243 }
Owen Taylor3473f882001-02-23 17:55:21 +00002244 } else if (xmlNodeIsText(lastChild)) {
2245 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002246 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002247 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002248 for all tags "p" allowing PCDATA */
2249 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2250 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2251 return(0);
2252 }
2253 }
Owen Taylor3473f882001-02-23 17:55:21 +00002254 }
2255 return(1);
2256}
2257
2258/**
Owen Taylor3473f882001-02-23 17:55:21 +00002259 * htmlNewDocNoDtD:
2260 * @URI: URI for the dtd, or NULL
2261 * @ExternalID: the external ID of the DTD, or NULL
2262 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002263 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2264 * are NULL
2265 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002266 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002267 */
2268htmlDocPtr
2269htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2270 xmlDocPtr cur;
2271
2272 /*
2273 * Allocate a new document and fill the fields.
2274 */
2275 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2276 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002277 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002278 return(NULL);
2279 }
2280 memset(cur, 0, sizeof(xmlDoc));
2281
2282 cur->type = XML_HTML_DOCUMENT_NODE;
2283 cur->version = NULL;
2284 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002285 cur->doc = cur;
2286 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002287 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002288 cur->extSubset = NULL;
2289 cur->oldNs = NULL;
2290 cur->encoding = NULL;
2291 cur->standalone = 1;
2292 cur->compression = 0;
2293 cur->ids = NULL;
2294 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002295 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002296 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002297 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002298 if ((ExternalID != NULL) ||
2299 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002300 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002301 return(cur);
2302}
2303
2304/**
2305 * htmlNewDoc:
2306 * @URI: URI for the dtd, or NULL
2307 * @ExternalID: the external ID of the DTD, or NULL
2308 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002309 * Creates a new HTML document
2310 *
Owen Taylor3473f882001-02-23 17:55:21 +00002311 * Returns a new document
2312 */
2313htmlDocPtr
2314htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2315 if ((URI == NULL) && (ExternalID == NULL))
2316 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002317 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2318 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002319
2320 return(htmlNewDocNoDtD(URI, ExternalID));
2321}
2322
2323
2324/************************************************************************
2325 * *
2326 * The parser itself *
2327 * Relates to http://www.w3.org/TR/html40 *
2328 * *
2329 ************************************************************************/
2330
2331/************************************************************************
2332 * *
2333 * The parser itself *
2334 * *
2335 ************************************************************************/
2336
/* Forward declaration: htmlParseName falls back to this routine, which is defined after it. */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002338
Owen Taylor3473f882001-02-23 17:55:21 +00002339/**
2340 * htmlParseHTMLName:
2341 * @ctxt: an HTML parser context
2342 *
2343 * parse an HTML tag or attribute name, note that we convert it to lowercase
2344 * since HTML names are not case-sensitive.
2345 *
2346 * Returns the Tag Name parsed or NULL
2347 */
2348
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002349static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002350htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002351 int i = 0;
2352 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2353
William M. Brackd1757ab2004-10-02 22:07:48 +00002354 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002355 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002356
2357 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002358 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002359 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2360 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002361 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2362 else loc[i] = CUR;
2363 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002364
Owen Taylor3473f882001-02-23 17:55:21 +00002365 NEXT;
2366 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002367
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002368 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002369}
2370
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002371
2372/**
2373 * htmlParseHTMLName_nonInvasive:
2374 * @ctxt: an HTML parser context
2375 *
2376 * parse an HTML tag or attribute name, note that we convert it to lowercase
2377 * since HTML names are not case-sensitive, this doesn't consume the data
2378 * from the stream, it's a look-ahead
2379 *
2380 * Returns the Tag Name parsed or NULL
2381 */
2382
2383static const xmlChar *
2384htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2385 int i = 0;
2386 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2387
2388 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2389 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002390
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002391 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2392 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2393 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2394 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2395 else loc[i] = NXT(1+i);
2396 i++;
2397 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002398
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002399 return(xmlDictLookup(ctxt->dict, loc, i));
2400}
2401
2402
Owen Taylor3473f882001-02-23 17:55:21 +00002403/**
2404 * htmlParseName:
2405 * @ctxt: an HTML parser context
2406 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002407 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002408 *
2409 * Returns the Name parsed or NULL
2410 */
2411
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002412static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002413htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002414 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002415 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002416 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002417
2418 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002419
2420 /*
2421 * Accelerator for simple ASCII names
2422 */
2423 in = ctxt->input->cur;
2424 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2425 ((*in >= 0x41) && (*in <= 0x5A)) ||
2426 (*in == '_') || (*in == ':')) {
2427 in++;
2428 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2429 ((*in >= 0x41) && (*in <= 0x5A)) ||
2430 ((*in >= 0x30) && (*in <= 0x39)) ||
2431 (*in == '_') || (*in == '-') ||
2432 (*in == ':') || (*in == '.'))
2433 in++;
2434 if ((*in > 0) && (*in < 0x80)) {
2435 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002436 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002437 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002438 ctxt->nbChars += count;
2439 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002440 return(ret);
2441 }
2442 }
2443 return(htmlParseNameComplex(ctxt));
2444}
2445
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002446static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002447htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002448 int len = 0, l;
2449 int c;
2450 int count = 0;
2451
2452 /*
2453 * Handler for more complex cases
2454 */
2455 GROW;
2456 c = CUR_CHAR(l);
2457 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2458 (!IS_LETTER(c) && (c != '_') &&
2459 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002460 return(NULL);
2461 }
2462
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002463 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2464 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2465 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002466 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002467 (IS_COMBINING(c)) ||
2468 (IS_EXTENDER(c)))) {
2469 if (count++ > 100) {
2470 count = 0;
2471 GROW;
2472 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002473 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002474 NEXTL(l);
2475 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002476 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002477 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002478}
2479
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002480
Owen Taylor3473f882001-02-23 17:55:21 +00002481/**
2482 * htmlParseHTMLAttribute:
2483 * @ctxt: an HTML parser context
2484 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002485 *
Owen Taylor3473f882001-02-23 17:55:21 +00002486 * parse an HTML attribute value till the stop (quote), if
2487 * stop is 0 then it stops at the first space
2488 *
2489 * Returns the attribute parsed or NULL
2490 */
2491
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002492static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002493htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2494 xmlChar *buffer = NULL;
2495 int buffer_size = 0;
2496 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002497 const xmlChar *name = NULL;
2498 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002499 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002500
2501 /*
2502 * allocate a translation buffer.
2503 */
2504 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002505 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002506 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002507 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002508 return(NULL);
2509 }
2510 out = buffer;
2511
2512 /*
2513 * Ok loop until we reach one of the ending chars
2514 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002515 while ((CUR != 0) && (CUR != stop)) {
2516 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002517 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002518 if (CUR == '&') {
2519 if (NXT(1) == '#') {
2520 unsigned int c;
2521 int bits;
2522
2523 c = htmlParseCharRef(ctxt);
2524 if (c < 0x80)
2525 { *out++ = c; bits= -6; }
2526 else if (c < 0x800)
2527 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2528 else if (c < 0x10000)
2529 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002530 else
Owen Taylor3473f882001-02-23 17:55:21 +00002531 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002532
Owen Taylor3473f882001-02-23 17:55:21 +00002533 for ( ; bits >= 0; bits-= 6) {
2534 *out++ = ((c >> bits) & 0x3F) | 0x80;
2535 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002536
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002537 if (out - buffer > buffer_size - 100) {
2538 int indx = out - buffer;
2539
2540 growBuffer(buffer);
2541 out = &buffer[indx];
2542 }
Owen Taylor3473f882001-02-23 17:55:21 +00002543 } else {
2544 ent = htmlParseEntityRef(ctxt, &name);
2545 if (name == NULL) {
2546 *out++ = '&';
2547 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002548 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002549
2550 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002551 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002552 }
2553 } else if (ent == NULL) {
2554 *out++ = '&';
2555 cur = name;
2556 while (*cur != 0) {
2557 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002558 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002559
2560 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002561 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002562 }
2563 *out++ = *cur++;
2564 }
Owen Taylor3473f882001-02-23 17:55:21 +00002565 } else {
2566 unsigned int c;
2567 int bits;
2568
2569 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002570 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002571
2572 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002573 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002574 }
Daniel Veillard48519092006-10-17 15:56:35 +00002575 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002576 if (c < 0x80)
2577 { *out++ = c; bits= -6; }
2578 else if (c < 0x800)
2579 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2580 else if (c < 0x10000)
2581 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002582 else
Owen Taylor3473f882001-02-23 17:55:21 +00002583 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002584
Owen Taylor3473f882001-02-23 17:55:21 +00002585 for ( ; bits >= 0; bits-= 6) {
2586 *out++ = ((c >> bits) & 0x3F) | 0x80;
2587 }
Owen Taylor3473f882001-02-23 17:55:21 +00002588 }
2589 }
2590 } else {
2591 unsigned int c;
2592 int bits, l;
2593
2594 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002595 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002596
2597 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002598 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002599 }
2600 c = CUR_CHAR(l);
2601 if (c < 0x80)
2602 { *out++ = c; bits= -6; }
2603 else if (c < 0x800)
2604 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2605 else if (c < 0x10000)
2606 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002607 else
Owen Taylor3473f882001-02-23 17:55:21 +00002608 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002609
Owen Taylor3473f882001-02-23 17:55:21 +00002610 for ( ; bits >= 0; bits-= 6) {
2611 *out++ = ((c >> bits) & 0x3F) | 0x80;
2612 }
2613 NEXT;
2614 }
2615 }
Daniel Veillard13cee4e2009-09-05 14:52:55 +02002616 *out = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002617 return(buffer);
2618}
2619
2620/**
Owen Taylor3473f882001-02-23 17:55:21 +00002621 * htmlParseEntityRef:
2622 * @ctxt: an HTML parser context
2623 * @str: location to store the entity name
2624 *
2625 * parse an HTML ENTITY references
2626 *
2627 * [68] EntityRef ::= '&' Name ';'
2628 *
2629 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2630 * if non-NULL *str will have to be freed by the caller.
2631 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002632const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002633htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2634 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002635 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002636
2637 if (str != NULL) *str = NULL;
2638 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002639
2640 if (CUR == '&') {
2641 NEXT;
2642 name = htmlParseName(ctxt);
2643 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002644 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2645 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002646 } else {
2647 GROW;
2648 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002649 if (str != NULL)
2650 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002651
2652 /*
2653 * Lookup the entity in the table.
2654 */
2655 ent = htmlEntityLookup(name);
2656 if (ent != NULL) /* OK that's ugly !!! */
2657 NEXT;
2658 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002659 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2660 "htmlParseEntityRef: expecting ';'\n",
2661 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002662 if (str != NULL)
2663 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002664 }
2665 }
2666 }
2667 return(ent);
2668}
2669
2670/**
2671 * htmlParseAttValue:
2672 * @ctxt: an HTML parser context
2673 *
2674 * parse a value for an attribute
2675 * Note: the parser won't do substitution of entities here, this
2676 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002677 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002678 *
2679 * Returns the AttValue parsed or NULL.
2680 */
2681
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002682static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002683htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2684 xmlChar *ret = NULL;
2685
2686 if (CUR == '"') {
2687 NEXT;
2688 ret = htmlParseHTMLAttribute(ctxt, '"');
2689 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002690 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2691 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002692 } else
2693 NEXT;
2694 } else if (CUR == '\'') {
2695 NEXT;
2696 ret = htmlParseHTMLAttribute(ctxt, '\'');
2697 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002698 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2699 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002700 } else
2701 NEXT;
2702 } else {
2703 /*
2704 * That's an HTMLism, the attribute value may not be quoted
2705 */
2706 ret = htmlParseHTMLAttribute(ctxt, 0);
2707 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002708 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2709 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002710 }
2711 }
2712 return(ret);
2713}
2714
2715/**
2716 * htmlParseSystemLiteral:
2717 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002718 *
Owen Taylor3473f882001-02-23 17:55:21 +00002719 * parse an HTML Literal
2720 *
2721 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2722 *
2723 * Returns the SystemLiteral parsed or NULL
2724 */
2725
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002726static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002727htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2728 const xmlChar *q;
2729 xmlChar *ret = NULL;
2730
2731 if (CUR == '"') {
2732 NEXT;
2733 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002734 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002735 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002736 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002737 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2738 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002739 } else {
2740 ret = xmlStrndup(q, CUR_PTR - q);
2741 NEXT;
2742 }
2743 } else if (CUR == '\'') {
2744 NEXT;
2745 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002746 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002747 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002748 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002749 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2750 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002751 } else {
2752 ret = xmlStrndup(q, CUR_PTR - q);
2753 NEXT;
2754 }
2755 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002756 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2757 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002758 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002759
Owen Taylor3473f882001-02-23 17:55:21 +00002760 return(ret);
2761}
2762
2763/**
2764 * htmlParsePubidLiteral:
2765 * @ctxt: an HTML parser context
2766 *
2767 * parse an HTML public literal
2768 *
2769 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2770 *
2771 * Returns the PubidLiteral parsed or NULL.
2772 */
2773
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002774static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002775htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2776 const xmlChar *q;
2777 xmlChar *ret = NULL;
2778 /*
2779 * Name ::= (Letter | '_') (NameChar)*
2780 */
2781 if (CUR == '"') {
2782 NEXT;
2783 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002784 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002785 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002786 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2787 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002788 } else {
2789 ret = xmlStrndup(q, CUR_PTR - q);
2790 NEXT;
2791 }
2792 } else if (CUR == '\'') {
2793 NEXT;
2794 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002795 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002796 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002797 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002798 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2799 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002800 } else {
2801 ret = xmlStrndup(q, CUR_PTR - q);
2802 NEXT;
2803 }
2804 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002805 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2806 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002807 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002808
Owen Taylor3473f882001-02-23 17:55:21 +00002809 return(ret);
2810}
2811
2812/**
2813 * htmlParseScript:
2814 * @ctxt: an HTML parser context
2815 *
2816 * parse the content of an HTML SCRIPT or STYLE element
2817 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2818 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2819 * http://www.w3.org/TR/html4/types.html#type-script
2820 * http://www.w3.org/TR/html4/types.html#h-6.15
2821 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2822 *
2823 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2824 * element and the value of intrinsic event attributes. User agents must
2825 * not evaluate script data as HTML markup but instead must pass it on as
2826 * data to a script engine.
2827 * NOTES:
2828 * - The content is passed like CDATA
2829 * - the attributes for style and scripting "onXXX" are also described
2830 * as CDATA but SGML allows entities references in attributes so their
2831 * processing is identical as other attributes
2832 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002833static void
Owen Taylor3473f882001-02-23 17:55:21 +00002834htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002835 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002836 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002837 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002838
2839 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002840 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002841 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002842 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002843 /*
2844 * One should break here, the specification is clear:
2845 * Authors should therefore escape "</" within the content.
2846 * Escape mechanisms are specific to each scripting or
2847 * style sheet language.
2848 *
2849 * In recovery mode, only break if end tag match the
2850 * current tag, effectively ignoring all tags inside the
2851 * script/style block and treating the entire block as
2852 * CDATA.
2853 */
2854 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002855 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2856 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002857 {
2858 break; /* while */
2859 } else {
2860 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002861 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002862 ctxt->name, NULL);
2863 }
2864 } else {
2865 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002866 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002867 {
2868 break; /* while */
2869 }
2870 }
Owen Taylor3473f882001-02-23 17:55:21 +00002871 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002872 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002873 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2874 if (ctxt->sax->cdataBlock!= NULL) {
2875 /*
2876 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2877 */
2878 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002879 } else if (ctxt->sax->characters != NULL) {
2880 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002881 }
2882 nbchar = 0;
2883 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002884 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002885 NEXTL(l);
2886 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002887 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002888
Daniel Veillard68716a72006-10-16 09:32:17 +00002889 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002890 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2891 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002892 NEXT;
2893 }
2894
2895 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2896 if (ctxt->sax->cdataBlock!= NULL) {
2897 /*
2898 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2899 */
2900 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002901 } else if (ctxt->sax->characters != NULL) {
2902 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002903 }
2904 }
2905}
2906
2907
2908/**
2909 * htmlParseCharData:
2910 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002911 *
2912 * parse a CharData section.
2913 * if we are within a CDATA section ']]>' marks an end of section.
2914 *
2915 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2916 */
2917
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002918static void
2919htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002920 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2921 int nbchar = 0;
2922 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002923 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002924
2925 SHRINK;
2926 cur = CUR_CHAR(l);
2927 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002928 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002929 (cur != 0)) {
2930 if (!(IS_CHAR(cur))) {
2931 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2932 "Invalid char in CDATA 0x%X\n", cur);
2933 } else {
2934 COPY_BUF(l,buf,nbchar,cur);
2935 }
Owen Taylor3473f882001-02-23 17:55:21 +00002936 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2937 /*
2938 * Ok the segment is to be consumed as chars.
2939 */
2940 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2941 if (areBlanks(ctxt, buf, nbchar)) {
2942 if (ctxt->sax->ignorableWhitespace != NULL)
2943 ctxt->sax->ignorableWhitespace(ctxt->userData,
2944 buf, nbchar);
2945 } else {
2946 htmlCheckParagraph(ctxt);
2947 if (ctxt->sax->characters != NULL)
2948 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2949 }
2950 }
2951 nbchar = 0;
2952 }
2953 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002954 chunk++;
2955 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2956 chunk = 0;
2957 SHRINK;
2958 GROW;
2959 }
Owen Taylor3473f882001-02-23 17:55:21 +00002960 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002961 if (cur == 0) {
2962 SHRINK;
2963 GROW;
2964 cur = CUR_CHAR(l);
2965 }
Owen Taylor3473f882001-02-23 17:55:21 +00002966 }
2967 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002968 buf[nbchar] = 0;
2969
Owen Taylor3473f882001-02-23 17:55:21 +00002970 /*
2971 * Ok the segment is to be consumed as chars.
2972 */
2973 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2974 if (areBlanks(ctxt, buf, nbchar)) {
2975 if (ctxt->sax->ignorableWhitespace != NULL)
2976 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2977 } else {
2978 htmlCheckParagraph(ctxt);
2979 if (ctxt->sax->characters != NULL)
2980 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2981 }
2982 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002983 } else {
2984 /*
2985 * Loop detection
2986 */
2987 if (cur == 0)
2988 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002989 }
2990}
2991
2992/**
2993 * htmlParseExternalID:
2994 * @ctxt: an HTML parser context
2995 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002996 *
2997 * Parse an External ID or a Public ID
2998 *
Owen Taylor3473f882001-02-23 17:55:21 +00002999 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3000 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3001 *
3002 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3003 *
3004 * Returns the function returns SystemLiteral and in the second
3005 * case publicID receives PubidLiteral, is strict is off
3006 * it is possible to return NULL and have publicID set.
3007 */
3008
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003009static xmlChar *
3010htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00003011 xmlChar *URI = NULL;
3012
3013 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3014 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3015 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3016 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003017 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003018 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3019 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003020 }
3021 SKIP_BLANKS;
3022 URI = htmlParseSystemLiteral(ctxt);
3023 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003024 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3025 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003026 }
3027 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3028 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3029 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3030 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003031 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003032 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3033 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003034 }
3035 SKIP_BLANKS;
3036 *publicID = htmlParsePubidLiteral(ctxt);
3037 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003038 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3039 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3040 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003041 }
3042 SKIP_BLANKS;
3043 if ((CUR == '"') || (CUR == '\'')) {
3044 URI = htmlParseSystemLiteral(ctxt);
3045 }
3046 }
3047 return(URI);
3048}
3049
3050/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003051 * xmlParsePI:
3052 * @ctxt: an XML parser context
3053 *
3054 * parse an XML Processing Instruction.
3055 *
3056 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3057 */
3058static void
3059htmlParsePI(htmlParserCtxtPtr ctxt) {
3060 xmlChar *buf = NULL;
3061 int len = 0;
3062 int size = HTML_PARSER_BUFFER_SIZE;
3063 int cur, l;
3064 const xmlChar *target;
3065 xmlParserInputState state;
3066 int count = 0;
3067
3068 if ((RAW == '<') && (NXT(1) == '?')) {
3069 state = ctxt->instate;
3070 ctxt->instate = XML_PARSER_PI;
3071 /*
3072 * this is a Processing Instruction.
3073 */
3074 SKIP(2);
3075 SHRINK;
3076
3077 /*
3078 * Parse the target name and check for special support like
3079 * namespace.
3080 */
3081 target = htmlParseName(ctxt);
3082 if (target != NULL) {
3083 if (RAW == '>') {
3084 SKIP(1);
3085
3086 /*
3087 * SAX: PI detected.
3088 */
3089 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3090 (ctxt->sax->processingInstruction != NULL))
3091 ctxt->sax->processingInstruction(ctxt->userData,
3092 target, NULL);
3093 ctxt->instate = state;
3094 return;
3095 }
3096 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3097 if (buf == NULL) {
3098 htmlErrMemory(ctxt, NULL);
3099 ctxt->instate = state;
3100 return;
3101 }
3102 cur = CUR;
3103 if (!IS_BLANK(cur)) {
3104 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3105 "ParsePI: PI %s space expected\n", target, NULL);
3106 }
3107 SKIP_BLANKS;
3108 cur = CUR_CHAR(l);
3109 while (IS_CHAR(cur) && (cur != '>')) {
3110 if (len + 5 >= size) {
3111 xmlChar *tmp;
3112
3113 size *= 2;
3114 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3115 if (tmp == NULL) {
3116 htmlErrMemory(ctxt, NULL);
3117 xmlFree(buf);
3118 ctxt->instate = state;
3119 return;
3120 }
3121 buf = tmp;
3122 }
3123 count++;
3124 if (count > 50) {
3125 GROW;
3126 count = 0;
3127 }
3128 COPY_BUF(l,buf,len,cur);
3129 NEXTL(l);
3130 cur = CUR_CHAR(l);
3131 if (cur == 0) {
3132 SHRINK;
3133 GROW;
3134 cur = CUR_CHAR(l);
3135 }
3136 }
3137 buf[len] = 0;
3138 if (cur != '>') {
3139 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3140 "ParsePI: PI %s never end ...\n", target, NULL);
3141 } else {
3142 SKIP(1);
3143
3144 /*
3145 * SAX: PI detected.
3146 */
3147 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3148 (ctxt->sax->processingInstruction != NULL))
3149 ctxt->sax->processingInstruction(ctxt->userData,
3150 target, buf);
3151 }
3152 xmlFree(buf);
3153 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003154 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003155 "PI is not started correctly", NULL, NULL);
3156 }
3157 ctxt->instate = state;
3158 }
3159}
3160
3161/**
Owen Taylor3473f882001-02-23 17:55:21 +00003162 * htmlParseComment:
3163 * @ctxt: an HTML parser context
3164 *
3165 * Parse an XML (SGML) comment <!-- .... -->
3166 *
3167 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3168 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003169static void
Owen Taylor3473f882001-02-23 17:55:21 +00003170htmlParseComment(htmlParserCtxtPtr ctxt) {
3171 xmlChar *buf = NULL;
3172 int len;
3173 int size = HTML_PARSER_BUFFER_SIZE;
3174 int q, ql;
3175 int r, rl;
3176 int cur, l;
3177 xmlParserInputState state;
3178
3179 /*
3180 * Check that there is a comment right here.
3181 */
3182 if ((RAW != '<') || (NXT(1) != '!') ||
3183 (NXT(2) != '-') || (NXT(3) != '-')) return;
3184
3185 state = ctxt->instate;
3186 ctxt->instate = XML_PARSER_COMMENT;
3187 SHRINK;
3188 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003189 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003190 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003191 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003192 ctxt->instate = state;
3193 return;
3194 }
3195 q = CUR_CHAR(ql);
3196 NEXTL(ql);
3197 r = CUR_CHAR(rl);
3198 NEXTL(rl);
3199 cur = CUR_CHAR(l);
3200 len = 0;
3201 while (IS_CHAR(cur) &&
3202 ((cur != '>') ||
3203 (r != '-') || (q != '-'))) {
3204 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003205 xmlChar *tmp;
3206
Owen Taylor3473f882001-02-23 17:55:21 +00003207 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003208 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3209 if (tmp == NULL) {
3210 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003211 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003212 ctxt->instate = state;
3213 return;
3214 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003215 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003216 }
3217 COPY_BUF(ql,buf,len,q);
3218 q = r;
3219 ql = rl;
3220 r = cur;
3221 rl = l;
3222 NEXTL(l);
3223 cur = CUR_CHAR(l);
3224 if (cur == 0) {
3225 SHRINK;
3226 GROW;
3227 cur = CUR_CHAR(l);
3228 }
3229 }
3230 buf[len] = 0;
3231 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003232 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3233 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003234 xmlFree(buf);
3235 } else {
3236 NEXT;
3237 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3238 (!ctxt->disableSAX))
3239 ctxt->sax->comment(ctxt->userData, buf);
3240 xmlFree(buf);
3241 }
3242 ctxt->instate = state;
3243}
3244
3245/**
3246 * htmlParseCharRef:
3247 * @ctxt: an HTML parser context
3248 *
3249 * parse Reference declarations
3250 *
3251 * [66] CharRef ::= '&#' [0-9]+ ';' |
3252 * '&#x' [0-9a-fA-F]+ ';'
3253 *
3254 * Returns the value parsed (as an int)
3255 */
3256int
3257htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3258 int val = 0;
3259
Daniel Veillarda03e3652004-11-02 18:45:30 +00003260 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3261 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3262 "htmlParseCharRef: context error\n",
3263 NULL, NULL);
3264 return(0);
3265 }
Owen Taylor3473f882001-02-23 17:55:21 +00003266 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003267 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003268 SKIP(3);
3269 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003270 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003271 val = val * 16 + (CUR - '0');
3272 else if ((CUR >= 'a') && (CUR <= 'f'))
3273 val = val * 16 + (CUR - 'a') + 10;
3274 else if ((CUR >= 'A') && (CUR <= 'F'))
3275 val = val * 16 + (CUR - 'A') + 10;
3276 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003277 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003278 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003279 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003280 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003281 }
3282 NEXT;
3283 }
3284 if (CUR == ';')
3285 NEXT;
3286 } else if ((CUR == '&') && (NXT(1) == '#')) {
3287 SKIP(2);
3288 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003289 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003290 val = val * 10 + (CUR - '0');
3291 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003292 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003293 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003294 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003295 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003296 }
3297 NEXT;
3298 }
3299 if (CUR == ';')
3300 NEXT;
3301 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003302 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3303 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003304 }
3305 /*
3306 * Check the value IS_CHAR ...
3307 */
3308 if (IS_CHAR(val)) {
3309 return(val);
3310 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003311 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3312 "htmlParseCharRef: invalid xmlChar value %d\n",
3313 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003314 }
3315 return(0);
3316}
3317
3318
3319/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003320 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003321 * @ctxt: an HTML parser context
3322 *
3323 * parse a DOCTYPE declaration
3324 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003325 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003326 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3327 */
3328
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003329static void
Owen Taylor3473f882001-02-23 17:55:21 +00003330htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003331 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003332 xmlChar *ExternalID = NULL;
3333 xmlChar *URI = NULL;
3334
3335 /*
3336 * We know that '<!DOCTYPE' has been detected.
3337 */
3338 SKIP(9);
3339
3340 SKIP_BLANKS;
3341
3342 /*
3343 * Parse the DOCTYPE name.
3344 */
3345 name = htmlParseName(ctxt);
3346 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003347 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3348 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3349 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003350 }
3351 /*
3352 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3353 */
3354
3355 SKIP_BLANKS;
3356
3357 /*
3358 * Check for SystemID and ExternalID
3359 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003360 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003361 SKIP_BLANKS;
3362
3363 /*
3364 * We should be at the end of the DOCTYPE declaration.
3365 */
3366 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003367 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3368 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003369 /* We shouldn't try to resynchronize ... */
3370 }
3371 NEXT;
3372
3373 /*
3374 * Create or update the document accordingly to the DOCTYPE
3375 */
3376 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3377 (!ctxt->disableSAX))
3378 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3379
3380 /*
3381 * Cleanup, since we don't use all those identifiers
3382 */
3383 if (URI != NULL) xmlFree(URI);
3384 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003385}
3386
3387/**
3388 * htmlParseAttribute:
3389 * @ctxt: an HTML parser context
3390 * @value: a xmlChar ** used to store the value of the attribute
3391 *
3392 * parse an attribute
3393 *
3394 * [41] Attribute ::= Name Eq AttValue
3395 *
3396 * [25] Eq ::= S? '=' S?
3397 *
3398 * With namespace:
3399 *
3400 * [NS 11] Attribute ::= QName Eq AttValue
3401 *
3402 * Also the case QName == xmlns:??? is handled independently as a namespace
3403 * definition.
3404 *
3405 * Returns the attribute name, and the value in *value.
3406 */
3407
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003408static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003409htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003410 const xmlChar *name;
3411 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003412
3413 *value = NULL;
3414 name = htmlParseHTMLName(ctxt);
3415 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003416 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3417 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003418 return(NULL);
3419 }
3420
3421 /*
3422 * read the value
3423 */
3424 SKIP_BLANKS;
3425 if (CUR == '=') {
3426 NEXT;
3427 SKIP_BLANKS;
3428 val = htmlParseAttValue(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003429 }
3430
3431 *value = val;
3432 return(name);
3433}
3434
3435/**
3436 * htmlCheckEncoding:
3437 * @ctxt: an HTML parser context
3438 * @attvalue: the attribute value
3439 *
3440 * Checks an http-equiv attribute from a Meta tag to detect
3441 * the encoding
3442 * If a new encoding is detected the parser is switched to decode
3443 * it and pass UTF8
3444 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003445static void
Owen Taylor3473f882001-02-23 17:55:21 +00003446htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3447 const xmlChar *encoding;
3448
3449 if ((ctxt == NULL) || (attvalue == NULL))
3450 return;
3451
Daniel Veillarde77db162009-08-22 11:32:38 +02003452 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003453 if (ctxt->input->encoding != NULL)
3454 return;
3455
3456 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3457 if (encoding != NULL) {
3458 encoding += 8;
3459 } else {
3460 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3461 if (encoding != NULL)
3462 encoding += 9;
3463 }
3464 if (encoding != NULL) {
3465 xmlCharEncoding enc;
3466 xmlCharEncodingHandlerPtr handler;
3467
3468 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3469
3470 if (ctxt->input->encoding != NULL)
3471 xmlFree((xmlChar *) ctxt->input->encoding);
3472 ctxt->input->encoding = xmlStrdup(encoding);
3473
3474 enc = xmlParseCharEncoding((const char *) encoding);
3475 /*
3476 * registered set of known encodings
3477 */
3478 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003479 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003480 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3481 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3482 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3483 (ctxt->input->buf != NULL) &&
3484 (ctxt->input->buf->encoder == NULL)) {
3485 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3486 "htmlCheckEncoding: wrong encoding meta\n",
3487 NULL, NULL);
3488 } else {
3489 xmlSwitchEncoding(ctxt, enc);
3490 }
Owen Taylor3473f882001-02-23 17:55:21 +00003491 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3492 } else {
3493 /*
3494 * fallback for unknown encodings
3495 */
3496 handler = xmlFindCharEncodingHandler((const char *) encoding);
3497 if (handler != NULL) {
3498 xmlSwitchToEncoding(ctxt, handler);
3499 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3500 } else {
3501 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3502 }
3503 }
3504
3505 if ((ctxt->input->buf != NULL) &&
3506 (ctxt->input->buf->encoder != NULL) &&
3507 (ctxt->input->buf->raw != NULL) &&
3508 (ctxt->input->buf->buffer != NULL)) {
3509 int nbchars;
3510 int processed;
3511
3512 /*
3513 * convert as much as possible to the parser reading buffer.
3514 */
3515 processed = ctxt->input->cur - ctxt->input->base;
3516 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3517 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3518 ctxt->input->buf->buffer,
3519 ctxt->input->buf->raw);
3520 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003521 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3522 "htmlCheckEncoding: encoder error\n",
3523 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003524 }
3525 ctxt->input->base =
3526 ctxt->input->cur = ctxt->input->buf->buffer->content;
Eugene Pimenov1e60fbc2010-03-10 18:10:49 +01003527 ctxt->input->end =
3528 &ctxt->input->base[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00003529 }
3530 }
3531}
3532
3533/**
3534 * htmlCheckMeta:
3535 * @ctxt: an HTML parser context
3536 * @atts: the attributes values
3537 *
3538 * Checks an attributes from a Meta tag
3539 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003540static void
Owen Taylor3473f882001-02-23 17:55:21 +00003541htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3542 int i;
3543 const xmlChar *att, *value;
3544 int http = 0;
3545 const xmlChar *content = NULL;
3546
3547 if ((ctxt == NULL) || (atts == NULL))
3548 return;
3549
3550 i = 0;
3551 att = atts[i++];
3552 while (att != NULL) {
3553 value = atts[i++];
3554 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3555 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3556 http = 1;
3557 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3558 content = value;
3559 att = atts[i++];
3560 }
3561 if ((http) && (content != NULL))
3562 htmlCheckEncoding(ctxt, content);
3563
3564}
3565
3566/**
3567 * htmlParseStartTag:
3568 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003569 *
Owen Taylor3473f882001-02-23 17:55:21 +00003570 * parse a start of tag either for rule element or
3571 * EmptyElement. In both case we don't parse the tag closing chars.
3572 *
3573 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3574 *
3575 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3576 *
3577 * With namespace:
3578 *
3579 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3580 *
3581 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3582 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003583 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003584 */
3585
Daniel Veillard597f1c12005-07-03 23:00:18 +00003586static int
Owen Taylor3473f882001-02-23 17:55:21 +00003587htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003588 const xmlChar *name;
3589 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003590 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003591 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003592 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003593 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003594 int meta = 0;
3595 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003596 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003597
Daniel Veillarde77db162009-08-22 11:32:38 +02003598 if (ctxt->instate == XML_PARSER_EOF)
3599 return(-1);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003600 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3601 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3602 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003603 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003604 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003605 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003606 NEXT;
3607
Daniel Veillard30e76072006-03-09 14:13:55 +00003608 atts = ctxt->atts;
3609 maxatts = ctxt->maxatts;
3610
Owen Taylor3473f882001-02-23 17:55:21 +00003611 GROW;
3612 name = htmlParseHTMLName(ctxt);
3613 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003614 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3615 "htmlParseStartTag: invalid element name\n",
3616 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003617 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003618 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3619 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003620 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003621 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003622 }
3623 if (xmlStrEqual(name, BAD_CAST"meta"))
3624 meta = 1;
3625
3626 /*
3627 * Check for auto-closure of HTML elements.
3628 */
3629 htmlAutoClose(ctxt, name);
3630
3631 /*
3632 * Check for implied HTML elements.
3633 */
3634 htmlCheckImplied(ctxt, name);
3635
3636 /*
3637 * Avoid html at any level > 0, head at any level != 1
3638 * or any attempt to recurse body
3639 */
3640 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003641 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3642 "htmlParseStartTag: misplaced <html> tag\n",
3643 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003644 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003645 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003646 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003647 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003648 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003649 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3650 "htmlParseStartTag: misplaced <head> tag\n",
3651 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003652 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003653 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003654 }
3655 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003656 int indx;
3657 for (indx = 0;indx < ctxt->nameNr;indx++) {
3658 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003659 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3660 "htmlParseStartTag: misplaced <body> tag\n",
3661 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003662 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003663 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003664 }
3665 }
3666 }
3667
3668 /*
3669 * Now parse the attributes, it ends up with the ending
3670 *
3671 * (S Attribute)* S?
3672 */
3673 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003674 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003675 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003676 ((CUR != '/') || (NXT(1) != '>'))) {
3677 long cons = ctxt->nbChars;
3678
3679 GROW;
3680 attname = htmlParseAttribute(ctxt, &attvalue);
3681 if (attname != NULL) {
3682
3683 /*
3684 * Well formedness requires at most one declaration of an attribute
3685 */
3686 for (i = 0; i < nbatts;i += 2) {
3687 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003688 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3689 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003690 if (attvalue != NULL)
3691 xmlFree(attvalue);
3692 goto failed;
3693 }
3694 }
3695
3696 /*
3697 * Add the pair to atts
3698 */
3699 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003700 maxatts = 22; /* allow for 10 attrs by default */
3701 atts = (const xmlChar **)
3702 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003703 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003704 htmlErrMemory(ctxt, NULL);
3705 if (attvalue != NULL)
3706 xmlFree(attvalue);
3707 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003708 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003709 ctxt->atts = atts;
3710 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003711 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003712 const xmlChar **n;
3713
Owen Taylor3473f882001-02-23 17:55:21 +00003714 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003715 n = (const xmlChar **) xmlRealloc((void *) atts,
3716 maxatts * sizeof(const xmlChar *));
3717 if (n == NULL) {
3718 htmlErrMemory(ctxt, NULL);
3719 if (attvalue != NULL)
3720 xmlFree(attvalue);
3721 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003722 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003723 atts = n;
3724 ctxt->atts = atts;
3725 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003726 }
3727 atts[nbatts++] = attname;
3728 atts[nbatts++] = attvalue;
3729 atts[nbatts] = NULL;
3730 atts[nbatts + 1] = NULL;
3731 }
3732 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003733 if (attvalue != NULL)
3734 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003735 /* Dump the bogus attribute string up to the next blank or
3736 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003737 while ((IS_CHAR_CH(CUR)) &&
3738 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003739 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003740 NEXT;
3741 }
3742
3743failed:
3744 SKIP_BLANKS;
3745 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003746 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3747 "htmlParseStartTag: problem parsing attributes\n",
3748 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003749 break;
3750 }
3751 }
3752
3753 /*
3754 * Handle specific association to the META tag
3755 */
William M. Bracke978ae22007-03-21 06:16:02 +00003756 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003757 htmlCheckMeta(ctxt, atts);
3758
3759 /*
3760 * SAX: Start of Element !
3761 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003762 if (!discardtag) {
3763 htmlnamePush(ctxt, name);
3764 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3765 if (nbatts != 0)
3766 ctxt->sax->startElement(ctxt->userData, name, atts);
3767 else
3768 ctxt->sax->startElement(ctxt->userData, name, NULL);
3769 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003770 }
Owen Taylor3473f882001-02-23 17:55:21 +00003771
3772 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003773 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003774 if (atts[i] != NULL)
3775 xmlFree((xmlChar *) atts[i]);
3776 }
Owen Taylor3473f882001-02-23 17:55:21 +00003777 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003778
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003779 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003780}
3781
3782/**
3783 * htmlParseEndTag:
3784 * @ctxt: an HTML parser context
3785 *
3786 * parse an end of tag
3787 *
3788 * [42] ETag ::= '</' Name S? '>'
3789 *
3790 * With namespace
3791 *
3792 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003793 *
3794 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003795 */
3796
Daniel Veillardf420ac52001-07-04 16:04:09 +00003797static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003798htmlParseEndTag(htmlParserCtxtPtr ctxt)
3799{
3800 const xmlChar *name;
3801 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003802 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003803
3804 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003805 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3806 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003807 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003808 }
3809 SKIP(2);
3810
3811 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003812 if (name == NULL)
3813 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003814 /*
3815 * We should definitely be at the ending "S? '>'" part
3816 */
3817 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003818 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003819 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3820 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003821 if (ctxt->recovery) {
3822 /*
3823 * We're not at the ending > !!
3824 * Error, unless in recover mode where we search forwards
3825 * until we find a >
3826 */
3827 while (CUR != '\0' && CUR != '>') NEXT;
3828 NEXT;
3829 }
Owen Taylor3473f882001-02-23 17:55:21 +00003830 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003831 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003832
3833 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003834 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3835 * out now.
3836 */
3837 if ((ctxt->depth > 0) &&
3838 (xmlStrEqual(name, BAD_CAST "html") ||
3839 xmlStrEqual(name, BAD_CAST "body") ||
3840 xmlStrEqual(name, BAD_CAST "head"))) {
3841 ctxt->depth--;
3842 return (0);
3843 }
3844
3845 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003846 * If the name read is not one of the element in the parsing stack
3847 * then return, it's just an error.
3848 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003849 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3850 if (xmlStrEqual(name, ctxt->nameTab[i]))
3851 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003852 }
3853 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003854 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3855 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003856 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003857 }
3858
3859
3860 /*
3861 * Check for auto-closure of HTML elements.
3862 */
3863
3864 htmlAutoCloseOnClose(ctxt, name);
3865
3866 /*
3867 * Well formedness constraints, opening and closing must match.
3868 * With the exception that the autoclose may have popped stuff out
3869 * of the stack.
3870 */
3871 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003872 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003873 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3874 "Opening and ending tag mismatch: %s and %s\n",
3875 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003876 }
3877 }
3878
3879 /*
3880 * SAX: End of Tag
3881 */
3882 oldname = ctxt->name;
3883 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003884 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3885 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003886 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003887 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003888 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003889 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003890 }
3891
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003892 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003893}
3894
3895
3896/**
3897 * htmlParseReference:
3898 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003899 *
Owen Taylor3473f882001-02-23 17:55:21 +00003900 * parse and handle entity references in content,
3901 * this will end-up in a call to character() since this is either a
3902 * CharRef, or a predefined entity.
3903 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003904static void
Owen Taylor3473f882001-02-23 17:55:21 +00003905htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003906 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003907 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003908 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003909 if (CUR != '&') return;
3910
3911 if (NXT(1) == '#') {
3912 unsigned int c;
3913 int bits, i = 0;
3914
3915 c = htmlParseCharRef(ctxt);
3916 if (c == 0)
3917 return;
3918
3919 if (c < 0x80) { out[i++]= c; bits= -6; }
3920 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3921 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3922 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003923
Owen Taylor3473f882001-02-23 17:55:21 +00003924 for ( ; bits >= 0; bits-= 6) {
3925 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3926 }
3927 out[i] = 0;
3928
3929 htmlCheckParagraph(ctxt);
3930 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3931 ctxt->sax->characters(ctxt->userData, out, i);
3932 } else {
3933 ent = htmlParseEntityRef(ctxt, &name);
3934 if (name == NULL) {
3935 htmlCheckParagraph(ctxt);
3936 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3937 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3938 return;
3939 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003940 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003941 htmlCheckParagraph(ctxt);
3942 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3943 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3944 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3945 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3946 }
3947 } else {
3948 unsigned int c;
3949 int bits, i = 0;
3950
3951 c = ent->value;
3952 if (c < 0x80)
3953 { out[i++]= c; bits= -6; }
3954 else if (c < 0x800)
3955 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3956 else if (c < 0x10000)
3957 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003958 else
Owen Taylor3473f882001-02-23 17:55:21 +00003959 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003960
Owen Taylor3473f882001-02-23 17:55:21 +00003961 for ( ; bits >= 0; bits-= 6) {
3962 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3963 }
3964 out[i] = 0;
3965
3966 htmlCheckParagraph(ctxt);
3967 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3968 ctxt->sax->characters(ctxt->userData, out, i);
3969 }
Owen Taylor3473f882001-02-23 17:55:21 +00003970 }
3971}
3972
3973/**
3974 * htmlParseContent:
3975 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00003976 *
3977 * Parse a content: comment, sub-element, reference or text.
Eugene Pimenov615904f2010-03-15 15:16:02 +01003978 * Kept for compatibility with old code
Owen Taylor3473f882001-02-23 17:55:21 +00003979 */
3980
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003981static void
Owen Taylor3473f882001-02-23 17:55:21 +00003982htmlParseContent(htmlParserCtxtPtr ctxt) {
3983 xmlChar *currentNode;
3984 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00003985 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003986
3987 currentNode = xmlStrdup(ctxt->name);
3988 depth = ctxt->nameNr;
3989 while (1) {
3990 long cons = ctxt->nbChars;
3991
3992 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02003993
3994 if (ctxt->instate == XML_PARSER_EOF)
3995 break;
3996
Owen Taylor3473f882001-02-23 17:55:21 +00003997 /*
3998 * Our tag or one of it's parent or children is ending.
3999 */
4000 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00004001 if (htmlParseEndTag(ctxt) &&
4002 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4003 if (currentNode != NULL)
4004 xmlFree(currentNode);
4005 return;
4006 }
4007 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00004008 }
4009
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004010 else if ((CUR == '<') &&
4011 ((IS_ASCII_LETTER(NXT(1))) ||
4012 (NXT(1) == '_') || (NXT(1) == ':'))) {
4013 name = htmlParseHTMLName_nonInvasive(ctxt);
4014 if (name == NULL) {
4015 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4016 "htmlParseStartTag: invalid element name\n",
4017 NULL, NULL);
4018 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02004019 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004020 NEXT;
4021
4022 if (currentNode != NULL)
4023 xmlFree(currentNode);
4024 return;
4025 }
4026
4027 if (ctxt->name != NULL) {
4028 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4029 htmlAutoClose(ctxt, name);
4030 continue;
4031 }
Daniel Veillarde77db162009-08-22 11:32:38 +02004032 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004033 }
4034
Owen Taylor3473f882001-02-23 17:55:21 +00004035 /*
4036 * Has this node been popped out during parsing of
4037 * the next element
4038 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00004039 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4040 (!xmlStrEqual(currentNode, ctxt->name)))
4041 {
Owen Taylor3473f882001-02-23 17:55:21 +00004042 if (currentNode != NULL) xmlFree(currentNode);
4043 return;
4044 }
4045
Daniel Veillardf9533d12001-03-03 10:04:57 +00004046 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4047 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004048 /*
4049 * Handle SCRIPT/STYLE separately
4050 */
4051 htmlParseScript(ctxt);
4052 } else {
4053 /*
4054 * Sometimes DOCTYPE arrives in the middle of the document
4055 */
4056 if ((CUR == '<') && (NXT(1) == '!') &&
4057 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4058 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4059 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4060 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004061 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4062 "Misplaced DOCTYPE declaration\n",
4063 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004064 htmlParseDocTypeDecl(ctxt);
4065 }
4066
4067 /*
4068 * First case : a comment
4069 */
4070 if ((CUR == '<') && (NXT(1) == '!') &&
4071 (NXT(2) == '-') && (NXT(3) == '-')) {
4072 htmlParseComment(ctxt);
4073 }
4074
4075 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004076 * Second case : a Processing Instruction.
4077 */
4078 else if ((CUR == '<') && (NXT(1) == '?')) {
4079 htmlParsePI(ctxt);
4080 }
4081
4082 /*
4083 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004084 */
4085 else if (CUR == '<') {
4086 htmlParseElement(ctxt);
4087 }
4088
4089 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004090 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004091 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004092 */
4093 else if (CUR == '&') {
4094 htmlParseReference(ctxt);
4095 }
4096
4097 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004098 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004099 */
4100 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004101 htmlAutoCloseOnEnd(ctxt);
4102 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004103 }
4104
4105 /*
4106 * Last case, text. Note that References are handled directly.
4107 */
4108 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004109 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004110 }
4111
4112 if (cons == ctxt->nbChars) {
4113 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004114 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4115 "detected an error in element content\n",
4116 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004117 }
4118 break;
4119 }
4120 }
4121 GROW;
4122 }
4123 if (currentNode != NULL) xmlFree(currentNode);
4124}
4125
4126/**
4127 * htmlParseElement:
4128 * @ctxt: an HTML parser context
4129 *
4130 * parse an HTML element, this is highly recursive
Eugene Pimenov615904f2010-03-15 15:16:02 +01004131 * this is kept for compatibility with previous code versions
Owen Taylor3473f882001-02-23 17:55:21 +00004132 *
4133 * [39] element ::= EmptyElemTag | STag content ETag
4134 *
4135 * [41] Attribute ::= Name Eq AttValue
4136 */
4137
4138void
4139htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004140 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004141 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004142 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004143 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004144 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004145 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004146 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004147
Daniel Veillarda03e3652004-11-02 18:45:30 +00004148 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4149 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004150 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004151 return;
4152 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004153
4154 if (ctxt->instate == XML_PARSER_EOF)
4155 return;
4156
Owen Taylor3473f882001-02-23 17:55:21 +00004157 /* Capture start position */
4158 if (ctxt->record_info) {
4159 node_info.begin_pos = ctxt->input->consumed +
4160 (CUR_PTR - ctxt->input->base);
4161 node_info.begin_line = ctxt->input->line;
4162 }
4163
Daniel Veillard597f1c12005-07-03 23:00:18 +00004164 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004165 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004166 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004167 if (CUR == '>')
4168 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004169 return;
4170 }
Owen Taylor3473f882001-02-23 17:55:21 +00004171
4172 /*
4173 * Lookup the info for that element.
4174 */
4175 info = htmlTagLookup(name);
4176 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004177 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4178 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004179 }
4180
4181 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004182 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004183 */
4184 if ((CUR == '/') && (NXT(1) == '>')) {
4185 SKIP(2);
4186 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4187 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004188 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004189 return;
4190 }
4191
4192 if (CUR == '>') {
4193 NEXT;
4194 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004195 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4196 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004197
4198 /*
4199 * end of parsing of this node.
4200 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004201 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004202 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004203 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004204 }
Owen Taylor3473f882001-02-23 17:55:21 +00004205
4206 /*
4207 * Capture end position and add node
4208 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004209 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004210 node_info.end_pos = ctxt->input->consumed +
4211 (CUR_PTR - ctxt->input->base);
4212 node_info.end_line = ctxt->input->line;
4213 node_info.node = ctxt->node;
4214 xmlParserAddNodeInfo(ctxt, &node_info);
4215 }
4216 return;
4217 }
4218
4219 /*
4220 * Check for an Empty Element from DTD definition
4221 */
4222 if ((info != NULL) && (info->empty)) {
4223 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4224 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004225 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004226 return;
4227 }
4228
4229 /*
4230 * Parse the content of the element:
4231 */
4232 currentNode = xmlStrdup(ctxt->name);
4233 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004234 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004235 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004236 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004237 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004238 if (ctxt->nameNr < depth) break;
4239 }
Owen Taylor3473f882001-02-23 17:55:21 +00004240
Owen Taylor3473f882001-02-23 17:55:21 +00004241 /*
4242 * Capture end position and add node
4243 */
4244 if ( currentNode != NULL && ctxt->record_info ) {
4245 node_info.end_pos = ctxt->input->consumed +
4246 (CUR_PTR - ctxt->input->base);
4247 node_info.end_line = ctxt->input->line;
4248 node_info.node = ctxt->node;
4249 xmlParserAddNodeInfo(ctxt, &node_info);
4250 }
William M. Brack76e95df2003-10-18 16:20:14 +00004251 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004252 htmlAutoCloseOnEnd(ctxt);
4253 }
4254
Owen Taylor3473f882001-02-23 17:55:21 +00004255 if (currentNode != NULL)
4256 xmlFree(currentNode);
4257}
4258
Eugene Pimenov615904f2010-03-15 15:16:02 +01004259static void
4260htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4261 /*
4262 * Capture end position and add node
4263 */
4264 if ( ctxt->node != NULL && ctxt->record_info ) {
4265 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4266 (CUR_PTR - ctxt->input->base);
4267 ctxt->nodeInfo->end_line = ctxt->input->line;
4268 ctxt->nodeInfo->node = ctxt->node;
4269 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4270 htmlNodeInfoPop(ctxt);
4271 }
4272 if (!IS_CHAR_CH(CUR)) {
4273 htmlAutoCloseOnEnd(ctxt);
4274 }
4275}
4276
4277/**
4278 * htmlParseElementInternal:
4279 * @ctxt: an HTML parser context
4280 *
4281 * parse an HTML element, new version, non recursive
4282 *
4283 * [39] element ::= EmptyElemTag | STag content ETag
4284 *
4285 * [41] Attribute ::= Name Eq AttValue
4286 */
4287
4288static void
4289htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4290 const xmlChar *name;
4291 const htmlElemDesc * info;
4292 htmlParserNodeInfo node_info;
4293 int failed;
Eugene Pimenov615904f2010-03-15 15:16:02 +01004294
4295 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4296 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4297 "htmlParseElementInternal: context error\n", NULL, NULL);
4298 return;
4299 }
4300
4301 if (ctxt->instate == XML_PARSER_EOF)
4302 return;
4303
4304 /* Capture start position */
4305 if (ctxt->record_info) {
4306 node_info.begin_pos = ctxt->input->consumed +
4307 (CUR_PTR - ctxt->input->base);
4308 node_info.begin_line = ctxt->input->line;
4309 }
4310
4311 failed = htmlParseStartTag(ctxt);
4312 name = ctxt->name;
4313 if ((failed == -1) || (name == NULL)) {
4314 if (CUR == '>')
4315 NEXT;
4316 return;
4317 }
4318
4319 /*
4320 * Lookup the info for that element.
4321 */
4322 info = htmlTagLookup(name);
4323 if (info == NULL) {
4324 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4325 "Tag %s invalid\n", name, NULL);
4326 }
4327
4328 /*
4329 * Check for an Empty Element labeled the XML/SGML way
4330 */
4331 if ((CUR == '/') && (NXT(1) == '>')) {
4332 SKIP(2);
4333 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4334 ctxt->sax->endElement(ctxt->userData, name);
4335 htmlnamePop(ctxt);
4336 return;
4337 }
4338
4339 if (CUR == '>') {
4340 NEXT;
4341 } else {
4342 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4343 "Couldn't find end of Start Tag %s\n", name, NULL);
4344
4345 /*
4346 * end of parsing of this node.
4347 */
4348 if (xmlStrEqual(name, ctxt->name)) {
4349 nodePop(ctxt);
4350 htmlnamePop(ctxt);
4351 }
4352
4353 if (ctxt->record_info)
4354 htmlNodeInfoPush(ctxt, &node_info);
4355 htmlParserFinishElementParsing(ctxt);
4356 return;
4357 }
4358
4359 /*
4360 * Check for an Empty Element from DTD definition
4361 */
4362 if ((info != NULL) && (info->empty)) {
4363 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4364 ctxt->sax->endElement(ctxt->userData, name);
4365 htmlnamePop(ctxt);
4366 return;
4367 }
4368
4369 if (ctxt->record_info)
4370 htmlNodeInfoPush(ctxt, &node_info);
4371}
4372
4373/**
4374 * htmlParseContentInternal:
4375 * @ctxt: an HTML parser context
4376 *
4377 * Parse a content: comment, sub-element, reference or text.
4378 * New version for non recursive htmlParseElementInternal
4379 */
4380
4381static void
4382htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4383 xmlChar *currentNode;
4384 int depth;
4385 const xmlChar *name;
4386
4387 currentNode = xmlStrdup(ctxt->name);
4388 depth = ctxt->nameNr;
4389 while (1) {
4390 long cons = ctxt->nbChars;
4391
4392 GROW;
4393
4394 if (ctxt->instate == XML_PARSER_EOF)
4395 break;
4396
4397 /*
4398 * Our tag or one of it's parent or children is ending.
4399 */
4400 if ((CUR == '<') && (NXT(1) == '/')) {
4401 if (htmlParseEndTag(ctxt) &&
4402 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4403 if (currentNode != NULL)
4404 xmlFree(currentNode);
4405
4406 currentNode = xmlStrdup(ctxt->name);
4407 depth = ctxt->nameNr;
4408 }
4409 continue; /* while */
4410 }
4411
4412 else if ((CUR == '<') &&
4413 ((IS_ASCII_LETTER(NXT(1))) ||
4414 (NXT(1) == '_') || (NXT(1) == ':'))) {
4415 name = htmlParseHTMLName_nonInvasive(ctxt);
4416 if (name == NULL) {
4417 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4418 "htmlParseStartTag: invalid element name\n",
4419 NULL, NULL);
4420 /* Dump the bogus tag like browsers do */
4421 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4422 NEXT;
4423
4424 htmlParserFinishElementParsing(ctxt);
4425 if (currentNode != NULL)
4426 xmlFree(currentNode);
4427
4428 currentNode = xmlStrdup(ctxt->name);
4429 depth = ctxt->nameNr;
4430 continue;
4431 }
4432
4433 if (ctxt->name != NULL) {
4434 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4435 htmlAutoClose(ctxt, name);
4436 continue;
4437 }
4438 }
4439 }
4440
4441 /*
4442 * Has this node been popped out during parsing of
4443 * the next element
4444 */
4445 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4446 (!xmlStrEqual(currentNode, ctxt->name)))
4447 {
4448 htmlParserFinishElementParsing(ctxt);
4449 if (currentNode != NULL) xmlFree(currentNode);
4450
4451 currentNode = xmlStrdup(ctxt->name);
4452 depth = ctxt->nameNr;
4453 continue;
4454 }
4455
4456 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4457 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4458 /*
4459 * Handle SCRIPT/STYLE separately
4460 */
4461 htmlParseScript(ctxt);
4462 } else {
4463 /*
4464 * Sometimes DOCTYPE arrives in the middle of the document
4465 */
4466 if ((CUR == '<') && (NXT(1) == '!') &&
4467 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4468 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4469 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4470 (UPP(8) == 'E')) {
4471 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4472 "Misplaced DOCTYPE declaration\n",
4473 BAD_CAST "DOCTYPE" , NULL);
4474 htmlParseDocTypeDecl(ctxt);
4475 }
4476
4477 /*
4478 * First case : a comment
4479 */
4480 if ((CUR == '<') && (NXT(1) == '!') &&
4481 (NXT(2) == '-') && (NXT(3) == '-')) {
4482 htmlParseComment(ctxt);
4483 }
4484
4485 /*
4486 * Second case : a Processing Instruction.
4487 */
4488 else if ((CUR == '<') && (NXT(1) == '?')) {
4489 htmlParsePI(ctxt);
4490 }
4491
4492 /*
4493 * Third case : a sub-element.
4494 */
4495 else if (CUR == '<') {
4496 htmlParseElementInternal(ctxt);
4497 if (currentNode != NULL) xmlFree(currentNode);
4498
4499 currentNode = xmlStrdup(ctxt->name);
4500 depth = ctxt->nameNr;
4501 }
4502
4503 /*
4504 * Fourth case : a reference. If if has not been resolved,
4505 * parsing returns it's Name, create the node
4506 */
4507 else if (CUR == '&') {
4508 htmlParseReference(ctxt);
4509 }
4510
4511 /*
4512 * Fifth case : end of the resource
4513 */
4514 else if (CUR == 0) {
4515 htmlAutoCloseOnEnd(ctxt);
4516 break;
4517 }
4518
4519 /*
4520 * Last case, text. Note that References are handled directly.
4521 */
4522 else {
4523 htmlParseCharData(ctxt);
4524 }
4525
4526 if (cons == ctxt->nbChars) {
4527 if (ctxt->node != NULL) {
4528 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4529 "detected an error in element content\n",
4530 NULL, NULL);
4531 }
4532 break;
4533 }
4534 }
4535 GROW;
4536 }
4537 if (currentNode != NULL) xmlFree(currentNode);
4538}
4539
4540/**
4541 * htmlParseContent:
4542 * @ctxt: an HTML parser context
4543 *
4544 * Parse a content: comment, sub-element, reference or text.
4545 * This is the entry point when called from parser.c
4546 */
4547
4548void
4549__htmlParseContent(void *ctxt) {
4550 if (ctxt != NULL)
4551 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4552}
4553
Owen Taylor3473f882001-02-23 17:55:21 +00004554/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004555 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004556 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004557 *
Owen Taylor3473f882001-02-23 17:55:21 +00004558 * parse an HTML document (and build a tree if using the standard SAX
4559 * interface).
4560 *
4561 * Returns 0, -1 in case of error. the parser context is augmented
4562 * as a result of the parsing.
4563 */
4564
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004565int
Owen Taylor3473f882001-02-23 17:55:21 +00004566htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004567 xmlChar start[4];
4568 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004569 xmlDtdPtr dtd;
4570
Daniel Veillardd0463562001-10-13 09:15:48 +00004571 xmlInitParser();
4572
Owen Taylor3473f882001-02-23 17:55:21 +00004573 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004574
Daniel Veillarda03e3652004-11-02 18:45:30 +00004575 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4576 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4577 "htmlParseDocument: context error\n", NULL, NULL);
4578 return(XML_ERR_INTERNAL_ERROR);
4579 }
4580 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004581 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004582 GROW;
4583 /*
4584 * SAX: beginning of the document processing.
4585 */
4586 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4587 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4588
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004589 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4590 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4591 /*
4592 * Get the 4 first bytes and decode the charset
4593 * if enc != XML_CHAR_ENCODING_NONE
4594 * plug some encoding conversion routines.
4595 */
4596 start[0] = RAW;
4597 start[1] = NXT(1);
4598 start[2] = NXT(2);
4599 start[3] = NXT(3);
4600 enc = xmlDetectCharEncoding(&start[0], 4);
4601 if (enc != XML_CHAR_ENCODING_NONE) {
4602 xmlSwitchEncoding(ctxt, enc);
4603 }
4604 }
4605
Owen Taylor3473f882001-02-23 17:55:21 +00004606 /*
4607 * Wipe out everything which is before the first '<'
4608 */
4609 SKIP_BLANKS;
4610 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004611 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004612 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004613 }
4614
4615 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4616 ctxt->sax->startDocument(ctxt->userData);
4617
4618
4619 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004620 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004621 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004622 while (((CUR == '<') && (NXT(1) == '!') &&
4623 (NXT(2) == '-') && (NXT(3) == '-')) ||
4624 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004625 htmlParseComment(ctxt);
4626 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004627 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004628 }
Owen Taylor3473f882001-02-23 17:55:21 +00004629
4630
4631 /*
4632 * Then possibly doc type declaration(s) and more Misc
4633 * (doctypedecl Misc*)?
4634 */
4635 if ((CUR == '<') && (NXT(1) == '!') &&
4636 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4637 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4638 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4639 (UPP(8) == 'E')) {
4640 htmlParseDocTypeDecl(ctxt);
4641 }
4642 SKIP_BLANKS;
4643
4644 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004645 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004646 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004647 while (((CUR == '<') && (NXT(1) == '!') &&
4648 (NXT(2) == '-') && (NXT(3) == '-')) ||
4649 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004650 htmlParseComment(ctxt);
4651 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004652 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004653 }
Owen Taylor3473f882001-02-23 17:55:21 +00004654
4655 /*
4656 * Time to start parsing the tree itself
4657 */
Eugene Pimenov615904f2010-03-15 15:16:02 +01004658 htmlParseContentInternal(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004659
4660 /*
4661 * autoclose
4662 */
4663 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004664 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004665
4666
4667 /*
4668 * SAX: end of the document processing.
4669 */
4670 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4671 ctxt->sax->endDocument(ctxt->userData);
4672
Daniel Veillardf1121c42010-07-26 14:02:42 +02004673 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004674 dtd = xmlGetIntSubset(ctxt->myDoc);
4675 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004676 ctxt->myDoc->intSubset =
4677 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004678 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4679 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4680 }
4681 if (! ctxt->wellFormed) return(-1);
4682 return(0);
4683}
4684
4685
4686/************************************************************************
4687 * *
4688 * Parser contexts handling *
4689 * *
4690 ************************************************************************/
4691
4692/**
William M. Brackedb65a72004-02-06 07:36:04 +00004693 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004694 * @ctxt: an HTML parser context
4695 *
4696 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004697 *
4698 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004699 */
4700
Daniel Veillardf403d292003-10-05 13:51:35 +00004701static int
Owen Taylor3473f882001-02-23 17:55:21 +00004702htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4703{
4704 htmlSAXHandler *sax;
4705
Daniel Veillardf403d292003-10-05 13:51:35 +00004706 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004707 memset(ctxt, 0, sizeof(htmlParserCtxt));
4708
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004709 ctxt->dict = xmlDictCreate();
4710 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004711 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4712 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004713 }
Owen Taylor3473f882001-02-23 17:55:21 +00004714 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4715 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004716 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4717 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004718 }
4719 else
4720 memset(sax, 0, sizeof(htmlSAXHandler));
4721
4722 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004723 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004724 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4725 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004726 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004727 ctxt->inputNr = 0;
4728 ctxt->inputMax = 0;
4729 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004730 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004731 }
4732 ctxt->inputNr = 0;
4733 ctxt->inputMax = 5;
4734 ctxt->input = NULL;
4735 ctxt->version = NULL;
4736 ctxt->encoding = NULL;
4737 ctxt->standalone = -1;
4738 ctxt->instate = XML_PARSER_START;
4739
4740 /* Allocate the Node stack */
4741 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4742 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004743 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004744 ctxt->nodeNr = 0;
4745 ctxt->nodeMax = 0;
4746 ctxt->node = NULL;
4747 ctxt->inputNr = 0;
4748 ctxt->inputMax = 0;
4749 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004750 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004751 }
4752 ctxt->nodeNr = 0;
4753 ctxt->nodeMax = 10;
4754 ctxt->node = NULL;
4755
4756 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004757 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004758 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004759 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004760 ctxt->nameNr = 0;
Eugene Pimenovef9c6362010-03-15 11:37:48 +01004761 ctxt->nameMax = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004762 ctxt->name = NULL;
4763 ctxt->nodeNr = 0;
4764 ctxt->nodeMax = 0;
4765 ctxt->node = NULL;
4766 ctxt->inputNr = 0;
4767 ctxt->inputMax = 0;
4768 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004769 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004770 }
4771 ctxt->nameNr = 0;
4772 ctxt->nameMax = 10;
4773 ctxt->name = NULL;
4774
Eugene Pimenov615904f2010-03-15 15:16:02 +01004775 ctxt->nodeInfoTab = NULL;
4776 ctxt->nodeInfoNr = 0;
4777 ctxt->nodeInfoMax = 0;
4778
Daniel Veillard092643b2003-09-25 14:29:29 +00004779 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004780 else {
4781 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004782 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004783 }
4784 ctxt->userData = ctxt;
4785 ctxt->myDoc = NULL;
4786 ctxt->wellFormed = 1;
4787 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004788 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004789 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004790 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004791 ctxt->vctxt.userData = ctxt;
4792 ctxt->vctxt.error = xmlParserValidityError;
4793 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004794 ctxt->record_info = 0;
4795 ctxt->validate = 0;
4796 ctxt->nbChars = 0;
4797 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004798 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004799 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004800 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004801}
4802
4803/**
4804 * htmlFreeParserCtxt:
4805 * @ctxt: an HTML parser context
4806 *
4807 * Free all the memory used by a parser context. However the parsed
4808 * document in ctxt->myDoc is not freed.
4809 */
4810
4811void
4812htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4813{
4814 xmlFreeParserCtxt(ctxt);
4815}
4816
4817/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004818 * htmlNewParserCtxt:
4819 *
4820 * Allocate and initialize a new parser context.
4821 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004822 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004823 */
4824
Daniel Veillard34c647c2006-09-21 06:53:59 +00004825htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004826htmlNewParserCtxt(void)
4827{
4828 xmlParserCtxtPtr ctxt;
4829
4830 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4831 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004832 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004833 return(NULL);
4834 }
4835 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004836 if (htmlInitParserCtxt(ctxt) < 0) {
4837 htmlFreeParserCtxt(ctxt);
4838 return(NULL);
4839 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004840 return(ctxt);
4841}
4842
4843/**
4844 * htmlCreateMemoryParserCtxt:
4845 * @buffer: a pointer to a char array
4846 * @size: the size of the array
4847 *
4848 * Create a parser context for an HTML in-memory document.
4849 *
4850 * Returns the new parser context or NULL
4851 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004852htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004853htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4854 xmlParserCtxtPtr ctxt;
4855 xmlParserInputPtr input;
4856 xmlParserInputBufferPtr buf;
4857
4858 if (buffer == NULL)
4859 return(NULL);
4860 if (size <= 0)
4861 return(NULL);
4862
4863 ctxt = htmlNewParserCtxt();
4864 if (ctxt == NULL)
4865 return(NULL);
4866
4867 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4868 if (buf == NULL) return(NULL);
4869
4870 input = xmlNewInputStream(ctxt);
4871 if (input == NULL) {
4872 xmlFreeParserCtxt(ctxt);
4873 return(NULL);
4874 }
4875
4876 input->filename = NULL;
4877 input->buf = buf;
4878 input->base = input->buf->buffer->content;
4879 input->cur = input->buf->buffer->content;
4880 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4881
4882 inputPush(ctxt, input);
4883 return(ctxt);
4884}
4885
4886/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004887 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004888 * @cur: a pointer to an array of xmlChar
4889 * @encoding: a free form C string describing the HTML document encoding, or NULL
4890 *
4891 * Create a parser context for an HTML document.
4892 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004893 * TODO: check the need to add encoding handling there
4894 *
Owen Taylor3473f882001-02-23 17:55:21 +00004895 * Returns the new parser context or NULL
4896 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004897static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004898htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004899 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004900 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004901
Daniel Veillard1d995272002-07-22 16:43:32 +00004902 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004903 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004904 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004905 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004906 if (ctxt == NULL)
4907 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004908
4909 if (encoding != NULL) {
4910 xmlCharEncoding enc;
4911 xmlCharEncodingHandlerPtr handler;
4912
4913 if (ctxt->input->encoding != NULL)
4914 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004915 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004916
4917 enc = xmlParseCharEncoding(encoding);
4918 /*
4919 * registered set of known encodings
4920 */
4921 if (enc != XML_CHAR_ENCODING_ERROR) {
4922 xmlSwitchEncoding(ctxt, enc);
4923 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004924 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02004925 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00004926 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004927 }
4928 } else {
4929 /*
4930 * fallback for unknown encodings
4931 */
4932 handler = xmlFindCharEncodingHandler((const char *) encoding);
4933 if (handler != NULL) {
4934 xmlSwitchToEncoding(ctxt, handler);
4935 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004936 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4937 "Unsupported encoding %s\n",
4938 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004939 }
4940 }
4941 }
4942 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004943}
4944
Daniel Veillard73b013f2003-09-30 12:36:01 +00004945#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004946/************************************************************************
4947 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02004948 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00004949 * *
4950 ************************************************************************/
4951
4952/**
4953 * htmlParseLookupSequence:
4954 * @ctxt: an HTML parser context
4955 * @first: the first char to lookup
4956 * @next: the next char to lookup or zero
4957 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004958 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004959 *
4960 * Try to find if a sequence (first, next, third) or just (first next) or
4961 * (first) is available in the input stream.
4962 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4963 * to avoid rescanning sequences of bytes, it DOES change the state of the
4964 * parser, do not use liberally.
4965 * This is basically similar to xmlParseLookupSequence()
4966 *
4967 * Returns the index to the current parsing point if the full sequence
4968 * is available, -1 otherwise.
4969 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004970static int
Owen Taylor3473f882001-02-23 17:55:21 +00004971htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02004972 xmlChar next, xmlChar third, int iscomment,
Daniel Veillardeeb99322009-08-25 14:42:16 +02004973 int ignoreattrval)
4974{
Owen Taylor3473f882001-02-23 17:55:21 +00004975 int base, len;
4976 htmlParserInputPtr in;
4977 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004978 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02004979 int invalue = 0;
4980 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00004981
4982 in = ctxt->input;
Daniel Veillardeeb99322009-08-25 14:42:16 +02004983 if (in == NULL)
4984 return (-1);
4985
Owen Taylor3473f882001-02-23 17:55:21 +00004986 base = in->cur - in->base;
Daniel Veillardeeb99322009-08-25 14:42:16 +02004987 if (base < 0)
4988 return (-1);
4989
Owen Taylor3473f882001-02-23 17:55:21 +00004990 if (ctxt->checkIndex > base)
4991 base = ctxt->checkIndex;
Daniel Veillardeeb99322009-08-25 14:42:16 +02004992
Owen Taylor3473f882001-02-23 17:55:21 +00004993 if (in->buf == NULL) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02004994 buf = in->base;
4995 len = in->length;
Owen Taylor3473f882001-02-23 17:55:21 +00004996 } else {
Daniel Veillardeeb99322009-08-25 14:42:16 +02004997 buf = in->buf->buffer->content;
4998 len = in->buf->buffer->use;
Owen Taylor3473f882001-02-23 17:55:21 +00004999 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02005000
Owen Taylor3473f882001-02-23 17:55:21 +00005001 /* take into account the sequence length */
Daniel Veillardeeb99322009-08-25 14:42:16 +02005002 if (third)
5003 len -= 2;
5004 else if (next)
5005 len--;
5006 for (; base < len; base++) {
5007 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5008 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5009 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5010 incomment = 1;
5011 /* do not increment past <! - some people use <!--> */
5012 base += 2;
5013 }
5014 }
5015 if (ignoreattrval) {
5016 if (buf[base] == '"' || buf[base] == '\'') {
5017 if (invalue) {
5018 if (buf[base] == valdellim) {
5019 invalue = 0;
5020 continue;
5021 }
5022 } else {
5023 valdellim = buf[base];
5024 invalue = 1;
5025 continue;
5026 }
5027 } else if (invalue) {
5028 continue;
5029 }
5030 }
5031 if (incomment) {
5032 if (base + 3 > len)
5033 return (-1);
5034 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5035 (buf[base + 2] == '>')) {
5036 incomment = 0;
5037 base += 2;
5038 }
5039 continue;
5040 }
Owen Taylor3473f882001-02-23 17:55:21 +00005041 if (buf[base] == first) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02005042 if (third != 0) {
5043 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5044 continue;
5045 } else if (next != 0) {
5046 if (buf[base + 1] != next)
5047 continue;
5048 }
5049 ctxt->checkIndex = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00005050#ifdef DEBUG_PUSH
Daniel Veillardeeb99322009-08-25 14:42:16 +02005051 if (next == 0)
5052 xmlGenericError(xmlGenericErrorContext,
5053 "HPP: lookup '%c' found at %d\n",
5054 first, base);
5055 else if (third == 0)
5056 xmlGenericError(xmlGenericErrorContext,
5057 "HPP: lookup '%c%c' found at %d\n",
5058 first, next, base);
5059 else
5060 xmlGenericError(xmlGenericErrorContext,
5061 "HPP: lookup '%c%c%c' found at %d\n",
5062 first, next, third, base);
Owen Taylor3473f882001-02-23 17:55:21 +00005063#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02005064 return (base - (in->cur - in->base));
5065 }
Owen Taylor3473f882001-02-23 17:55:21 +00005066 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02005067 if ((!incomment) && (!invalue))
5068 ctxt->checkIndex = base;
Owen Taylor3473f882001-02-23 17:55:21 +00005069#ifdef DEBUG_PUSH
5070 if (next == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02005071 xmlGenericError(xmlGenericErrorContext,
5072 "HPP: lookup '%c' failed\n", first);
Owen Taylor3473f882001-02-23 17:55:21 +00005073 else if (third == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02005074 xmlGenericError(xmlGenericErrorContext,
5075 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02005076 else
Daniel Veillardeeb99322009-08-25 14:42:16 +02005077 xmlGenericError(xmlGenericErrorContext,
5078 "HPP: lookup '%c%c%c' failed\n", first, next,
5079 third);
Owen Taylor3473f882001-02-23 17:55:21 +00005080#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02005081 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00005082}
5083
5084/**
Markus Kull56a03032009-08-24 19:00:23 +02005085 * htmlParseLookupChars:
5086 * @ctxt: an HTML parser context
5087 * @stop: Array of chars, which stop the lookup.
5088 * @stopLen: Length of stop-Array
5089 *
5090 * Try to find if any char of the stop-Array is available in the input
5091 * stream.
5092 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5093 * to avoid rescanning sequences of bytes, it DOES change the state of the
5094 * parser, do not use liberally.
5095 *
5096 * Returns the index to the current parsing point if a stopChar
5097 * is available, -1 otherwise.
5098 */
5099static int
5100htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5101 int stopLen)
5102{
5103 int base, len;
5104 htmlParserInputPtr in;
5105 const xmlChar *buf;
5106 int incomment = 0;
5107 int i;
5108
5109 in = ctxt->input;
5110 if (in == NULL)
5111 return (-1);
5112
5113 base = in->cur - in->base;
5114 if (base < 0)
5115 return (-1);
5116
5117 if (ctxt->checkIndex > base)
5118 base = ctxt->checkIndex;
5119
5120 if (in->buf == NULL) {
5121 buf = in->base;
5122 len = in->length;
5123 } else {
5124 buf = in->buf->buffer->content;
5125 len = in->buf->buffer->use;
5126 }
5127
5128 for (; base < len; base++) {
5129 if (!incomment && (base + 4 < len)) {
5130 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5131 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5132 incomment = 1;
5133 /* do not increment past <! - some people use <!--> */
5134 base += 2;
5135 }
5136 }
5137 if (incomment) {
5138 if (base + 3 > len)
5139 return (-1);
5140 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5141 (buf[base + 2] == '>')) {
5142 incomment = 0;
5143 base += 2;
5144 }
5145 continue;
5146 }
5147 for (i = 0; i < stopLen; ++i) {
5148 if (buf[base] == stop[i]) {
5149 ctxt->checkIndex = 0;
5150 return (base - (in->cur - in->base));
5151 }
5152 }
5153 }
5154 ctxt->checkIndex = base;
5155 return (-1);
5156}
5157
5158/**
Owen Taylor3473f882001-02-23 17:55:21 +00005159 * htmlParseTryOrFinish:
5160 * @ctxt: an HTML parser context
5161 * @terminate: last chunk indicator
5162 *
5163 * Try to progress on parsing
5164 *
5165 * Returns zero if no parsing was possible
5166 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005167static int
Owen Taylor3473f882001-02-23 17:55:21 +00005168htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5169 int ret = 0;
5170 htmlParserInputPtr in;
5171 int avail = 0;
5172 xmlChar cur, next;
5173
5174#ifdef DEBUG_PUSH
5175 switch (ctxt->instate) {
5176 case XML_PARSER_EOF:
5177 xmlGenericError(xmlGenericErrorContext,
5178 "HPP: try EOF\n"); break;
5179 case XML_PARSER_START:
5180 xmlGenericError(xmlGenericErrorContext,
5181 "HPP: try START\n"); break;
5182 case XML_PARSER_MISC:
5183 xmlGenericError(xmlGenericErrorContext,
5184 "HPP: try MISC\n");break;
5185 case XML_PARSER_COMMENT:
5186 xmlGenericError(xmlGenericErrorContext,
5187 "HPP: try COMMENT\n");break;
5188 case XML_PARSER_PROLOG:
5189 xmlGenericError(xmlGenericErrorContext,
5190 "HPP: try PROLOG\n");break;
5191 case XML_PARSER_START_TAG:
5192 xmlGenericError(xmlGenericErrorContext,
5193 "HPP: try START_TAG\n");break;
5194 case XML_PARSER_CONTENT:
5195 xmlGenericError(xmlGenericErrorContext,
5196 "HPP: try CONTENT\n");break;
5197 case XML_PARSER_CDATA_SECTION:
5198 xmlGenericError(xmlGenericErrorContext,
5199 "HPP: try CDATA_SECTION\n");break;
5200 case XML_PARSER_END_TAG:
5201 xmlGenericError(xmlGenericErrorContext,
5202 "HPP: try END_TAG\n");break;
5203 case XML_PARSER_ENTITY_DECL:
5204 xmlGenericError(xmlGenericErrorContext,
5205 "HPP: try ENTITY_DECL\n");break;
5206 case XML_PARSER_ENTITY_VALUE:
5207 xmlGenericError(xmlGenericErrorContext,
5208 "HPP: try ENTITY_VALUE\n");break;
5209 case XML_PARSER_ATTRIBUTE_VALUE:
5210 xmlGenericError(xmlGenericErrorContext,
5211 "HPP: try ATTRIBUTE_VALUE\n");break;
5212 case XML_PARSER_DTD:
5213 xmlGenericError(xmlGenericErrorContext,
5214 "HPP: try DTD\n");break;
5215 case XML_PARSER_EPILOG:
5216 xmlGenericError(xmlGenericErrorContext,
5217 "HPP: try EPILOG\n");break;
5218 case XML_PARSER_PI:
5219 xmlGenericError(xmlGenericErrorContext,
5220 "HPP: try PI\n");break;
5221 case XML_PARSER_SYSTEM_LITERAL:
5222 xmlGenericError(xmlGenericErrorContext,
5223 "HPP: try SYSTEM_LITERAL\n");break;
5224 }
5225#endif
5226
5227 while (1) {
5228
5229 in = ctxt->input;
5230 if (in == NULL) break;
5231 if (in->buf == NULL)
5232 avail = in->length - (in->cur - in->base);
5233 else
5234 avail = in->buf->buffer->use - (in->cur - in->base);
5235 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005236 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005237 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005238 /*
5239 * SAX: end of the document processing.
5240 */
5241 ctxt->instate = XML_PARSER_EOF;
5242 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5243 ctxt->sax->endDocument(ctxt->userData);
5244 }
5245 }
5246 if (avail < 1)
5247 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00005248 cur = in->cur[0];
5249 if (cur == 0) {
5250 SKIP(1);
5251 continue;
5252 }
5253
Owen Taylor3473f882001-02-23 17:55:21 +00005254 switch (ctxt->instate) {
5255 case XML_PARSER_EOF:
5256 /*
5257 * Document parsing is done !
5258 */
5259 goto done;
5260 case XML_PARSER_START:
5261 /*
5262 * Very first chars read from the document flow.
5263 */
5264 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005265 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005266 SKIP_BLANKS;
5267 if (in->buf == NULL)
5268 avail = in->length - (in->cur - in->base);
5269 else
5270 avail = in->buf->buffer->use - (in->cur - in->base);
5271 }
5272 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5273 ctxt->sax->setDocumentLocator(ctxt->userData,
5274 &xmlDefaultSAXLocator);
5275 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5276 (!ctxt->disableSAX))
5277 ctxt->sax->startDocument(ctxt->userData);
5278
5279 cur = in->cur[0];
5280 next = in->cur[1];
5281 if ((cur == '<') && (next == '!') &&
5282 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5283 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5284 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5285 (UPP(8) == 'E')) {
5286 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005287 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005288 goto done;
5289#ifdef DEBUG_PUSH
5290 xmlGenericError(xmlGenericErrorContext,
5291 "HPP: Parsing internal subset\n");
5292#endif
5293 htmlParseDocTypeDecl(ctxt);
5294 ctxt->instate = XML_PARSER_PROLOG;
5295#ifdef DEBUG_PUSH
5296 xmlGenericError(xmlGenericErrorContext,
5297 "HPP: entering PROLOG\n");
5298#endif
5299 } else {
5300 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005301#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00005302 xmlGenericError(xmlGenericErrorContext,
5303 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005304#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00005305 }
Owen Taylor3473f882001-02-23 17:55:21 +00005306 break;
5307 case XML_PARSER_MISC:
5308 SKIP_BLANKS;
5309 if (in->buf == NULL)
5310 avail = in->length - (in->cur - in->base);
5311 else
5312 avail = in->buf->buffer->use - (in->cur - in->base);
5313 if (avail < 2)
5314 goto done;
5315 cur = in->cur[0];
5316 next = in->cur[1];
5317 if ((cur == '<') && (next == '!') &&
5318 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5319 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005320 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005321 goto done;
5322#ifdef DEBUG_PUSH
5323 xmlGenericError(xmlGenericErrorContext,
5324 "HPP: Parsing Comment\n");
5325#endif
5326 htmlParseComment(ctxt);
5327 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005328 } else if ((cur == '<') && (next == '?')) {
5329 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005330 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005331 goto done;
5332#ifdef DEBUG_PUSH
5333 xmlGenericError(xmlGenericErrorContext,
5334 "HPP: Parsing PI\n");
5335#endif
5336 htmlParsePI(ctxt);
5337 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005338 } else if ((cur == '<') && (next == '!') &&
5339 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5340 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5341 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5342 (UPP(8) == 'E')) {
5343 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005344 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005345 goto done;
5346#ifdef DEBUG_PUSH
5347 xmlGenericError(xmlGenericErrorContext,
5348 "HPP: Parsing internal subset\n");
5349#endif
5350 htmlParseDocTypeDecl(ctxt);
5351 ctxt->instate = XML_PARSER_PROLOG;
5352#ifdef DEBUG_PUSH
5353 xmlGenericError(xmlGenericErrorContext,
5354 "HPP: entering PROLOG\n");
5355#endif
5356 } else if ((cur == '<') && (next == '!') &&
5357 (avail < 9)) {
5358 goto done;
5359 } else {
5360 ctxt->instate = XML_PARSER_START_TAG;
5361#ifdef DEBUG_PUSH
5362 xmlGenericError(xmlGenericErrorContext,
5363 "HPP: entering START_TAG\n");
5364#endif
5365 }
5366 break;
5367 case XML_PARSER_PROLOG:
5368 SKIP_BLANKS;
5369 if (in->buf == NULL)
5370 avail = in->length - (in->cur - in->base);
5371 else
5372 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02005373 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00005374 goto done;
5375 cur = in->cur[0];
5376 next = in->cur[1];
5377 if ((cur == '<') && (next == '!') &&
5378 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5379 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005380 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005381 goto done;
5382#ifdef DEBUG_PUSH
5383 xmlGenericError(xmlGenericErrorContext,
5384 "HPP: Parsing Comment\n");
5385#endif
5386 htmlParseComment(ctxt);
5387 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005388 } else if ((cur == '<') && (next == '?')) {
5389 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005390 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005391 goto done;
5392#ifdef DEBUG_PUSH
5393 xmlGenericError(xmlGenericErrorContext,
5394 "HPP: Parsing PI\n");
5395#endif
5396 htmlParsePI(ctxt);
5397 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005398 } else if ((cur == '<') && (next == '!') &&
5399 (avail < 4)) {
5400 goto done;
5401 } else {
5402 ctxt->instate = XML_PARSER_START_TAG;
5403#ifdef DEBUG_PUSH
5404 xmlGenericError(xmlGenericErrorContext,
5405 "HPP: entering START_TAG\n");
5406#endif
5407 }
5408 break;
5409 case XML_PARSER_EPILOG:
5410 if (in->buf == NULL)
5411 avail = in->length - (in->cur - in->base);
5412 else
5413 avail = in->buf->buffer->use - (in->cur - in->base);
5414 if (avail < 1)
5415 goto done;
5416 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005417 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005418 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005419 goto done;
5420 }
5421 if (avail < 2)
5422 goto done;
5423 next = in->cur[1];
5424 if ((cur == '<') && (next == '!') &&
5425 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5426 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005427 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005428 goto done;
5429#ifdef DEBUG_PUSH
5430 xmlGenericError(xmlGenericErrorContext,
5431 "HPP: Parsing Comment\n");
5432#endif
5433 htmlParseComment(ctxt);
5434 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005435 } else if ((cur == '<') && (next == '?')) {
5436 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005437 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005438 goto done;
5439#ifdef DEBUG_PUSH
5440 xmlGenericError(xmlGenericErrorContext,
5441 "HPP: Parsing PI\n");
5442#endif
5443 htmlParsePI(ctxt);
5444 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005445 } else if ((cur == '<') && (next == '!') &&
5446 (avail < 4)) {
5447 goto done;
5448 } else {
5449 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005450 ctxt->wellFormed = 0;
5451 ctxt->instate = XML_PARSER_EOF;
5452#ifdef DEBUG_PUSH
5453 xmlGenericError(xmlGenericErrorContext,
5454 "HPP: entering EOF\n");
5455#endif
5456 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5457 ctxt->sax->endDocument(ctxt->userData);
5458 goto done;
5459 }
5460 break;
5461 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005462 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005463 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005464 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005465
5466 if (avail < 2)
5467 goto done;
5468 cur = in->cur[0];
5469 if (cur != '<') {
5470 ctxt->instate = XML_PARSER_CONTENT;
5471#ifdef DEBUG_PUSH
5472 xmlGenericError(xmlGenericErrorContext,
5473 "HPP: entering CONTENT\n");
5474#endif
5475 break;
5476 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005477 if (in->cur[1] == '/') {
5478 ctxt->instate = XML_PARSER_END_TAG;
5479 ctxt->checkIndex = 0;
5480#ifdef DEBUG_PUSH
5481 xmlGenericError(xmlGenericErrorContext,
5482 "HPP: entering END_TAG\n");
5483#endif
5484 break;
5485 }
Owen Taylor3473f882001-02-23 17:55:21 +00005486 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005487 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005488 goto done;
5489
Daniel Veillard597f1c12005-07-03 23:00:18 +00005490 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005491 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005492 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005493 (name == NULL)) {
5494 if (CUR == '>')
5495 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005496 break;
5497 }
Owen Taylor3473f882001-02-23 17:55:21 +00005498
5499 /*
5500 * Lookup the info for that element.
5501 */
5502 info = htmlTagLookup(name);
5503 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005504 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5505 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005506 }
5507
5508 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005509 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005510 */
5511 if ((CUR == '/') && (NXT(1) == '>')) {
5512 SKIP(2);
5513 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5514 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005515 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005516 ctxt->instate = XML_PARSER_CONTENT;
5517#ifdef DEBUG_PUSH
5518 xmlGenericError(xmlGenericErrorContext,
5519 "HPP: entering CONTENT\n");
5520#endif
5521 break;
5522 }
5523
5524 if (CUR == '>') {
5525 NEXT;
5526 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005527 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5528 "Couldn't find end of Start Tag %s\n",
5529 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005530
5531 /*
5532 * end of parsing of this node.
5533 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005534 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005535 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005536 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005537 }
Owen Taylor3473f882001-02-23 17:55:21 +00005538
5539 ctxt->instate = XML_PARSER_CONTENT;
5540#ifdef DEBUG_PUSH
5541 xmlGenericError(xmlGenericErrorContext,
5542 "HPP: entering CONTENT\n");
5543#endif
5544 break;
5545 }
5546
5547 /*
5548 * Check for an Empty Element from DTD definition
5549 */
5550 if ((info != NULL) && (info->empty)) {
5551 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5552 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005553 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005554 }
5555 ctxt->instate = XML_PARSER_CONTENT;
5556#ifdef DEBUG_PUSH
5557 xmlGenericError(xmlGenericErrorContext,
5558 "HPP: entering CONTENT\n");
5559#endif
5560 break;
5561 }
5562 case XML_PARSER_CONTENT: {
5563 long cons;
5564 /*
5565 * Handle preparsed entities and charRef
5566 */
5567 if (ctxt->token != 0) {
5568 xmlChar chr[2] = { 0 , 0 } ;
5569
5570 chr[0] = (xmlChar) ctxt->token;
5571 htmlCheckParagraph(ctxt);
5572 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5573 ctxt->sax->characters(ctxt->userData, chr, 1);
5574 ctxt->token = 0;
5575 ctxt->checkIndex = 0;
5576 }
5577 if ((avail == 1) && (terminate)) {
5578 cur = in->cur[0];
5579 if ((cur != '<') && (cur != '&')) {
5580 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005581 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005582 if (ctxt->sax->ignorableWhitespace != NULL)
5583 ctxt->sax->ignorableWhitespace(
5584 ctxt->userData, &cur, 1);
5585 } else {
5586 htmlCheckParagraph(ctxt);
5587 if (ctxt->sax->characters != NULL)
5588 ctxt->sax->characters(
5589 ctxt->userData, &cur, 1);
5590 }
5591 }
5592 ctxt->token = 0;
5593 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005594 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005595 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005596 }
Owen Taylor3473f882001-02-23 17:55:21 +00005597 }
5598 if (avail < 2)
5599 goto done;
5600 cur = in->cur[0];
5601 next = in->cur[1];
5602 cons = ctxt->nbChars;
5603 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5604 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5605 /*
5606 * Handle SCRIPT/STYLE separately
5607 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005608 if (!terminate) {
5609 int idx;
5610 xmlChar val;
5611
Jiri Netolicky446e1262009-08-07 17:05:36 +02005612 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
Daniel Veillard68716a72006-10-16 09:32:17 +00005613 if (idx < 0)
5614 goto done;
5615 val = in->cur[idx + 2];
5616 if (val == 0) /* bad cut of input */
5617 goto done;
5618 }
Owen Taylor3473f882001-02-23 17:55:21 +00005619 htmlParseScript(ctxt);
5620 if ((cur == '<') && (next == '/')) {
5621 ctxt->instate = XML_PARSER_END_TAG;
5622 ctxt->checkIndex = 0;
5623#ifdef DEBUG_PUSH
5624 xmlGenericError(xmlGenericErrorContext,
5625 "HPP: entering END_TAG\n");
5626#endif
5627 break;
5628 }
5629 } else {
5630 /*
5631 * Sometimes DOCTYPE arrives in the middle of the document
5632 */
5633 if ((cur == '<') && (next == '!') &&
5634 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5635 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5636 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5637 (UPP(8) == 'E')) {
5638 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005639 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005640 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005641 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5642 "Misplaced DOCTYPE declaration\n",
5643 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005644 htmlParseDocTypeDecl(ctxt);
5645 } else if ((cur == '<') && (next == '!') &&
5646 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5647 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005648 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005649 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005650 goto done;
5651#ifdef DEBUG_PUSH
5652 xmlGenericError(xmlGenericErrorContext,
5653 "HPP: Parsing Comment\n");
5654#endif
5655 htmlParseComment(ctxt);
5656 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005657 } else if ((cur == '<') && (next == '?')) {
5658 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005659 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005660 goto done;
5661#ifdef DEBUG_PUSH
5662 xmlGenericError(xmlGenericErrorContext,
5663 "HPP: Parsing PI\n");
5664#endif
5665 htmlParsePI(ctxt);
5666 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005667 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5668 goto done;
5669 } else if ((cur == '<') && (next == '/')) {
5670 ctxt->instate = XML_PARSER_END_TAG;
5671 ctxt->checkIndex = 0;
5672#ifdef DEBUG_PUSH
5673 xmlGenericError(xmlGenericErrorContext,
5674 "HPP: entering END_TAG\n");
5675#endif
5676 break;
5677 } else if (cur == '<') {
5678 ctxt->instate = XML_PARSER_START_TAG;
5679 ctxt->checkIndex = 0;
5680#ifdef DEBUG_PUSH
5681 xmlGenericError(xmlGenericErrorContext,
5682 "HPP: entering START_TAG\n");
5683#endif
5684 break;
5685 } else if (cur == '&') {
5686 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005687 (htmlParseLookupChars(ctxt,
5688 BAD_CAST "; >/", 4) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005689 goto done;
5690#ifdef DEBUG_PUSH
5691 xmlGenericError(xmlGenericErrorContext,
5692 "HPP: Parsing Reference\n");
5693#endif
5694 /* TODO: check generation of subtrees if noent !!! */
5695 htmlParseReference(ctxt);
5696 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005697 /*
5698 * check that the text sequence is complete
5699 * before handing out the data to the parser
5700 * to avoid problems with erroneous end of
5701 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005702 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005703 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005704 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005705 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005706 ctxt->checkIndex = 0;
5707#ifdef DEBUG_PUSH
5708 xmlGenericError(xmlGenericErrorContext,
5709 "HPP: Parsing char data\n");
5710#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005711 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005712 }
5713 }
5714 if (cons == ctxt->nbChars) {
5715 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005716 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5717 "detected an error in element content\n",
5718 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005719 }
5720 NEXT;
5721 break;
5722 }
5723
5724 break;
5725 }
5726 case XML_PARSER_END_TAG:
5727 if (avail < 2)
5728 goto done;
5729 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005730 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005731 goto done;
5732 htmlParseEndTag(ctxt);
5733 if (ctxt->nameNr == 0) {
5734 ctxt->instate = XML_PARSER_EPILOG;
5735 } else {
5736 ctxt->instate = XML_PARSER_CONTENT;
5737 }
5738 ctxt->checkIndex = 0;
5739#ifdef DEBUG_PUSH
5740 xmlGenericError(xmlGenericErrorContext,
5741 "HPP: entering CONTENT\n");
5742#endif
5743 break;
5744 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005745 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5746 "HPP: internal error, state == CDATA\n",
5747 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005748 ctxt->instate = XML_PARSER_CONTENT;
5749 ctxt->checkIndex = 0;
5750#ifdef DEBUG_PUSH
5751 xmlGenericError(xmlGenericErrorContext,
5752 "HPP: entering CONTENT\n");
5753#endif
5754 break;
5755 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005756 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5757 "HPP: internal error, state == DTD\n",
5758 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005759 ctxt->instate = XML_PARSER_CONTENT;
5760 ctxt->checkIndex = 0;
5761#ifdef DEBUG_PUSH
5762 xmlGenericError(xmlGenericErrorContext,
5763 "HPP: entering CONTENT\n");
5764#endif
5765 break;
5766 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005767 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5768 "HPP: internal error, state == COMMENT\n",
5769 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005770 ctxt->instate = XML_PARSER_CONTENT;
5771 ctxt->checkIndex = 0;
5772#ifdef DEBUG_PUSH
5773 xmlGenericError(xmlGenericErrorContext,
5774 "HPP: entering CONTENT\n");
5775#endif
5776 break;
5777 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005778 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5779 "HPP: internal error, state == PI\n",
5780 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005781 ctxt->instate = XML_PARSER_CONTENT;
5782 ctxt->checkIndex = 0;
5783#ifdef DEBUG_PUSH
5784 xmlGenericError(xmlGenericErrorContext,
5785 "HPP: entering CONTENT\n");
5786#endif
5787 break;
5788 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005789 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5790 "HPP: internal error, state == ENTITY_DECL\n",
5791 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005792 ctxt->instate = XML_PARSER_CONTENT;
5793 ctxt->checkIndex = 0;
5794#ifdef DEBUG_PUSH
5795 xmlGenericError(xmlGenericErrorContext,
5796 "HPP: entering CONTENT\n");
5797#endif
5798 break;
5799 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005800 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5801 "HPP: internal error, state == ENTITY_VALUE\n",
5802 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005803 ctxt->instate = XML_PARSER_CONTENT;
5804 ctxt->checkIndex = 0;
5805#ifdef DEBUG_PUSH
5806 xmlGenericError(xmlGenericErrorContext,
5807 "HPP: entering DTD\n");
5808#endif
5809 break;
5810 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005811 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5812 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5813 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005814 ctxt->instate = XML_PARSER_START_TAG;
5815 ctxt->checkIndex = 0;
5816#ifdef DEBUG_PUSH
5817 xmlGenericError(xmlGenericErrorContext,
5818 "HPP: entering START_TAG\n");
5819#endif
5820 break;
5821 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005822 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5823 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5824 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005825 ctxt->instate = XML_PARSER_CONTENT;
5826 ctxt->checkIndex = 0;
5827#ifdef DEBUG_PUSH
5828 xmlGenericError(xmlGenericErrorContext,
5829 "HPP: entering CONTENT\n");
5830#endif
5831 break;
5832 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005833 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5834 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5835 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005836 ctxt->instate = XML_PARSER_CONTENT;
5837 ctxt->checkIndex = 0;
5838#ifdef DEBUG_PUSH
5839 xmlGenericError(xmlGenericErrorContext,
5840 "HPP: entering CONTENT\n");
5841#endif
5842 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005843 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005844 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5845 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5846 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005847 ctxt->instate = XML_PARSER_CONTENT;
5848 ctxt->checkIndex = 0;
5849#ifdef DEBUG_PUSH
5850 xmlGenericError(xmlGenericErrorContext,
5851 "HPP: entering CONTENT\n");
5852#endif
5853 break;
5854
Owen Taylor3473f882001-02-23 17:55:21 +00005855 }
5856 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005857done:
Owen Taylor3473f882001-02-23 17:55:21 +00005858 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005859 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005860 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005861 /*
5862 * SAX: end of the document processing.
5863 */
5864 ctxt->instate = XML_PARSER_EOF;
5865 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5866 ctxt->sax->endDocument(ctxt->userData);
5867 }
5868 }
5869 if ((ctxt->myDoc != NULL) &&
5870 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5871 (ctxt->instate == XML_PARSER_EPILOG))) {
5872 xmlDtdPtr dtd;
5873 dtd = xmlGetIntSubset(ctxt->myDoc);
5874 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02005875 ctxt->myDoc->intSubset =
5876 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005877 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5878 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5879 }
5880#ifdef DEBUG_PUSH
5881 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5882#endif
5883 return(ret);
5884}
5885
5886/**
Owen Taylor3473f882001-02-23 17:55:21 +00005887 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005888 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005889 * @chunk: an char array
5890 * @size: the size in byte of the chunk
5891 * @terminate: last chunk indicator
5892 *
5893 * Parse a Chunk of memory
5894 *
5895 * Returns zero if no error, the xmlParserErrors otherwise.
5896 */
5897int
5898htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5899 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005900 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5901 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5902 "htmlParseChunk: context error\n", NULL, NULL);
5903 return(XML_ERR_INTERNAL_ERROR);
5904 }
Owen Taylor3473f882001-02-23 17:55:21 +00005905 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5906 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5907 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5908 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005909 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02005910
5911 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00005912 if (res < 0) {
5913 ctxt->errNo = XML_PARSER_EOF;
5914 ctxt->disableSAX = 1;
5915 return (XML_PARSER_EOF);
5916 }
Owen Taylor3473f882001-02-23 17:55:21 +00005917 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5918 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005919 ctxt->input->end =
5920 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005921#ifdef DEBUG_PUSH
5922 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5923#endif
5924
Daniel Veillard14f752c2003-08-09 11:44:50 +00005925#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005926 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5927 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005928#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005929 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005930 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5931 xmlParserInputBufferPtr in = ctxt->input->buf;
5932 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5933 (in->raw != NULL)) {
5934 int nbchars;
Daniel Veillarde77db162009-08-22 11:32:38 +02005935
Daniel Veillard14f752c2003-08-09 11:44:50 +00005936 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5937 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005938 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5939 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005940 return(XML_ERR_INVALID_ENCODING);
5941 }
5942 }
5943 }
Owen Taylor3473f882001-02-23 17:55:21 +00005944 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00005945 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00005946 if (terminate) {
5947 if ((ctxt->instate != XML_PARSER_EOF) &&
5948 (ctxt->instate != XML_PARSER_EPILOG) &&
5949 (ctxt->instate != XML_PARSER_MISC)) {
5950 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005951 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02005952 }
Owen Taylor3473f882001-02-23 17:55:21 +00005953 if (ctxt->instate != XML_PARSER_EOF) {
5954 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5955 ctxt->sax->endDocument(ctxt->userData);
5956 }
5957 ctxt->instate = XML_PARSER_EOF;
5958 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005959 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00005960}
5961
5962/************************************************************************
5963 * *
5964 * User entry points *
5965 * *
5966 ************************************************************************/
5967
5968/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005969 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005970 * @sax: a SAX handler
5971 * @user_data: The user data returned on SAX callbacks
5972 * @chunk: a pointer to an array of chars
5973 * @size: number of chars in the array
5974 * @filename: an optional file name or URI
5975 * @enc: an optional encoding
5976 *
5977 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005978 * The value of @filename is used for fetching external entities
5979 * and error/warning reports.
5980 *
5981 * Returns the new parser context or NULL
5982 */
5983htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02005984htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00005985 const char *chunk, int size, const char *filename,
5986 xmlCharEncoding enc) {
5987 htmlParserCtxtPtr ctxt;
5988 htmlParserInputPtr inputStream;
5989 xmlParserInputBufferPtr buf;
5990
Daniel Veillardd0463562001-10-13 09:15:48 +00005991 xmlInitParser();
5992
Owen Taylor3473f882001-02-23 17:55:21 +00005993 buf = xmlAllocParserInputBuffer(enc);
5994 if (buf == NULL) return(NULL);
5995
Daniel Veillardf403d292003-10-05 13:51:35 +00005996 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005997 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005998 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005999 return(NULL);
6000 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00006001 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6002 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00006003 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00006004 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00006005 xmlFree(ctxt->sax);
6006 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6007 if (ctxt->sax == NULL) {
6008 xmlFree(buf);
6009 xmlFree(ctxt);
6010 return(NULL);
6011 }
6012 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6013 if (user_data != NULL)
6014 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02006015 }
Owen Taylor3473f882001-02-23 17:55:21 +00006016 if (filename == NULL) {
6017 ctxt->directory = NULL;
6018 } else {
6019 ctxt->directory = xmlParserGetDirectory(filename);
6020 }
6021
6022 inputStream = htmlNewInputStream(ctxt);
6023 if (inputStream == NULL) {
6024 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00006025 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006026 return(NULL);
6027 }
6028
6029 if (filename == NULL)
6030 inputStream->filename = NULL;
6031 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00006032 inputStream->filename = (char *)
6033 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00006034 inputStream->buf = buf;
6035 inputStream->base = inputStream->buf->buffer->content;
6036 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillarde77db162009-08-22 11:32:38 +02006037 inputStream->end =
Daniel Veillard5f704af2003-03-05 10:01:43 +00006038 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00006039
6040 inputPush(ctxt, inputStream);
6041
6042 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02006043 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00006044 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
6045 int cur = ctxt->input->cur - ctxt->input->base;
6046
Daniel Veillarde77db162009-08-22 11:32:38 +02006047 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00006048
6049 ctxt->input->base = ctxt->input->buf->buffer->content + base;
6050 ctxt->input->cur = ctxt->input->base + cur;
6051 ctxt->input->end =
6052 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00006053#ifdef DEBUG_PUSH
6054 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6055#endif
6056 }
Daniel Veillard68716a72006-10-16 09:32:17 +00006057 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00006058
6059 return(ctxt);
6060}
William M. Brack21e4ef22005-01-02 09:53:13 +00006061#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00006062
6063/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006064 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006065 * @cur: a pointer to an array of xmlChar
6066 * @encoding: a free form C string describing the HTML document encoding, or NULL
6067 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006068 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006069 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006070 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6071 * to handle parse events. If sax is NULL, fallback to the default DOM
6072 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006073 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006074 * Returns the resulting document tree unless SAX is NULL or the document is
6075 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006076 */
6077
6078htmlDocPtr
6079htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6080 htmlDocPtr ret;
6081 htmlParserCtxtPtr ctxt;
6082
Daniel Veillardd0463562001-10-13 09:15:48 +00006083 xmlInitParser();
6084
Owen Taylor3473f882001-02-23 17:55:21 +00006085 if (cur == NULL) return(NULL);
6086
6087
6088 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6089 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02006090 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00006091 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00006092 ctxt->sax = sax;
6093 ctxt->userData = userData;
6094 }
6095
6096 htmlParseDocument(ctxt);
6097 ret = ctxt->myDoc;
6098 if (sax != NULL) {
6099 ctxt->sax = NULL;
6100 ctxt->userData = NULL;
6101 }
6102 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006103
Owen Taylor3473f882001-02-23 17:55:21 +00006104 return(ret);
6105}
6106
6107/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006108 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006109 * @cur: a pointer to an array of xmlChar
6110 * @encoding: a free form C string describing the HTML document encoding, or NULL
6111 *
6112 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006113 *
Owen Taylor3473f882001-02-23 17:55:21 +00006114 * Returns the resulting document tree
6115 */
6116
6117htmlDocPtr
6118htmlParseDoc(xmlChar *cur, const char *encoding) {
6119 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6120}
6121
6122
6123/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006124 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006125 * @filename: the filename
6126 * @encoding: a free form C string describing the HTML document encoding, or NULL
6127 *
Daniel Veillarde77db162009-08-22 11:32:38 +02006128 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00006129 * Automatic support for ZLIB/Compress compressed document is provided
6130 * by default if found at compile-time.
6131 *
6132 * Returns the new parser context or NULL
6133 */
6134htmlParserCtxtPtr
6135htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6136{
6137 htmlParserCtxtPtr ctxt;
6138 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006139 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00006140 /* htmlCharEncoding enc; */
6141 xmlChar *content, *content_line = (xmlChar *) "charset=";
6142
Daniel Veillarda03e3652004-11-02 18:45:30 +00006143 if (filename == NULL)
6144 return(NULL);
6145
Daniel Veillardf403d292003-10-05 13:51:35 +00006146 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006147 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00006148 return(NULL);
6149 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006150 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6151 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00006152#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006153 if (xmlDefaultSAXHandler.error != NULL) {
6154 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6155 }
Daniel Veillard87247e82004-01-13 20:42:02 +00006156#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00006157 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00006158 return(NULL);
6159 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006160
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006161 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6162 xmlFree(canonicFilename);
6163 if (inputStream == NULL) {
6164 xmlFreeParserCtxt(ctxt);
6165 return(NULL);
6166 }
Owen Taylor3473f882001-02-23 17:55:21 +00006167
6168 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006169
Owen Taylor3473f882001-02-23 17:55:21 +00006170 /* set encoding */
6171 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00006172 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Daniel Veillarde77db162009-08-22 11:32:38 +02006173 if (content) {
Owen Taylor3473f882001-02-23 17:55:21 +00006174 strcpy ((char *)content, (char *)content_line);
6175 strcat ((char *)content, (char *)encoding);
6176 htmlCheckEncoding (ctxt, content);
6177 xmlFree (content);
6178 }
6179 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006180
Owen Taylor3473f882001-02-23 17:55:21 +00006181 return(ctxt);
6182}
6183
6184/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006185 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006186 * @filename: the filename
6187 * @encoding: a free form C string describing the HTML document encoding, or NULL
6188 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006189 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006190 *
6191 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6192 * compressed document is provided by default if found at compile-time.
6193 * It use the given SAX function block to handle the parsing callback.
6194 * If sax is NULL, fallback to the default DOM tree building routines.
6195 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006196 * Returns the resulting document tree unless SAX is NULL or the document is
6197 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006198 */
6199
6200htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006201htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00006202 void *userData) {
6203 htmlDocPtr ret;
6204 htmlParserCtxtPtr ctxt;
6205 htmlSAXHandlerPtr oldsax = NULL;
6206
Daniel Veillardd0463562001-10-13 09:15:48 +00006207 xmlInitParser();
6208
Owen Taylor3473f882001-02-23 17:55:21 +00006209 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6210 if (ctxt == NULL) return(NULL);
6211 if (sax != NULL) {
6212 oldsax = ctxt->sax;
6213 ctxt->sax = sax;
6214 ctxt->userData = userData;
6215 }
6216
6217 htmlParseDocument(ctxt);
6218
6219 ret = ctxt->myDoc;
6220 if (sax != NULL) {
6221 ctxt->sax = oldsax;
6222 ctxt->userData = NULL;
6223 }
6224 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006225
Owen Taylor3473f882001-02-23 17:55:21 +00006226 return(ret);
6227}
6228
6229/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006230 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006231 * @filename: the filename
6232 * @encoding: a free form C string describing the HTML document encoding, or NULL
6233 *
6234 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6235 * compressed document is provided by default if found at compile-time.
6236 *
6237 * Returns the resulting document tree
6238 */
6239
6240htmlDocPtr
6241htmlParseFile(const char *filename, const char *encoding) {
6242 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6243}
6244
6245/**
6246 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02006247 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00006248 *
6249 * Set and return the previous value for handling HTML omitted tags.
6250 *
6251 * Returns the last value for 0 for no handling, 1 for auto insertion.
6252 */
6253
6254int
6255htmlHandleOmittedElem(int val) {
6256 int old = htmlOmittedDefaultValue;
6257
6258 htmlOmittedDefaultValue = val;
6259 return(old);
6260}
6261
Daniel Veillard930dfb62003-02-05 10:17:38 +00006262/**
6263 * htmlElementAllowedHere:
6264 * @parent: HTML parent element
6265 * @elt: HTML element
6266 *
6267 * Checks whether an HTML element may be a direct child of a parent element.
6268 * Note - doesn't check for deprecated elements
6269 *
6270 * Returns 1 if allowed; 0 otherwise.
6271 */
6272int
6273htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6274 const char** p ;
6275
6276 if ( ! elt || ! parent || ! parent->subelts )
6277 return 0 ;
6278
6279 for ( p = parent->subelts; *p; ++p )
6280 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6281 return 1 ;
6282
6283 return 0 ;
6284}
6285/**
6286 * htmlElementStatusHere:
6287 * @parent: HTML parent element
6288 * @elt: HTML element
6289 *
6290 * Checks whether an HTML element may be a direct child of a parent element.
6291 * and if so whether it is valid or deprecated.
6292 *
6293 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6294 */
6295htmlStatus
6296htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6297 if ( ! parent || ! elt )
6298 return HTML_INVALID ;
6299 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6300 return HTML_INVALID ;
6301
6302 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6303}
6304/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006305 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00006306 * @elt: HTML element
6307 * @attr: HTML attribute
6308 * @legacy: whether to allow deprecated attributes
6309 *
6310 * Checks whether an attribute is valid for an element
6311 * Has full knowledge of Required and Deprecated attributes
6312 *
6313 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6314 */
6315htmlStatus
6316htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6317 const char** p ;
6318
6319 if ( !elt || ! attr )
6320 return HTML_INVALID ;
6321
6322 if ( elt->attrs_req )
6323 for ( p = elt->attrs_req; *p; ++p)
6324 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6325 return HTML_REQUIRED ;
6326
6327 if ( elt->attrs_opt )
6328 for ( p = elt->attrs_opt; *p; ++p)
6329 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6330 return HTML_VALID ;
6331
6332 if ( legacy && elt->attrs_depr )
6333 for ( p = elt->attrs_depr; *p; ++p)
6334 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6335 return HTML_DEPRECATED ;
6336
6337 return HTML_INVALID ;
6338}
6339/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006340 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00006341 * @node: an htmlNodePtr in a tree
6342 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00006343 * for Element nodes)
6344 *
6345 * Checks whether the tree node is valid. Experimental (the author
6346 * only uses the HTML enhancements in a SAX parser)
6347 *
6348 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6349 * legacy allowed) or htmlElementStatusHere (otherwise).
6350 * for Attribute nodes, a return from htmlAttrAllowed
6351 * for other nodes, HTML_NA (no checks performed)
6352 */
6353htmlStatus
6354htmlNodeStatus(const htmlNodePtr node, int legacy) {
6355 if ( ! node )
6356 return HTML_INVALID ;
6357
6358 switch ( node->type ) {
6359 case XML_ELEMENT_NODE:
6360 return legacy
6361 ? ( htmlElementAllowedHere (
6362 htmlTagLookup(node->parent->name) , node->name
6363 ) ? HTML_VALID : HTML_INVALID )
6364 : htmlElementStatusHere(
6365 htmlTagLookup(node->parent->name) ,
6366 htmlTagLookup(node->name) )
6367 ;
6368 case XML_ATTRIBUTE_NODE:
6369 return htmlAttrAllowed(
6370 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6371 default: return HTML_NA ;
6372 }
6373}
Daniel Veillard9475a352003-09-26 12:47:50 +00006374/************************************************************************
6375 * *
6376 * New set (2.6.0) of simpler and more flexible APIs *
6377 * *
6378 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A NULL @str is ignored. The macro expects a variable
 * named dict to be visible where it is expanded.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement
 * (safe inside an unbraced if/else) and must be followed by a semicolon.
 */
#define DICT_FREE(str)                                                  \
    do {                                                                \
        if ((str) && ((!dict) ||                                        \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))          \
            xmlFree((char *)(str));                                     \
    } while (0)
6390
6391/**
6392 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00006393 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00006394 *
6395 * Reset a parser context
6396 */
6397void
6398htmlCtxtReset(htmlParserCtxtPtr ctxt)
6399{
6400 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00006401 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02006402
Daniel Veillarda03e3652004-11-02 18:45:30 +00006403 if (ctxt == NULL)
6404 return;
6405
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006406 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00006407 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00006408
6409 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6410 xmlFreeInputStream(input);
6411 }
6412 ctxt->inputNr = 0;
6413 ctxt->input = NULL;
6414
6415 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00006416 if (ctxt->spaceTab != NULL) {
6417 ctxt->spaceTab[0] = -1;
6418 ctxt->space = &ctxt->spaceTab[0];
6419 } else {
6420 ctxt->space = NULL;
6421 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006422
6423
6424 ctxt->nodeNr = 0;
6425 ctxt->node = NULL;
6426
6427 ctxt->nameNr = 0;
6428 ctxt->name = NULL;
6429
6430 DICT_FREE(ctxt->version);
6431 ctxt->version = NULL;
6432 DICT_FREE(ctxt->encoding);
6433 ctxt->encoding = NULL;
6434 DICT_FREE(ctxt->directory);
6435 ctxt->directory = NULL;
6436 DICT_FREE(ctxt->extSubURI);
6437 ctxt->extSubURI = NULL;
6438 DICT_FREE(ctxt->extSubSystem);
6439 ctxt->extSubSystem = NULL;
6440 if (ctxt->myDoc != NULL)
6441 xmlFreeDoc(ctxt->myDoc);
6442 ctxt->myDoc = NULL;
6443
6444 ctxt->standalone = -1;
6445 ctxt->hasExternalSubset = 0;
6446 ctxt->hasPErefs = 0;
6447 ctxt->html = 1;
6448 ctxt->external = 0;
6449 ctxt->instate = XML_PARSER_START;
6450 ctxt->token = 0;
6451
6452 ctxt->wellFormed = 1;
6453 ctxt->nsWellFormed = 1;
Daniel Veillard8ad29302010-10-28 11:51:22 +02006454 ctxt->disableSAX = 0;
Daniel Veillard9475a352003-09-26 12:47:50 +00006455 ctxt->valid = 1;
6456 ctxt->vctxt.userData = ctxt;
6457 ctxt->vctxt.error = xmlParserValidityError;
6458 ctxt->vctxt.warning = xmlParserValidityWarning;
6459 ctxt->record_info = 0;
6460 ctxt->nbChars = 0;
6461 ctxt->checkIndex = 0;
6462 ctxt->inSubset = 0;
6463 ctxt->errNo = XML_ERR_OK;
6464 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006465 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006466 ctxt->catalogs = NULL;
6467 xmlInitNodeInfoSeq(&ctxt->node_seq);
6468
6469 if (ctxt->attsDefault != NULL) {
6470 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6471 ctxt->attsDefault = NULL;
6472 }
6473 if (ctxt->attsSpecial != NULL) {
6474 xmlHashFree(ctxt->attsSpecial, NULL);
6475 ctxt->attsSpecial = NULL;
6476 }
6477}
6478
6479/**
6480 * htmlCtxtUseOptions:
6481 * @ctxt: an HTML parser context
6482 * @options: a combination of htmlParserOption(s)
6483 *
6484 * Applies the options to the parser context
6485 *
6486 * Returns 0 in case of success, the set of unknown or unimplemented options
6487 * in case of error.
6488 */
6489int
6490htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6491{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006492 if (ctxt == NULL)
6493 return(-1);
6494
Daniel Veillard9475a352003-09-26 12:47:50 +00006495 if (options & HTML_PARSE_NOWARNING) {
6496 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006497 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006498 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006499 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006500 }
6501 if (options & HTML_PARSE_NOERROR) {
6502 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006503 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006504 ctxt->sax->fatalError = NULL;
6505 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006506 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006507 }
6508 if (options & HTML_PARSE_PEDANTIC) {
6509 ctxt->pedantic = 1;
6510 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006511 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006512 } else
6513 ctxt->pedantic = 0;
6514 if (options & XML_PARSE_NOBLANKS) {
6515 ctxt->keepBlanks = 0;
6516 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6517 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006518 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006519 } else
6520 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006521 if (options & HTML_PARSE_RECOVER) {
6522 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006523 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006524 } else
6525 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006526 if (options & HTML_PARSE_COMPACT) {
6527 ctxt->options |= HTML_PARSE_COMPACT;
6528 options -= HTML_PARSE_COMPACT;
6529 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006530 if (options & XML_PARSE_HUGE) {
6531 ctxt->options |= XML_PARSE_HUGE;
6532 options -= XML_PARSE_HUGE;
6533 }
Daniel Veillardf1121c42010-07-26 14:02:42 +02006534 if (options & HTML_PARSE_NODEFDTD) {
6535 ctxt->options |= HTML_PARSE_NODEFDTD;
6536 options -= HTML_PARSE_NODEFDTD;
6537 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006538 ctxt->dictNames = 0;
6539 return (options);
6540}
6541
6542/**
6543 * htmlDoRead:
6544 * @ctxt: an HTML parser context
6545 * @URL: the base URL to use for the document
6546 * @encoding: the document encoding, or NULL
6547 * @options: a combination of htmlParserOption(s)
6548 * @reuse: keep the context for reuse
6549 *
6550 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006551 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006552 * Returns the resulting document tree or NULL
6553 */
6554static htmlDocPtr
6555htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6556 int options, int reuse)
6557{
6558 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006559
Daniel Veillard9475a352003-09-26 12:47:50 +00006560 htmlCtxtUseOptions(ctxt, options);
6561 ctxt->html = 1;
6562 if (encoding != NULL) {
6563 xmlCharEncodingHandlerPtr hdlr;
6564
6565 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006566 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006567 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006568 if (ctxt->input->encoding != NULL)
6569 xmlFree((xmlChar *) ctxt->input->encoding);
6570 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6571 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006572 }
6573 if ((URL != NULL) && (ctxt->input != NULL) &&
6574 (ctxt->input->filename == NULL))
6575 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6576 htmlParseDocument(ctxt);
6577 ret = ctxt->myDoc;
6578 ctxt->myDoc = NULL;
6579 if (!reuse) {
6580 if ((ctxt->dictNames) &&
6581 (ret != NULL) &&
6582 (ret->dict == ctxt->dict))
6583 ctxt->dict = NULL;
6584 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006585 }
6586 return (ret);
6587}
6588
6589/**
6590 * htmlReadDoc:
6591 * @cur: a pointer to a zero terminated string
6592 * @URL: the base URL to use for the document
6593 * @encoding: the document encoding, or NULL
6594 * @options: a combination of htmlParserOption(s)
6595 *
6596 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006597 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006598 * Returns the resulting document tree
6599 */
6600htmlDocPtr
6601htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6602{
6603 htmlParserCtxtPtr ctxt;
6604
6605 if (cur == NULL)
6606 return (NULL);
6607
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006608 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006609 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006610 if (ctxt == NULL)
6611 return (NULL);
6612 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6613}
6614
6615/**
6616 * htmlReadFile:
6617 * @filename: a file or URL
6618 * @encoding: the document encoding, or NULL
6619 * @options: a combination of htmlParserOption(s)
6620 *
6621 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006622 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006623 * Returns the resulting document tree
6624 */
6625htmlDocPtr
6626htmlReadFile(const char *filename, const char *encoding, int options)
6627{
6628 htmlParserCtxtPtr ctxt;
6629
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006630 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006631 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6632 if (ctxt == NULL)
6633 return (NULL);
6634 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6635}
6636
6637/**
6638 * htmlReadMemory:
6639 * @buffer: a pointer to a char array
6640 * @size: the size of the array
6641 * @URL: the base URL to use for the document
6642 * @encoding: the document encoding, or NULL
6643 * @options: a combination of htmlParserOption(s)
6644 *
6645 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006646 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006647 * Returns the resulting document tree
6648 */
6649htmlDocPtr
6650htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6651{
6652 htmlParserCtxtPtr ctxt;
6653
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006654 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006655 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6656 if (ctxt == NULL)
6657 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006658 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006659 if (ctxt->sax != NULL)
6660 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006661 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6662}
6663
6664/**
6665 * htmlReadFd:
6666 * @fd: an open file descriptor
6667 * @URL: the base URL to use for the document
6668 * @encoding: the document encoding, or NULL
6669 * @options: a combination of htmlParserOption(s)
6670 *
6671 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006672 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006673 * Returns the resulting document tree
6674 */
6675htmlDocPtr
6676htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6677{
6678 htmlParserCtxtPtr ctxt;
6679 xmlParserInputBufferPtr input;
6680 xmlParserInputPtr stream;
6681
6682 if (fd < 0)
6683 return (NULL);
6684
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006685 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006686 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6687 if (input == NULL)
6688 return (NULL);
6689 ctxt = xmlNewParserCtxt();
6690 if (ctxt == NULL) {
6691 xmlFreeParserInputBuffer(input);
6692 return (NULL);
6693 }
6694 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6695 if (stream == NULL) {
6696 xmlFreeParserInputBuffer(input);
6697 xmlFreeParserCtxt(ctxt);
6698 return (NULL);
6699 }
6700 inputPush(ctxt, stream);
6701 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6702}
6703
6704/**
6705 * htmlReadIO:
6706 * @ioread: an I/O read function
6707 * @ioclose: an I/O close function
6708 * @ioctx: an I/O handler
6709 * @URL: the base URL to use for the document
6710 * @encoding: the document encoding, or NULL
6711 * @options: a combination of htmlParserOption(s)
6712 *
6713 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006714 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006715 * Returns the resulting document tree
6716 */
6717htmlDocPtr
6718htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6719 void *ioctx, const char *URL, const char *encoding, int options)
6720{
6721 htmlParserCtxtPtr ctxt;
6722 xmlParserInputBufferPtr input;
6723 xmlParserInputPtr stream;
6724
6725 if (ioread == NULL)
6726 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006727 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006728
6729 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6730 XML_CHAR_ENCODING_NONE);
6731 if (input == NULL)
6732 return (NULL);
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006733 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006734 if (ctxt == NULL) {
6735 xmlFreeParserInputBuffer(input);
6736 return (NULL);
6737 }
6738 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6739 if (stream == NULL) {
6740 xmlFreeParserInputBuffer(input);
6741 xmlFreeParserCtxt(ctxt);
6742 return (NULL);
6743 }
6744 inputPush(ctxt, stream);
6745 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6746}
6747
6748/**
6749 * htmlCtxtReadDoc:
6750 * @ctxt: an HTML parser context
6751 * @cur: a pointer to a zero terminated string
6752 * @URL: the base URL to use for the document
6753 * @encoding: the document encoding, or NULL
6754 * @options: a combination of htmlParserOption(s)
6755 *
6756 * parse an XML in-memory document and build a tree.
6757 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006758 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006759 * Returns the resulting document tree
6760 */
6761htmlDocPtr
6762htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6763 const char *URL, const char *encoding, int options)
6764{
6765 xmlParserInputPtr stream;
6766
6767 if (cur == NULL)
6768 return (NULL);
6769 if (ctxt == NULL)
6770 return (NULL);
6771
6772 htmlCtxtReset(ctxt);
6773
6774 stream = xmlNewStringInputStream(ctxt, cur);
6775 if (stream == NULL) {
6776 return (NULL);
6777 }
6778 inputPush(ctxt, stream);
6779 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6780}
6781
6782/**
6783 * htmlCtxtReadFile:
6784 * @ctxt: an HTML parser context
6785 * @filename: a file or URL
6786 * @encoding: the document encoding, or NULL
6787 * @options: a combination of htmlParserOption(s)
6788 *
6789 * parse an XML file from the filesystem or the network.
6790 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006791 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006792 * Returns the resulting document tree
6793 */
6794htmlDocPtr
6795htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6796 const char *encoding, int options)
6797{
6798 xmlParserInputPtr stream;
6799
6800 if (filename == NULL)
6801 return (NULL);
6802 if (ctxt == NULL)
6803 return (NULL);
6804
6805 htmlCtxtReset(ctxt);
6806
Daniel Veillard29614c72004-11-26 10:47:26 +00006807 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006808 if (stream == NULL) {
6809 return (NULL);
6810 }
6811 inputPush(ctxt, stream);
6812 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6813}
6814
6815/**
6816 * htmlCtxtReadMemory:
6817 * @ctxt: an HTML parser context
6818 * @buffer: a pointer to a char array
6819 * @size: the size of the array
6820 * @URL: the base URL to use for the document
6821 * @encoding: the document encoding, or NULL
6822 * @options: a combination of htmlParserOption(s)
6823 *
6824 * parse an XML in-memory document and build a tree.
6825 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006826 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006827 * Returns the resulting document tree
6828 */
6829htmlDocPtr
6830htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6831 const char *URL, const char *encoding, int options)
6832{
6833 xmlParserInputBufferPtr input;
6834 xmlParserInputPtr stream;
6835
6836 if (ctxt == NULL)
6837 return (NULL);
6838 if (buffer == NULL)
6839 return (NULL);
6840
6841 htmlCtxtReset(ctxt);
6842
6843 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6844 if (input == NULL) {
6845 return(NULL);
6846 }
6847
6848 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6849 if (stream == NULL) {
6850 xmlFreeParserInputBuffer(input);
6851 return(NULL);
6852 }
6853
6854 inputPush(ctxt, stream);
6855 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6856}
6857
6858/**
6859 * htmlCtxtReadFd:
6860 * @ctxt: an HTML parser context
6861 * @fd: an open file descriptor
6862 * @URL: the base URL to use for the document
6863 * @encoding: the document encoding, or NULL
6864 * @options: a combination of htmlParserOption(s)
6865 *
6866 * parse an XML from a file descriptor and build a tree.
6867 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006868 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006869 * Returns the resulting document tree
6870 */
6871htmlDocPtr
6872htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6873 const char *URL, const char *encoding, int options)
6874{
6875 xmlParserInputBufferPtr input;
6876 xmlParserInputPtr stream;
6877
6878 if (fd < 0)
6879 return (NULL);
6880 if (ctxt == NULL)
6881 return (NULL);
6882
6883 htmlCtxtReset(ctxt);
6884
6885
6886 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6887 if (input == NULL)
6888 return (NULL);
6889 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6890 if (stream == NULL) {
6891 xmlFreeParserInputBuffer(input);
6892 return (NULL);
6893 }
6894 inputPush(ctxt, stream);
6895 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6896}
6897
6898/**
6899 * htmlCtxtReadIO:
6900 * @ctxt: an HTML parser context
6901 * @ioread: an I/O read function
6902 * @ioclose: an I/O close function
6903 * @ioctx: an I/O handler
6904 * @URL: the base URL to use for the document
6905 * @encoding: the document encoding, or NULL
6906 * @options: a combination of htmlParserOption(s)
6907 *
6908 * parse an HTML document from I/O functions and source and build a tree.
6909 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006910 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006911 * Returns the resulting document tree
6912 */
6913htmlDocPtr
6914htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6915 xmlInputCloseCallback ioclose, void *ioctx,
6916 const char *URL,
6917 const char *encoding, int options)
6918{
6919 xmlParserInputBufferPtr input;
6920 xmlParserInputPtr stream;
6921
6922 if (ioread == NULL)
6923 return (NULL);
6924 if (ctxt == NULL)
6925 return (NULL);
6926
6927 htmlCtxtReset(ctxt);
6928
6929 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6930 XML_CHAR_ENCODING_NONE);
6931 if (input == NULL)
6932 return (NULL);
6933 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6934 if (stream == NULL) {
6935 xmlFreeParserInputBuffer(input);
6936 return (NULL);
6937 }
6938 inputPush(ctxt, stream);
6939 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6940}
6941
Daniel Veillard5d4644e2005-04-01 13:11:58 +00006942#define bottom_HTMLparser
6943#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00006944#endif /* LIBXML_HTML_ENABLED */