blob: 66ff17bb3ade1c848bc4d3aea8362a5f8c2003ea [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020062 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000063 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000112 if (ctxt != NULL)
113 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000115 XML_ERR_ERROR, NULL, 0,
116 (const char *) str1, (const char *) str2,
117 NULL, 0, 0,
118 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000119 if (ctxt != NULL)
120 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt: an HTML parser context
126 * @error: the error number
127 * @msg: the error message
128 * @val: integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134 const char *msg, int val)
135{
Daniel Veillard157fee02003-10-31 10:36:03 +0000136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137 (ctxt->instate == XML_PARSER_EOF))
138 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000139 if (ctxt != NULL)
140 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000144 if (ctxt != NULL)
145 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000146}
147
148/************************************************************************
149 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200150 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000151 * *
152 ************************************************************************/
153
Daniel Veillard1c732d22002-11-30 11:22:59 +0000154/**
155 * htmlnamePush:
156 * @ctxt: an HTML parser context
157 * @value: the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000162 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000163static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000165{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167 ctxt->html = 3;
168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 if (ctxt->nameNr >= ctxt->nameMax) {
171 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000172 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000173 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000174 ctxt->nameMax *
175 sizeof(ctxt->nameTab[0]));
176 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000177 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000178 return (0);
179 }
180 }
181 ctxt->nameTab[ctxt->nameNr] = value;
182 ctxt->name = value;
183 return (ctxt->nameNr++);
184}
185/**
186 * htmlnamePop:
187 * @ctxt: an HTML parser context
188 *
189 * Pops the top element name from the name stack
190 *
191 * Returns the name just removed
192 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000193static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000194htmlnamePop(htmlParserCtxtPtr ctxt)
195{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000197
Daniel Veillard1c732d22002-11-30 11:22:59 +0000198 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000199 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000200 ctxt->nameNr--;
201 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 if (ctxt->nameNr > 0)
204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205 else
206 ctxt->name = NULL;
207 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000208 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000209 return (ret);
210}
Owen Taylor3473f882001-02-23 17:55:21 +0000211
Eugene Pimenov615904f2010-03-15 15:16:02 +0100212/**
213 * htmlNodeInfoPush:
214 * @ctxt: an HTML parser context
215 * @value: the node info
216 *
217 * Pushes a new element name on top of the node info stack
218 *
219 * Returns 0 in case of error, the index in the stack otherwise
220 */
221static int
222htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
223{
224 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
225 if (ctxt->nodeInfoMax == 0)
226 ctxt->nodeInfoMax = 5;
227 ctxt->nodeInfoMax *= 2;
228 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
229 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
230 ctxt->nodeInfoMax *
231 sizeof(ctxt->nodeInfoTab[0]));
232 if (ctxt->nodeInfoTab == NULL) {
233 htmlErrMemory(ctxt, NULL);
234 return (0);
235 }
236 }
237 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
238 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
239 return (ctxt->nodeInfoNr++);
240}
241
242/**
243 * htmlNodeInfoPop:
244 * @ctxt: an HTML parser context
245 *
246 * Pops the top element name from the node info stack
247 *
248 * Returns 0 in case of error, the pointer to NodeInfo otherwise
249 */
250static htmlParserNodeInfo *
251htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
252{
253 if (ctxt->nodeInfoNr <= 0)
254 return (NULL);
255 ctxt->nodeInfoNr--;
256 if (ctxt->nodeInfoNr < 0)
257 return (NULL);
258 if (ctxt->nodeInfoNr > 0)
259 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
260 else
261 ctxt->nodeInfo = NULL;
262 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
263}
264
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` (the parser
 * context) to be in scope at the point of use.
 *
 * Byte-level macros; they look at raw bytes of the input and do no
 * multi-byte decoding, so they should only be compared against ASCII:
 *
 *   CUR_PTR   the current input pointer (usable as an lvalue).
 *   CUR       the byte at the current input pointer, as an int.
 *   CURRENT   same value as CUR.
 *   RAW       -1 while ctxt->token is non-zero, otherwise the byte at the
 *             current input pointer.
 *   NXT(n)    the byte n positions after the current one.
 *   UPPER     the current byte passed through toupper().
 *   UPP(n)    the byte n positions ahead passed through toupper().
 *   SKIP(n)   advance the input pointer, the column and ctxt->nbChars by n.
 *             The line counter is not touched, so it must only be used to
 *             skip strings without newlines.
 *
 * Character-level macros:
 *
 *   CUR_CHAR(l)      htmlCurrentChar(): the current character, its length
 *                    in bytes is stored in l.
 *   CUR_SCHAR(s, l)  xmlStringCurrentChar() on the string s, length in l.
 *   NEXT             xmlNextChar(): skip to the next character.
 *   NEXTL(l)         update line/column for the current byte, clear
 *                    ctxt->token, advance the input pointer by l bytes and
 *                    count one more character in ctxt->nbChars.
 *   COPY_BUF(l,b,i,v) append the character v of encoded length l to the
 *                    buffer b at index i and advance i accordingly.
 *   SKIP_BLANKS      htmlSkipBlankChars(): skip blank characters.
 *
 * Buffer management:
 *
 *   SHRINK    call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *             bytes were consumed and fewer than 2 * INPUT_CHUNK remain.
 *   GROW      call xmlParserInputGrow() when not in progressive mode and
 *             fewer than INPUT_CHUNK bytes remain.
 *
 * SHRINK, GROW and COPY_BUF expand to bare `if` statements: always use
 * them as a full statement, and brace them when they form the body of an
 * enclosing if/else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), \
                   ctxt->input->cur += (val), \
                   ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v))
344
345/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200346 * htmlFindEncoding:
347 * @the HTML parser context
348 *
349 * Ty to find and encoding in the current data available in the input
350 * buffer this is needed to try to switch to the proper encoding when
351 * one face a character error.
352 * That's an heuristic, since it's operating outside of parsing it could
353 * try to use a meta which had been commented out, that's the reason it
354 * should only be used in case of error, not as a default.
355 *
356 * Returns an encoding string or NULL if not found, the string need to
357 * be freed
358 */
359static xmlChar *
360htmlFindEncoding(xmlParserCtxtPtr ctxt) {
361 const xmlChar *start, *cur, *end;
362
363 if ((ctxt == NULL) || (ctxt->input == NULL) ||
364 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
365 (ctxt->input->buf->encoder != NULL))
366 return(NULL);
367 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
368 return(NULL);
369
370 start = ctxt->input->cur;
371 end = ctxt->input->end;
372 /* we also expect the input buffer to be zero terminated */
373 if (*end != 0)
374 return(NULL);
375
376 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
377 if (cur == NULL)
378 return(NULL);
379 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
380 if (cur == NULL)
381 return(NULL);
382 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
383 if (cur == NULL)
384 return(NULL);
385 cur += 8;
386 start = cur;
387 while (((*cur >= 'A') && (*cur <= 'Z')) ||
388 ((*cur >= 'a') && (*cur <= 'z')) ||
389 ((*cur >= '0') && (*cur <= '9')) ||
390 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
391 cur++;
392 if (cur == start)
393 return(NULL);
394 return(xmlStrndup(start, cur - start));
395}
396
397/**
Owen Taylor3473f882001-02-23 17:55:21 +0000398 * htmlCurrentChar:
399 * @ctxt: the HTML parser context
400 * @len: pointer to the length of the char read
401 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000402 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000403 * bytes in the input buffer. Implement the end of line normalization:
404 * 2.11 End-of-Line Handling
405 * If the encoding is unspecified, in the case we find an ISO-Latin-1
406 * char, then the encoding converter is plugged in automatically.
407 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000408 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000409 */
410
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000411static int
Owen Taylor3473f882001-02-23 17:55:21 +0000412htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
413 if (ctxt->instate == XML_PARSER_EOF)
414 return(0);
415
416 if (ctxt->token != 0) {
417 *len = 0;
418 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200419 }
Owen Taylor3473f882001-02-23 17:55:21 +0000420 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
421 /*
422 * We are supposed to handle UTF8, check it's valid
423 * From rfc2044: encoding of the Unicode values on UTF-8:
424 *
425 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
426 * 0000 0000-0000 007F 0xxxxxxx
427 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200428 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000429 *
430 * Check for the 0x110000 limit too
431 */
432 const unsigned char *cur = ctxt->input->cur;
433 unsigned char c;
434 unsigned int val;
435
436 c = *cur;
437 if (c & 0x80) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200438 if (cur[1] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000439 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200440 cur = ctxt->input->cur;
441 }
Owen Taylor3473f882001-02-23 17:55:21 +0000442 if ((cur[1] & 0xc0) != 0x80)
443 goto encoding_error;
444 if ((c & 0xe0) == 0xe0) {
445
Adiel Mittmann8a103792009-08-25 11:27:13 +0200446 if (cur[2] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000447 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200448 cur = ctxt->input->cur;
449 }
Owen Taylor3473f882001-02-23 17:55:21 +0000450 if ((cur[2] & 0xc0) != 0x80)
451 goto encoding_error;
452 if ((c & 0xf0) == 0xf0) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200453 if (cur[3] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000454 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200455 cur = ctxt->input->cur;
456 }
Owen Taylor3473f882001-02-23 17:55:21 +0000457 if (((c & 0xf8) != 0xf0) ||
458 ((cur[3] & 0xc0) != 0x80))
459 goto encoding_error;
460 /* 4-byte code */
461 *len = 4;
462 val = (cur[0] & 0x7) << 18;
463 val |= (cur[1] & 0x3f) << 12;
464 val |= (cur[2] & 0x3f) << 6;
465 val |= cur[3] & 0x3f;
466 } else {
467 /* 3-byte code */
468 *len = 3;
469 val = (cur[0] & 0xf) << 12;
470 val |= (cur[1] & 0x3f) << 6;
471 val |= cur[2] & 0x3f;
472 }
473 } else {
474 /* 2-byte code */
475 *len = 2;
476 val = (cur[0] & 0x1f) << 6;
477 val |= cur[1] & 0x3f;
478 }
479 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000480 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
481 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200482 }
Owen Taylor3473f882001-02-23 17:55:21 +0000483 return(val);
484 } else {
Daniel Veillard856c6682009-08-24 18:16:56 +0200485 if ((*ctxt->input->cur == 0) &&
486 (ctxt->input->cur < ctxt->input->end)) {
487 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
488 "Char 0x%X out of allowed range\n", 0);
489 *len = 1;
490 return(' ');
491 }
Owen Taylor3473f882001-02-23 17:55:21 +0000492 /* 1-byte code */
493 *len = 1;
494 return((int) *ctxt->input->cur);
495 }
496 }
497 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000498 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000499 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000500 * XML constructs only use < 128 chars
501 */
502 *len = 1;
503 if ((int) *ctxt->input->cur < 0x80)
504 return((int) *ctxt->input->cur);
505
506 /*
507 * Humm this is bad, do an automatic flow conversion
508 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200509 {
510 xmlChar * guess;
511 xmlCharEncodingHandlerPtr handler;
512
513 guess = htmlFindEncoding(ctxt);
514 if (guess == NULL) {
515 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
516 } else {
517 if (ctxt->input->encoding != NULL)
518 xmlFree((xmlChar *) ctxt->input->encoding);
519 ctxt->input->encoding = guess;
520 handler = xmlFindCharEncodingHandler((const char *) guess);
521 if (handler != NULL) {
522 xmlSwitchToEncoding(ctxt, handler);
523 } else {
524 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
525 "Unsupported encoding %s", guess, NULL);
526 }
527 }
528 ctxt->charset = XML_CHAR_ENCODING_UTF8;
529 }
530
Owen Taylor3473f882001-02-23 17:55:21 +0000531 return(xmlCurrentChar(ctxt, len));
532
533encoding_error:
534 /*
535 * If we detect an UTF8 error that probably mean that the
536 * input encoding didn't get properly advertized in the
537 * declaration header. Report the error and switch the encoding
538 * to ISO-Latin-1 (if you don't like this policy, just declare the
539 * encoding !)
540 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000541 {
542 char buffer[150];
543
Daniel Veillard861101d2007-06-12 08:38:57 +0000544 if (ctxt->input->end - ctxt->input->cur >= 4) {
545 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
546 ctxt->input->cur[0], ctxt->input->cur[1],
547 ctxt->input->cur[2], ctxt->input->cur[3]);
548 } else {
549 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
550 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000551 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
552 "Input is not proper UTF-8, indicate encoding !\n",
553 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000554 }
555
Daniel Veillarde77db162009-08-22 11:32:38 +0200556 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000557 *len = 1;
558 return((int) *ctxt->input->cur);
559}
560
561/**
Owen Taylor3473f882001-02-23 17:55:21 +0000562 * htmlSkipBlankChars:
563 * @ctxt: the HTML parser context
564 *
565 * skip all blanks character found at that point in the input streams.
566 *
567 * Returns the number of space chars skipped
568 */
569
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000570static int
Owen Taylor3473f882001-02-23 17:55:21 +0000571htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
572 int res = 0;
573
William M. Brack76e95df2003-10-18 16:20:14 +0000574 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000575 if ((*ctxt->input->cur == 0) &&
576 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
577 xmlPopInput(ctxt);
578 } else {
579 if (*(ctxt->input->cur) == '\n') {
580 ctxt->input->line++; ctxt->input->col = 1;
581 } else ctxt->input->col++;
582 ctxt->input->cur++;
583 ctxt->nbChars++;
584 if (*ctxt->input->cur == 0)
585 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
586 }
587 res++;
588 }
589 return(res);
590}
591
592
593
594/************************************************************************
595 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200596 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000597 * *
598 ************************************************************************/
599
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000612
613/* Definitions and a couple of vars for HTML Elements */
614
615#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000616#define NB_FONTSTYLE 8
Daniel Veillard930dfb62003-02-05 10:17:38 +0000617#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000618#define NB_PHRASE 10
Daniel Veillard491e58e2007-05-02 16:15:18 +0000619#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
620#define NB_SPECIAL 16
Eugene Pimenov4b41f152010-01-20 14:25:59 +0100621#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000622#define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
Eugene Pimenov4b41f152010-01-20 14:25:59 +0100623#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000624#define NB_BLOCK NB_HEADING + NB_LIST + 14
Daniel Veillard930dfb62003-02-05 10:17:38 +0000625#define FORMCTRL "input", "select", "textarea", "label", "button"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000626#define NB_FORMCTRL 5
Daniel Veillard930dfb62003-02-05 10:17:38 +0000627#define PCDATA
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000628#define NB_PCDATA 0
Daniel Veillard930dfb62003-02-05 10:17:38 +0000629#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000630#define NB_HEADING 6
Daniel Veillard930dfb62003-02-05 10:17:38 +0000631#define LIST "ul", "ol", "dir", "menu"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000632#define NB_LIST 4
Daniel Veillard930dfb62003-02-05 10:17:38 +0000633#define MODIFIER
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000634#define NB_MODIFIER 0
Daniel Veillard930dfb62003-02-05 10:17:38 +0000635#define FLOW BLOCK,INLINE
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000636#define NB_FLOW NB_BLOCK + NB_INLINE
Daniel Veillard930dfb62003-02-05 10:17:38 +0000637#define EMPTY NULL
638
639
Daniel Veillard065abe82006-07-03 08:55:04 +0000640static const char* const html_flow[] = { FLOW, NULL } ;
641static const char* const html_inline[] = { INLINE, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000642
643/* placeholders: elts with content but no subelements */
Daniel Veillard065abe82006-07-03 08:55:04 +0000644static const char* const html_pcdata[] = { NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000645#define html_cdata html_pcdata
646
647
648/* ... and for HTML Attributes */
649
650#define COREATTRS "id", "class", "style", "title"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000651#define NB_COREATTRS 4
Daniel Veillard930dfb62003-02-05 10:17:38 +0000652#define I18N "lang", "dir"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000653#define NB_I18N 2
Daniel Veillard930dfb62003-02-05 10:17:38 +0000654#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000655#define NB_EVENTS 9
Daniel Veillard930dfb62003-02-05 10:17:38 +0000656#define ATTRS COREATTRS,I18N,EVENTS
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000657#define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
Daniel Veillard930dfb62003-02-05 10:17:38 +0000658#define CELLHALIGN "align", "char", "charoff"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000659#define NB_CELLHALIGN 3
Daniel Veillard930dfb62003-02-05 10:17:38 +0000660#define CELLVALIGN "valign"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000661#define NB_CELLVALIGN 1
Daniel Veillard930dfb62003-02-05 10:17:38 +0000662
Daniel Veillard065abe82006-07-03 08:55:04 +0000663static const char* const html_attrs[] = { ATTRS, NULL } ;
664static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
665static const char* const core_attrs[] = { COREATTRS, NULL } ;
666static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000667
668
669/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000670static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000671 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
672 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000673static const char* const target_attr[] = { "target", NULL } ;
674static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
675static const char* const alt_attr[] = { "alt", NULL } ;
676static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
677static const char* const href_attrs[] = { "href", NULL } ;
678static const char* const clear_attrs[] = { "clear", NULL } ;
679static const char* const inline_p[] = { INLINE, "p", NULL } ;
680
681static const char* const flow_param[] = { FLOW, "param", NULL } ;
682static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000683 "archive", "alt", "name", "height", "width", "align",
684 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000685static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000686 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000687static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000688 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000689static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
690static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
691static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
692static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000693 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000694static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000695 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
696
697
Daniel Veillard065abe82006-07-03 08:55:04 +0000698static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
699static const char* const col_elt[] = { "col", NULL } ;
700static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
701static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
702static const char* const dl_contents[] = { "dt", "dd", NULL } ;
703static const char* const compact_attr[] = { "compact", NULL } ;
704static const char* const label_attr[] = { "label", NULL } ;
705static const char* const fieldset_contents[] = { FLOW, "legend" } ;
/*
 * NULL-terminated name lists referenced from html40ElementTable
 * (element content lists, optional, deprecated and required attributes).
 * NOTE(review): COREATTRS, I18N, ATTRS, HEADING, LIST, INLINE, BLOCK, FLOW,
 * PHRASE, MODIFIER, CELLHALIGN and CELLVALIGN are macros defined outside
 * this section -- confirm their expansion before editing a list.
 */
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
/* content list shared by the "dir" and "menu" table entries */
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
/* content list shared by the "tbody", "tfoot" and "thead" table entries */
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
/* the next two lists are shared by the "td" and "th" table entries */
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
/* content list shared by the "ol" and "ul" table entries */
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000762
/*
 * DECL casts a "const char * const" name list to "const char **" so that
 * it can be stored in the element table below.
 */
#define DECL (const char**)

/*
 * Table of the HTML 4.0 elements known to the parser. htmlTagLookup()
 * scans it linearly and compares the name case-insensitively.
 * Every row is initialised positionally, so the column order matters.
 * NOTE(review): the columns presumably follow htmlElemDesc as declared in
 * <libxml/HTMLparser.h> (name, startTag, endTag, saveEndTag, empty, depr,
 * dtd, isinline, desc, subelts, defaultsubelt, attrs_opt, attrs_depr,
 * attrs_req) -- confirm against the header before editing a row.
 * EMPTY and MODIFIER are defined outside this section.
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
	DECL inline_p , NULL , DECL html_attrs, NULL, NULL
},
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
	EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
	EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
	EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
	EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
	DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
	DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
},
{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "table", 0, 0, 0, 0, 0, 0, 0, "",
	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
},
{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
},
{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
},
{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};
1044
/*
 * start tags that imply the end of current element
 *
 * The array is a sequence of NULL-terminated groups. The first string of
 * a group is the name of a newly opened tag; the strings after it are the
 * open elements which that new tag closes implicitly (this is how
 * htmlCheckAutoClose() reads a group). A lone NULL ends the whole table.
 * htmlInitAutoClose() records the start of each group in
 * htmlStartCloseIndex.
 * FONTSTYLE is a macro defined outside this section.
 */
static const char * const htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"frameset",	"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div",		"p", "head", NULL,
"noscript",	"p", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"option",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
NULL
};
1105
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
1118
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no NULL terminator; its length has to be derived
 * with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest", so it never matched */
    "onchange",
    "onselect"
};
1144
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 *
 * The table is read by htmlGetEndPriority(); an element name that is not
 * listed gets the priority stored in the final, NULL-named entry.
 */

typedef struct {
    const char *name;		/* element name, NULL in the last entry */
    int priority;		/* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001172
/*
 * Index over htmlStartClose built by htmlInitAutoClose(): each used slot
 * points at the first string of one NULL-terminated group, the remaining
 * slots are NULL.
 */
static const char** htmlStartCloseIndex[100];
/* set to 1 once htmlInitAutoClose() has filled the index */
static int htmlStartCloseIndexinitialized = 0;
1175
1176/************************************************************************
1177 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001178 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001179 * *
1180 ************************************************************************/
1181
1182/**
1183 * htmlInitAutoClose:
1184 *
1185 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1186 * This is not reentrant. Call xmlInitParser() once before processing in
1187 * case of use in multithreaded programs.
1188 */
1189void
1190htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001191 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001192
1193 if (htmlStartCloseIndexinitialized) return;
1194
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001195 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1196 indx = 0;
1197 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001198 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001199 while (htmlStartClose[i] != NULL) i++;
1200 i++;
1201 }
1202 htmlStartCloseIndexinitialized = 1;
1203}
1204
1205/**
1206 * htmlTagLookup:
1207 * @tag: The tag name in lowercase
1208 *
1209 * Lookup the HTML tag in the ElementTable
1210 *
1211 * Returns the related htmlElemDescPtr or NULL if not found.
1212 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001213const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001214htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001215 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001216
1217 for (i = 0; i < (sizeof(html40ElementTable) /
1218 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001219 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001220 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001221 }
1222 return(NULL);
1223}
1224
1225/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001226 * htmlGetEndPriority:
1227 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001228 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001229 * Return value: The "endtag" priority.
1230 **/
1231static int
1232htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001233 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001234
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001235 while ((htmlEndPriority[i].name != NULL) &&
1236 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1237 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001238
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001239 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001240}
1241
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001242
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001243/**
Owen Taylor3473f882001-02-23 17:55:21 +00001244 * htmlCheckAutoClose:
1245 * @newtag: The new tag name
1246 * @oldtag: The old tag name
1247 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001248 * Checks whether the new tag is one of the registered valid tags for
1249 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001250 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1251 *
1252 * Returns 0 if no, 1 if yes.
1253 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001254static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001255htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1256{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001257 int i, indx;
1258 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001259
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001260 if (htmlStartCloseIndexinitialized == 0)
1261 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001262
1263 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001264 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001265 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001266 if (closed == NULL)
1267 return (0);
1268 if (xmlStrEqual(BAD_CAST * closed, newtag))
1269 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001270 }
1271
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001272 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001273 i++;
1274 while (htmlStartClose[i] != NULL) {
1275 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001276 return (1);
1277 }
1278 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001279 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001280 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001281}
1282
1283/**
1284 * htmlAutoCloseOnClose:
1285 * @ctxt: an HTML parser context
1286 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001287 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001288 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001289 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001290 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001291static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001292htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1293{
1294 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001295 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001296
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001297 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001298
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001299 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001300
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001301 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1302 break;
1303 /*
1304 * A missplaced endtag can only close elements with lower
1305 * or equal priority, so if we find an element with higher
1306 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001307 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001308 */
1309 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1310 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001311 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001312 if (i < 0)
1313 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001314
1315 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001316 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001317 if ((info != NULL) && (info->endTag == 3)) {
1318 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1319 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001320 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001321 }
1322 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1323 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001324 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001325 }
1326}
1327
1328/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001329 * htmlAutoCloseOnEnd:
1330 * @ctxt: an HTML parser context
1331 *
1332 * Close all remaining tags at the end of the stream
1333 */
1334static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001335htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1336{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001337 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001338
William M. Brack899e64a2003-09-26 18:03:42 +00001339 if (ctxt->nameNr == 0)
1340 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001341 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001342 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1343 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001344 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001345 }
1346}
1347
1348/**
Owen Taylor3473f882001-02-23 17:55:21 +00001349 * htmlAutoClose:
1350 * @ctxt: an HTML parser context
1351 * @newtag: The new tag name or NULL
1352 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001353 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001354 * The list is kept in htmlStartClose array. This function is
1355 * called when a new tag has been detected and generates the
1356 * appropriates closes if possible/needed.
1357 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001358 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001359 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001360static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001361htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1362{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001363 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001364 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001365 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1366 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001367 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001368 }
1369 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001370 htmlAutoCloseOnEnd(ctxt);
1371 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001372 }
1373 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001374 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1375 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1376 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001377 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1378 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001379 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001380 }
Owen Taylor3473f882001-02-23 17:55:21 +00001381}
1382
1383/**
1384 * htmlAutoCloseTag:
1385 * @doc: the HTML document
1386 * @name: The tag name
1387 * @elem: the HTML element
1388 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001389 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001390 * The list is kept in htmlStartClose array. This function checks
1391 * if the element or one of it's children would autoclose the
1392 * given tag.
1393 *
1394 * Returns 1 if autoclose, 0 otherwise
1395 */
1396int
1397htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1398 htmlNodePtr child;
1399
1400 if (elem == NULL) return(1);
1401 if (xmlStrEqual(name, elem->name)) return(0);
1402 if (htmlCheckAutoClose(elem->name, name)) return(1);
1403 child = elem->children;
1404 while (child != NULL) {
1405 if (htmlAutoCloseTag(doc, name, child)) return(1);
1406 child = child->next;
1407 }
1408 return(0);
1409}
1410
1411/**
1412 * htmlIsAutoClosed:
1413 * @doc: the HTML document
1414 * @elem: the HTML element
1415 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001416 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001417 * The list is kept in htmlStartClose array. This function checks
1418 * if a tag is autoclosed by one of it's child
1419 *
1420 * Returns 1 if autoclosed, 0 otherwise
1421 */
1422int
1423htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1424 htmlNodePtr child;
1425
1426 if (elem == NULL) return(1);
1427 child = elem->children;
1428 while (child != NULL) {
1429 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1430 child = child->next;
1431 }
1432 return(0);
1433}
1434
1435/**
1436 * htmlCheckImplied:
1437 * @ctxt: an HTML parser context
1438 * @newtag: The new tag name
1439 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001440 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001441 * called when a new tag has been detected and generates the
1442 * appropriates implicit tags if missing
1443 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001444static void
Owen Taylor3473f882001-02-23 17:55:21 +00001445htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardb468f742009-08-24 18:45:33 +02001446 int i;
1447
Daniel Veillarde20fb5a2010-01-29 20:47:08 +01001448 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1449 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001450 if (!htmlOmittedDefaultValue)
1451 return;
1452 if (xmlStrEqual(newtag, BAD_CAST"html"))
1453 return;
1454 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001455 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001456 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1458 }
1459 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1460 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001461 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001462 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1463 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1464 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1465 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1466 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1467 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001468 if (ctxt->html >= 3) {
1469 /* we already saw or generated an <head> before */
1470 return;
1471 }
1472 /*
1473 * dropped OBJECT ... i you put it first BODY will be
1474 * assumed !
1475 */
1476 htmlnamePush(ctxt, BAD_CAST"head");
1477 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1478 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001479 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1480 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1481 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001482 if (ctxt->html >= 10) {
1483 /* we already saw or generated a <body> before */
1484 return;
1485 }
Owen Taylor3473f882001-02-23 17:55:21 +00001486 for (i = 0;i < ctxt->nameNr;i++) {
1487 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1488 return;
1489 }
1490 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1491 return;
1492 }
1493 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001494
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001495 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001496 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1498 }
1499}
1500
1501/**
1502 * htmlCheckParagraph
1503 * @ctxt: an HTML parser context
1504 *
1505 * Check whether a p element need to be implied before inserting
1506 * characters in the current element.
1507 *
1508 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1509 * in case of error.
1510 */
1511
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001512static int
Owen Taylor3473f882001-02-23 17:55:21 +00001513htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1514 const xmlChar *tag;
1515 int i;
1516
1517 if (ctxt == NULL)
1518 return(-1);
1519 tag = ctxt->name;
1520 if (tag == NULL) {
1521 htmlAutoClose(ctxt, BAD_CAST"p");
1522 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001523 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001524 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1525 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1526 return(1);
1527 }
1528 if (!htmlOmittedDefaultValue)
1529 return(0);
1530 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1531 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001532 htmlAutoClose(ctxt, BAD_CAST"p");
1533 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001534 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001535 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1536 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1537 return(1);
1538 }
1539 }
1540 return(0);
1541}
1542
1543/**
1544 * htmlIsScriptAttribute:
1545 * @name: an attribute name
1546 *
1547 * Check if an attribute is of content type Script
1548 *
1549 * Returns 1 is the attribute is a script 0 otherwise
1550 */
1551int
1552htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001553 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001554
1555 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001556 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001557 /*
1558 * all script attributes start with 'on'
1559 */
1560 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001561 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001562 for (i = 0;
1563 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1564 i++) {
1565 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1566 return(1);
1567 }
1568 return(0);
1569}
1570
1571/************************************************************************
1572 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001573 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001574 * *
1575 ************************************************************************/
1576
1577
Daniel Veillard22090732001-07-16 00:06:07 +00001578static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001579/*
1580 * the 4 absolute ones, plus apostrophe.
1581 */
1582{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1583{ 38, "amp", "ampersand, U+0026 ISOnum" },
1584{ 39, "apos", "single quote" },
1585{ 60, "lt", "less-than sign, U+003C ISOnum" },
1586{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1587
1588/*
1589 * A bunch still in the 128-255 range
1590 * Replacing them depend really on the charset used.
1591 */
1592{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1593{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1594{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1595{ 163, "pound","pound sign, U+00A3 ISOnum" },
1596{ 164, "curren","currency sign, U+00A4 ISOnum" },
1597{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1598{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1599{ 167, "sect", "section sign, U+00A7 ISOnum" },
1600{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1601{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1602{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1603{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1604{ 172, "not", "not sign, U+00AC ISOnum" },
1605{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1606{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1607{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1608{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1609{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1610{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1611{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1612{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1613{ 181, "micro","micro sign, U+00B5 ISOnum" },
1614{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1615{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1616{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1617{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1618{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1619{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1620{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1621{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1622{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1623{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1624{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1625{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1626{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1627{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1628{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1629{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1630{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1631{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1632{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1633{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1634{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1635{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1636{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1637{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1638{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1639{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1640{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1641{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1642{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1643{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1644{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1645{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1646{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1647{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1648{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1649{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1650{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1651{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1652{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1653{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1654{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1655{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1656{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1657{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1658{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1659{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1660{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1661{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1662{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1663{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1664{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1665{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1666{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1667{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1668{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1669{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1670{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1671{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1672{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1673{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1674{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1675{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1676{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1677{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1678{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1679{ 247, "divide","division sign, U+00F7 ISOnum" },
1680{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1681{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1682{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1683{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1684{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1685{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1686{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1687{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1688
1689{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1690{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1691{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1692{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1693{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1694
1695/*
1696 * Anything below should really be kept as entities references
1697 */
1698{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1699
1700{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1701{ 732, "tilde","small tilde, U+02DC ISOdia" },
1702
1703{ 913, "Alpha","greek capital letter alpha, U+0391" },
1704{ 914, "Beta", "greek capital letter beta, U+0392" },
1705{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1706{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1707{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1708{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1709{ 919, "Eta", "greek capital letter eta, U+0397" },
1710{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1711{ 921, "Iota", "greek capital letter iota, U+0399" },
1712{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001713{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001714{ 924, "Mu", "greek capital letter mu, U+039C" },
1715{ 925, "Nu", "greek capital letter nu, U+039D" },
1716{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1717{ 927, "Omicron","greek capital letter omicron, U+039F" },
1718{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1719{ 929, "Rho", "greek capital letter rho, U+03A1" },
1720{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1721{ 932, "Tau", "greek capital letter tau, U+03A4" },
1722{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1723{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1724{ 935, "Chi", "greek capital letter chi, U+03A7" },
1725{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1726{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1727
1728{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1729{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1730{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1731{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1732{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1733{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1734{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1735{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1736{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1737{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1738{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1739{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1740{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1741{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1742{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1743{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1744{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1745{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1746{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1747{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1748{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1749{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1750{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1751{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1752{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1753{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1754{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1755{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1756
1757{ 8194, "ensp", "en space, U+2002 ISOpub" },
1758{ 8195, "emsp", "em space, U+2003 ISOpub" },
1759{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1760{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1761{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1762{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1763{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1764{ 8211, "ndash","en dash, U+2013 ISOpub" },
1765{ 8212, "mdash","em dash, U+2014 ISOpub" },
1766{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1767{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1768{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1769{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1770{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1771{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1772{ 8224, "dagger","dagger, U+2020 ISOpub" },
1773{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1774
1775{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1776{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1777
1778{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1779
1780{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1781{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1782
1783{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1784{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1785
1786{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1787{ 8260, "frasl","fraction slash, U+2044 NEW" },
1788
1789{ 8364, "euro", "euro sign, U+20AC NEW" },
1790
1791{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1792{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1793{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1794{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1795{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1796{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1797{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1798{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1799{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1800{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1801{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1802{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1803{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1804{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1805{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1806{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1807
1808{ 8704, "forall","for all, U+2200 ISOtech" },
1809{ 8706, "part", "partial differential, U+2202 ISOtech" },
1810{ 8707, "exist","there exists, U+2203 ISOtech" },
1811{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1812{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1813{ 8712, "isin", "element of, U+2208 ISOtech" },
1814{ 8713, "notin","not an element of, U+2209 ISOtech" },
1815{ 8715, "ni", "contains as member, U+220B ISOtech" },
1816{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001817{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001818{ 8722, "minus","minus sign, U+2212 ISOtech" },
1819{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1820{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1821{ 8733, "prop", "proportional to, U+221D ISOtech" },
1822{ 8734, "infin","infinity, U+221E ISOtech" },
1823{ 8736, "ang", "angle, U+2220 ISOamso" },
1824{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1825{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1826{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1827{ 8746, "cup", "union = cup, U+222A ISOtech" },
1828{ 8747, "int", "integral, U+222B ISOtech" },
1829{ 8756, "there4","therefore, U+2234 ISOtech" },
1830{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1831{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1832{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1833{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1834{ 8801, "equiv","identical to, U+2261 ISOtech" },
1835{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1836{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1837{ 8834, "sub", "subset of, U+2282 ISOtech" },
1838{ 8835, "sup", "superset of, U+2283 ISOtech" },
1839{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1840{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1841{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1842{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1843{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1844{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1845{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1846{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1847{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1848{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1849{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1850{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1851{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1852{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1853
1854{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1855{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1856{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1857{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1858
1859};
1860
1861/************************************************************************
1862 * *
1863 * Commodity functions to handle entities *
1864 * *
1865 ************************************************************************/
1866
/*
 * Macro used to grow the current buffer.
 *
 * For an argument called `buffer`, the macro doubles the companion size
 * variable `buffer_size` (the name is built by token pasting) and
 * reallocates `buffer` to that many xmlChar.
 *
 * Caveats for callers:
 *  - a variable named `ctxt` must be in scope, it is passed to
 *    htmlErrMemory() when the reallocation fails;
 *  - on failure the old buffer is freed and the macro executes
 *    `return(NULL)` in the function that uses it, so it can only be
 *    used inside functions returning a pointer;
 *  - `buffer` may have moved after the call, any pointer into the old
 *    storage has to be recomputed;
 *  - the expansion is a plain brace block, not a do { } while (0)
 *    statement.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {							\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1881
1882/**
1883 * htmlEntityLookup:
1884 * @name: the entity name
1885 *
1886 * Lookup the given entity in EntitiesTable
1887 *
1888 * TODO: the linear scan is really ugly, an hash table is really needed.
1889 *
1890 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1891 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001892const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001893htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001894 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001895
1896 for (i = 0;i < (sizeof(html40EntitiesTable)/
1897 sizeof(html40EntitiesTable[0]));i++) {
1898 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001899 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001900 }
1901 }
1902 return(NULL);
1903}
1904
1905/**
1906 * htmlEntityValueLookup:
1907 * @value: the entity's unicode value
1908 *
1909 * Lookup the given entity in EntitiesTable
1910 *
1911 * TODO: the linear scan is really ugly, an hash table is really needed.
1912 *
1913 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1914 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001915const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001916htmlEntityValueLookup(unsigned int value) {
1917 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001918
1919 for (i = 0;i < (sizeof(html40EntitiesTable)/
1920 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001921 if (html40EntitiesTable[i].value >= value) {
1922 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001923 break;
William M. Brack78637da2003-07-31 14:47:38 +00001924 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001925 }
Owen Taylor3473f882001-02-23 17:55:21 +00001926 }
1927 return(NULL);
1928}
1929
1930/**
1931 * UTF8ToHtml:
1932 * @out: a pointer to an array of bytes to store the result
1933 * @outlen: the length of @out
1934 * @in: a pointer to an array of UTF-8 chars
1935 * @inlen: the length of @in
1936 *
1937 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1938 * plus HTML entities block of chars out.
1939 *
1940 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1941 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001942 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001943 * The value of @outlen after return is the number of octets consumed.
1944 */
1945int
1946UTF8ToHtml(unsigned char* out, int *outlen,
1947 const unsigned char* in, int *inlen) {
1948 const unsigned char* processed = in;
1949 const unsigned char* outend;
1950 const unsigned char* outstart = out;
1951 const unsigned char* instart = in;
1952 const unsigned char* inend;
1953 unsigned int c, d;
1954 int trailing;
1955
Daniel Veillardce682bc2004-11-05 17:22:25 +00001956 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001957 if (in == NULL) {
1958 /*
1959 * initialization nothing to do
1960 */
1961 *outlen = 0;
1962 *inlen = 0;
1963 return(0);
1964 }
1965 inend = in + (*inlen);
1966 outend = out + (*outlen);
1967 while (in < inend) {
1968 d = *in++;
1969 if (d < 0x80) { c= d; trailing= 0; }
1970 else if (d < 0xC0) {
1971 /* trailing byte in leading position */
1972 *outlen = out - outstart;
1973 *inlen = processed - instart;
1974 return(-2);
1975 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1976 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1977 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1978 else {
1979 /* no chance for this in Ascii */
1980 *outlen = out - outstart;
1981 *inlen = processed - instart;
1982 return(-2);
1983 }
1984
1985 if (inend - in < trailing) {
1986 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02001987 }
Owen Taylor3473f882001-02-23 17:55:21 +00001988
1989 for ( ; trailing; trailing--) {
1990 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1991 break;
1992 c <<= 6;
1993 c |= d & 0x3F;
1994 }
1995
1996 /* assertion: c is a single UTF-4 value */
1997 if (c < 0x80) {
1998 if (out + 1 >= outend)
1999 break;
2000 *out++ = c;
2001 } else {
2002 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00002003 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00002004 const char *cp;
2005 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00002006
2007 /*
2008 * Try to lookup a predefined HTML entity for it
2009 */
2010
2011 ent = htmlEntityValueLookup(c);
2012 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00002013 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2014 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00002015 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00002016 else
2017 cp = ent->name;
2018 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00002019 if (out + 2 + len >= outend)
2020 break;
2021 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00002022 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00002023 out += len;
2024 *out++ = ';';
2025 }
2026 processed = in;
2027 }
2028 *outlen = out - outstart;
2029 *inlen = processed - instart;
2030 return(0);
2031}
2032
2033/**
2034 * htmlEncodeEntities:
2035 * @out: a pointer to an array of bytes to store the result
2036 * @outlen: the length of @out
2037 * @in: a pointer to an array of UTF-8 chars
2038 * @inlen: the length of @in
2039 * @quoteChar: the quote character to escape (' or ") or zero.
2040 *
2041 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2042 * plus HTML entities block of chars out.
2043 *
2044 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2045 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002046 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00002047 * The value of @outlen after return is the number of octets consumed.
2048 */
2049int
2050htmlEncodeEntities(unsigned char* out, int *outlen,
2051 const unsigned char* in, int *inlen, int quoteChar) {
2052 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002053 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00002054 const unsigned char* outstart = out;
2055 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002056 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00002057 unsigned int c, d;
2058 int trailing;
2059
Daniel Veillardce682bc2004-11-05 17:22:25 +00002060 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2061 return(-1);
2062 outend = out + (*outlen);
2063 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00002064 while (in < inend) {
2065 d = *in++;
2066 if (d < 0x80) { c= d; trailing= 0; }
2067 else if (d < 0xC0) {
2068 /* trailing byte in leading position */
2069 *outlen = out - outstart;
2070 *inlen = processed - instart;
2071 return(-2);
2072 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2073 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2074 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2075 else {
2076 /* no chance for this in Ascii */
2077 *outlen = out - outstart;
2078 *inlen = processed - instart;
2079 return(-2);
2080 }
2081
2082 if (inend - in < trailing)
2083 break;
2084
2085 while (trailing--) {
2086 if (((d= *in++) & 0xC0) != 0x80) {
2087 *outlen = out - outstart;
2088 *inlen = processed - instart;
2089 return(-2);
2090 }
2091 c <<= 6;
2092 c |= d & 0x3F;
2093 }
2094
2095 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002096 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2097 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002098 if (out >= outend)
2099 break;
2100 *out++ = c;
2101 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002102 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002103 const char *cp;
2104 char nbuf[16];
2105 int len;
2106
2107 /*
2108 * Try to lookup a predefined HTML entity for it
2109 */
2110 ent = htmlEntityValueLookup(c);
2111 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002112 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002113 cp = nbuf;
2114 }
2115 else
2116 cp = ent->name;
2117 len = strlen(cp);
2118 if (out + 2 + len > outend)
2119 break;
2120 *out++ = '&';
2121 memcpy(out, cp, len);
2122 out += len;
2123 *out++ = ';';
2124 }
2125 processed = in;
2126 }
2127 *outlen = out - outstart;
2128 *inlen = processed - instart;
2129 return(0);
2130}
2131
Owen Taylor3473f882001-02-23 17:55:21 +00002132/************************************************************************
2133 * *
2134 * Commodity functions to handle streams *
2135 * *
2136 ************************************************************************/
2137
2138/**
Owen Taylor3473f882001-02-23 17:55:21 +00002139 * htmlNewInputStream:
2140 * @ctxt: an HTML parser context
2141 *
2142 * Create a new input stream structure
2143 * Returns the new input stream or NULL
2144 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002145static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002146htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2147 htmlParserInputPtr input;
2148
2149 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2150 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002151 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002152 return(NULL);
2153 }
2154 memset(input, 0, sizeof(htmlParserInput));
2155 input->filename = NULL;
2156 input->directory = NULL;
2157 input->base = NULL;
2158 input->cur = NULL;
2159 input->buf = NULL;
2160 input->line = 1;
2161 input->col = 1;
2162 input->buf = NULL;
2163 input->free = NULL;
2164 input->version = NULL;
2165 input->consumed = 0;
2166 input->length = 0;
2167 return(input);
2168}
2169
2170
2171/************************************************************************
2172 * *
2173 * Commodity functions, cleanup needed ? *
2174 * *
2175 ************************************************************************/
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002176/*
Daniel Veillarde77db162009-08-22 11:32:38 +02002177 * all tags allowing pc data from the html 4.01 loose dtd
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002178 * NOTE: it might be more apropriate to integrate this information
2179 * into the html40ElementTable array but I don't want to risk any
2180 * binary incomptibility
2181 */
2182static const char *allowPCData[] = {
2183 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
2184 "blockquote", "body", "button", "caption", "center", "cite", "code",
2185 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
2186 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
2187 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
2188 "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
2189};
Owen Taylor3473f882001-02-23 17:55:21 +00002190
2191/**
2192 * areBlanks:
2193 * @ctxt: an HTML parser context
2194 * @str: a xmlChar *
2195 * @len: the size of @str
2196 *
2197 * Is this a sequence of blank chars that one can ignore ?
2198 *
2199 * Returns 1 if ignorable 0 otherwise.
2200 */
2201
2202static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002203 unsigned int i;
2204 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002205 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002206 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002207
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002208 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002209 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002210
2211 if (CUR == 0) return(1);
2212 if (CUR != '<') return(0);
2213 if (ctxt->name == NULL)
2214 return(1);
2215 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2216 return(1);
2217 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2218 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002219
2220 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2221 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2222 dtd = xmlGetIntSubset(ctxt->myDoc);
2223 if (dtd != NULL && dtd->ExternalID != NULL) {
2224 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2225 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2226 return(1);
2227 }
2228 }
2229
Owen Taylor3473f882001-02-23 17:55:21 +00002230 if (ctxt->node == NULL) return(0);
2231 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002232 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2233 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002234 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002235 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2236 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002237 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002238 for all tags "b" allowing PCDATA */
2239 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2240 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2241 return(0);
2242 }
2243 }
Owen Taylor3473f882001-02-23 17:55:21 +00002244 } else if (xmlNodeIsText(lastChild)) {
2245 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002246 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002247 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002248 for all tags "p" allowing PCDATA */
2249 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2250 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2251 return(0);
2252 }
2253 }
Owen Taylor3473f882001-02-23 17:55:21 +00002254 }
2255 return(1);
2256}
2257
2258/**
Owen Taylor3473f882001-02-23 17:55:21 +00002259 * htmlNewDocNoDtD:
2260 * @URI: URI for the dtd, or NULL
2261 * @ExternalID: the external ID of the DTD, or NULL
2262 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002263 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2264 * are NULL
2265 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002266 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002267 */
2268htmlDocPtr
2269htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2270 xmlDocPtr cur;
2271
2272 /*
2273 * Allocate a new document and fill the fields.
2274 */
2275 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2276 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002277 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002278 return(NULL);
2279 }
2280 memset(cur, 0, sizeof(xmlDoc));
2281
2282 cur->type = XML_HTML_DOCUMENT_NODE;
2283 cur->version = NULL;
2284 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002285 cur->doc = cur;
2286 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002287 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002288 cur->extSubset = NULL;
2289 cur->oldNs = NULL;
2290 cur->encoding = NULL;
2291 cur->standalone = 1;
2292 cur->compression = 0;
2293 cur->ids = NULL;
2294 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002295 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002296 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002297 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002298 if ((ExternalID != NULL) ||
2299 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002300 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002301 return(cur);
2302}
2303
2304/**
2305 * htmlNewDoc:
2306 * @URI: URI for the dtd, or NULL
2307 * @ExternalID: the external ID of the DTD, or NULL
2308 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002309 * Creates a new HTML document
2310 *
Owen Taylor3473f882001-02-23 17:55:21 +00002311 * Returns a new document
2312 */
2313htmlDocPtr
2314htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2315 if ((URI == NULL) && (ExternalID == NULL))
2316 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002317 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2318 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002319
2320 return(htmlNewDocNoDtD(URI, ExternalID));
2321}
2322
2323
2324/************************************************************************
2325 * *
2326 * The parser itself *
2327 * Relates to http://www.w3.org/TR/html40 *
2328 * *
2329 ************************************************************************/
2330
2331/************************************************************************
2332 * *
2333 * The parser itself *
2334 * *
2335 ************************************************************************/
2336
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002337static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002338
Owen Taylor3473f882001-02-23 17:55:21 +00002339/**
2340 * htmlParseHTMLName:
2341 * @ctxt: an HTML parser context
2342 *
2343 * parse an HTML tag or attribute name, note that we convert it to lowercase
2344 * since HTML names are not case-sensitive.
2345 *
2346 * Returns the Tag Name parsed or NULL
2347 */
2348
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002349static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002350htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002351 int i = 0;
2352 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2353
William M. Brackd1757ab2004-10-02 22:07:48 +00002354 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002355 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002356
2357 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002358 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002359 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2360 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002361 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2362 else loc[i] = CUR;
2363 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002364
Owen Taylor3473f882001-02-23 17:55:21 +00002365 NEXT;
2366 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002367
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002368 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002369}
2370
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002371
2372/**
2373 * htmlParseHTMLName_nonInvasive:
2374 * @ctxt: an HTML parser context
2375 *
2376 * parse an HTML tag or attribute name, note that we convert it to lowercase
2377 * since HTML names are not case-sensitive, this doesn't consume the data
2378 * from the stream, it's a look-ahead
2379 *
2380 * Returns the Tag Name parsed or NULL
2381 */
2382
2383static const xmlChar *
2384htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2385 int i = 0;
2386 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2387
2388 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2389 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002390
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002391 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2392 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2393 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2394 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2395 else loc[i] = NXT(1+i);
2396 i++;
2397 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002398
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002399 return(xmlDictLookup(ctxt->dict, loc, i));
2400}
2401
2402
Owen Taylor3473f882001-02-23 17:55:21 +00002403/**
2404 * htmlParseName:
2405 * @ctxt: an HTML parser context
2406 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002407 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002408 *
2409 * Returns the Name parsed or NULL
2410 */
2411
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002412static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002413htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002414 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002415 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002416 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002417
2418 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002419
2420 /*
2421 * Accelerator for simple ASCII names
2422 */
2423 in = ctxt->input->cur;
2424 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2425 ((*in >= 0x41) && (*in <= 0x5A)) ||
2426 (*in == '_') || (*in == ':')) {
2427 in++;
2428 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2429 ((*in >= 0x41) && (*in <= 0x5A)) ||
2430 ((*in >= 0x30) && (*in <= 0x39)) ||
2431 (*in == '_') || (*in == '-') ||
2432 (*in == ':') || (*in == '.'))
2433 in++;
2434 if ((*in > 0) && (*in < 0x80)) {
2435 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002436 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002437 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002438 ctxt->nbChars += count;
2439 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002440 return(ret);
2441 }
2442 }
2443 return(htmlParseNameComplex(ctxt));
2444}
2445
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002446static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002447htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002448 int len = 0, l;
2449 int c;
2450 int count = 0;
2451
2452 /*
2453 * Handler for more complex cases
2454 */
2455 GROW;
2456 c = CUR_CHAR(l);
2457 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2458 (!IS_LETTER(c) && (c != '_') &&
2459 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002460 return(NULL);
2461 }
2462
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002463 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2464 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2465 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002466 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002467 (IS_COMBINING(c)) ||
2468 (IS_EXTENDER(c)))) {
2469 if (count++ > 100) {
2470 count = 0;
2471 GROW;
2472 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002473 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002474 NEXTL(l);
2475 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002476 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002477 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002478}
2479
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002480
Owen Taylor3473f882001-02-23 17:55:21 +00002481/**
2482 * htmlParseHTMLAttribute:
2483 * @ctxt: an HTML parser context
2484 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002485 *
Owen Taylor3473f882001-02-23 17:55:21 +00002486 * parse an HTML attribute value till the stop (quote), if
2487 * stop is 0 then it stops at the first space
2488 *
2489 * Returns the attribute parsed or NULL
2490 */
2491
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002492static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002493htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2494 xmlChar *buffer = NULL;
2495 int buffer_size = 0;
2496 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002497 const xmlChar *name = NULL;
2498 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002499 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002500
2501 /*
2502 * allocate a translation buffer.
2503 */
2504 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002505 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002506 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002507 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002508 return(NULL);
2509 }
2510 out = buffer;
2511
2512 /*
2513 * Ok loop until we reach one of the ending chars
2514 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002515 while ((CUR != 0) && (CUR != stop)) {
2516 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002517 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002518 if (CUR == '&') {
2519 if (NXT(1) == '#') {
2520 unsigned int c;
2521 int bits;
2522
2523 c = htmlParseCharRef(ctxt);
2524 if (c < 0x80)
2525 { *out++ = c; bits= -6; }
2526 else if (c < 0x800)
2527 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2528 else if (c < 0x10000)
2529 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002530 else
Owen Taylor3473f882001-02-23 17:55:21 +00002531 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002532
Owen Taylor3473f882001-02-23 17:55:21 +00002533 for ( ; bits >= 0; bits-= 6) {
2534 *out++ = ((c >> bits) & 0x3F) | 0x80;
2535 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002536
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002537 if (out - buffer > buffer_size - 100) {
2538 int indx = out - buffer;
2539
2540 growBuffer(buffer);
2541 out = &buffer[indx];
2542 }
Owen Taylor3473f882001-02-23 17:55:21 +00002543 } else {
2544 ent = htmlParseEntityRef(ctxt, &name);
2545 if (name == NULL) {
2546 *out++ = '&';
2547 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002548 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002549
2550 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002551 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002552 }
2553 } else if (ent == NULL) {
2554 *out++ = '&';
2555 cur = name;
2556 while (*cur != 0) {
2557 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002558 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002559
2560 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002561 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002562 }
2563 *out++ = *cur++;
2564 }
Owen Taylor3473f882001-02-23 17:55:21 +00002565 } else {
2566 unsigned int c;
2567 int bits;
2568
2569 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002570 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002571
2572 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002573 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002574 }
Daniel Veillard48519092006-10-17 15:56:35 +00002575 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002576 if (c < 0x80)
2577 { *out++ = c; bits= -6; }
2578 else if (c < 0x800)
2579 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2580 else if (c < 0x10000)
2581 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002582 else
Owen Taylor3473f882001-02-23 17:55:21 +00002583 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002584
Owen Taylor3473f882001-02-23 17:55:21 +00002585 for ( ; bits >= 0; bits-= 6) {
2586 *out++ = ((c >> bits) & 0x3F) | 0x80;
2587 }
Owen Taylor3473f882001-02-23 17:55:21 +00002588 }
2589 }
2590 } else {
2591 unsigned int c;
2592 int bits, l;
2593
2594 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002595 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002596
2597 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002598 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002599 }
2600 c = CUR_CHAR(l);
2601 if (c < 0x80)
2602 { *out++ = c; bits= -6; }
2603 else if (c < 0x800)
2604 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2605 else if (c < 0x10000)
2606 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002607 else
Owen Taylor3473f882001-02-23 17:55:21 +00002608 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002609
Owen Taylor3473f882001-02-23 17:55:21 +00002610 for ( ; bits >= 0; bits-= 6) {
2611 *out++ = ((c >> bits) & 0x3F) | 0x80;
2612 }
2613 NEXT;
2614 }
2615 }
Daniel Veillard13cee4e2009-09-05 14:52:55 +02002616 *out = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002617 return(buffer);
2618}
2619
2620/**
Owen Taylor3473f882001-02-23 17:55:21 +00002621 * htmlParseEntityRef:
2622 * @ctxt: an HTML parser context
2623 * @str: location to store the entity name
2624 *
2625 * parse an HTML ENTITY references
2626 *
2627 * [68] EntityRef ::= '&' Name ';'
2628 *
2629 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2630 * if non-NULL *str will have to be freed by the caller.
2631 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002632const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002633htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2634 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002635 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002636
2637 if (str != NULL) *str = NULL;
2638 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002639
2640 if (CUR == '&') {
2641 NEXT;
2642 name = htmlParseName(ctxt);
2643 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002644 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2645 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002646 } else {
2647 GROW;
2648 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002649 if (str != NULL)
2650 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002651
2652 /*
2653 * Lookup the entity in the table.
2654 */
2655 ent = htmlEntityLookup(name);
2656 if (ent != NULL) /* OK that's ugly !!! */
2657 NEXT;
2658 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002659 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2660 "htmlParseEntityRef: expecting ';'\n",
2661 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002662 if (str != NULL)
2663 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002664 }
2665 }
2666 }
2667 return(ent);
2668}
2669
2670/**
2671 * htmlParseAttValue:
2672 * @ctxt: an HTML parser context
2673 *
2674 * parse a value for an attribute
2675 * Note: the parser won't do substitution of entities here, this
2676 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002677 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002678 *
2679 * Returns the AttValue parsed or NULL.
2680 */
2681
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002682static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002683htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2684 xmlChar *ret = NULL;
2685
2686 if (CUR == '"') {
2687 NEXT;
2688 ret = htmlParseHTMLAttribute(ctxt, '"');
2689 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002690 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2691 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002692 } else
2693 NEXT;
2694 } else if (CUR == '\'') {
2695 NEXT;
2696 ret = htmlParseHTMLAttribute(ctxt, '\'');
2697 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002698 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2699 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002700 } else
2701 NEXT;
2702 } else {
2703 /*
2704 * That's an HTMLism, the attribute value may not be quoted
2705 */
2706 ret = htmlParseHTMLAttribute(ctxt, 0);
2707 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002708 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2709 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002710 }
2711 }
2712 return(ret);
2713}
2714
2715/**
2716 * htmlParseSystemLiteral:
2717 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002718 *
Owen Taylor3473f882001-02-23 17:55:21 +00002719 * parse an HTML Literal
2720 *
2721 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2722 *
2723 * Returns the SystemLiteral parsed or NULL
2724 */
2725
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002726static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002727htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2728 const xmlChar *q;
2729 xmlChar *ret = NULL;
2730
2731 if (CUR == '"') {
2732 NEXT;
2733 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002734 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002735 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002736 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002737 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2738 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002739 } else {
2740 ret = xmlStrndup(q, CUR_PTR - q);
2741 NEXT;
2742 }
2743 } else if (CUR == '\'') {
2744 NEXT;
2745 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002746 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002747 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002748 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002749 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2750 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002751 } else {
2752 ret = xmlStrndup(q, CUR_PTR - q);
2753 NEXT;
2754 }
2755 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002756 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2757 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002758 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002759
Owen Taylor3473f882001-02-23 17:55:21 +00002760 return(ret);
2761}
2762
2763/**
2764 * htmlParsePubidLiteral:
2765 * @ctxt: an HTML parser context
2766 *
2767 * parse an HTML public literal
2768 *
2769 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2770 *
2771 * Returns the PubidLiteral parsed or NULL.
2772 */
2773
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002774static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002775htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2776 const xmlChar *q;
2777 xmlChar *ret = NULL;
2778 /*
2779 * Name ::= (Letter | '_') (NameChar)*
2780 */
2781 if (CUR == '"') {
2782 NEXT;
2783 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002784 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002785 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002786 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2787 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002788 } else {
2789 ret = xmlStrndup(q, CUR_PTR - q);
2790 NEXT;
2791 }
2792 } else if (CUR == '\'') {
2793 NEXT;
2794 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002795 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002796 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002797 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002798 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2799 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002800 } else {
2801 ret = xmlStrndup(q, CUR_PTR - q);
2802 NEXT;
2803 }
2804 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002805 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2806 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002807 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002808
Owen Taylor3473f882001-02-23 17:55:21 +00002809 return(ret);
2810}
2811
2812/**
2813 * htmlParseScript:
2814 * @ctxt: an HTML parser context
2815 *
2816 * parse the content of an HTML SCRIPT or STYLE element
2817 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2818 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2819 * http://www.w3.org/TR/html4/types.html#type-script
2820 * http://www.w3.org/TR/html4/types.html#h-6.15
2821 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2822 *
2823 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2824 * element and the value of intrinsic event attributes. User agents must
2825 * not evaluate script data as HTML markup but instead must pass it on as
2826 * data to a script engine.
2827 * NOTES:
2828 * - The content is passed like CDATA
2829 * - the attributes for style and scripting "onXXX" are also described
2830 * as CDATA but SGML allows entities references in attributes so their
2831 * processing is identical as other attributes
2832 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002833static void
Owen Taylor3473f882001-02-23 17:55:21 +00002834htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002835 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002836 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002837 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002838
2839 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002840 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002841 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002842 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002843 /*
2844 * One should break here, the specification is clear:
2845 * Authors should therefore escape "</" within the content.
2846 * Escape mechanisms are specific to each scripting or
2847 * style sheet language.
2848 *
2849 * In recovery mode, only break if end tag match the
2850 * current tag, effectively ignoring all tags inside the
2851 * script/style block and treating the entire block as
2852 * CDATA.
2853 */
2854 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002855 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2856 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002857 {
2858 break; /* while */
2859 } else {
2860 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002861 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002862 ctxt->name, NULL);
2863 }
2864 } else {
2865 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002866 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002867 {
2868 break; /* while */
2869 }
2870 }
Owen Taylor3473f882001-02-23 17:55:21 +00002871 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002872 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002873 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2874 if (ctxt->sax->cdataBlock!= NULL) {
2875 /*
2876 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2877 */
2878 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002879 } else if (ctxt->sax->characters != NULL) {
2880 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002881 }
2882 nbchar = 0;
2883 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002884 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002885 NEXTL(l);
2886 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002887 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002888
Daniel Veillard68716a72006-10-16 09:32:17 +00002889 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Pierre Belziled4b54472010-11-04 10:18:17 +01002890 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2891 "Invalid char in CDATA 0x%X\n", cur);
2892 if (ctxt->input->cur < ctxt->input->end) {
2893 NEXT;
2894 }
Owen Taylor3473f882001-02-23 17:55:21 +00002895 }
2896
2897 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2898 if (ctxt->sax->cdataBlock!= NULL) {
2899 /*
2900 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2901 */
2902 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002903 } else if (ctxt->sax->characters != NULL) {
2904 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002905 }
2906 }
2907}
2908
2909
2910/**
2911 * htmlParseCharData:
2912 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002913 *
2914 * parse a CharData section.
2915 * if we are within a CDATA section ']]>' marks an end of section.
2916 *
2917 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2918 */
2919
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002920static void
2921htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002922 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2923 int nbchar = 0;
2924 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002925 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002926
2927 SHRINK;
2928 cur = CUR_CHAR(l);
2929 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002930 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002931 (cur != 0)) {
2932 if (!(IS_CHAR(cur))) {
2933 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2934 "Invalid char in CDATA 0x%X\n", cur);
2935 } else {
2936 COPY_BUF(l,buf,nbchar,cur);
2937 }
Owen Taylor3473f882001-02-23 17:55:21 +00002938 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2939 /*
2940 * Ok the segment is to be consumed as chars.
2941 */
2942 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2943 if (areBlanks(ctxt, buf, nbchar)) {
2944 if (ctxt->sax->ignorableWhitespace != NULL)
2945 ctxt->sax->ignorableWhitespace(ctxt->userData,
2946 buf, nbchar);
2947 } else {
2948 htmlCheckParagraph(ctxt);
2949 if (ctxt->sax->characters != NULL)
2950 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2951 }
2952 }
2953 nbchar = 0;
2954 }
2955 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002956 chunk++;
2957 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2958 chunk = 0;
2959 SHRINK;
2960 GROW;
2961 }
Owen Taylor3473f882001-02-23 17:55:21 +00002962 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002963 if (cur == 0) {
2964 SHRINK;
2965 GROW;
2966 cur = CUR_CHAR(l);
2967 }
Owen Taylor3473f882001-02-23 17:55:21 +00002968 }
2969 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002970 buf[nbchar] = 0;
2971
Owen Taylor3473f882001-02-23 17:55:21 +00002972 /*
2973 * Ok the segment is to be consumed as chars.
2974 */
2975 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2976 if (areBlanks(ctxt, buf, nbchar)) {
2977 if (ctxt->sax->ignorableWhitespace != NULL)
2978 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2979 } else {
2980 htmlCheckParagraph(ctxt);
2981 if (ctxt->sax->characters != NULL)
2982 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2983 }
2984 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002985 } else {
2986 /*
2987 * Loop detection
2988 */
2989 if (cur == 0)
2990 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002991 }
2992}
2993
2994/**
2995 * htmlParseExternalID:
2996 * @ctxt: an HTML parser context
2997 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002998 *
2999 * Parse an External ID or a Public ID
3000 *
Owen Taylor3473f882001-02-23 17:55:21 +00003001 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3002 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3003 *
3004 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3005 *
3006 * Returns the function returns SystemLiteral and in the second
3007 * case publicID receives PubidLiteral, is strict is off
3008 * it is possible to return NULL and have publicID set.
3009 */
3010
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003011static xmlChar *
3012htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00003013 xmlChar *URI = NULL;
3014
3015 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3016 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3017 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3018 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003019 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003020 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3021 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003022 }
3023 SKIP_BLANKS;
3024 URI = htmlParseSystemLiteral(ctxt);
3025 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003026 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3027 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003028 }
3029 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3030 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3031 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3032 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003033 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003034 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3035 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003036 }
3037 SKIP_BLANKS;
3038 *publicID = htmlParsePubidLiteral(ctxt);
3039 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003040 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3041 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3042 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003043 }
3044 SKIP_BLANKS;
3045 if ((CUR == '"') || (CUR == '\'')) {
3046 URI = htmlParseSystemLiteral(ctxt);
3047 }
3048 }
3049 return(URI);
3050}
3051
3052/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003053 * xmlParsePI:
3054 * @ctxt: an XML parser context
3055 *
3056 * parse an XML Processing Instruction.
3057 *
3058 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3059 */
3060static void
3061htmlParsePI(htmlParserCtxtPtr ctxt) {
3062 xmlChar *buf = NULL;
3063 int len = 0;
3064 int size = HTML_PARSER_BUFFER_SIZE;
3065 int cur, l;
3066 const xmlChar *target;
3067 xmlParserInputState state;
3068 int count = 0;
3069
3070 if ((RAW == '<') && (NXT(1) == '?')) {
3071 state = ctxt->instate;
3072 ctxt->instate = XML_PARSER_PI;
3073 /*
3074 * this is a Processing Instruction.
3075 */
3076 SKIP(2);
3077 SHRINK;
3078
3079 /*
3080 * Parse the target name and check for special support like
3081 * namespace.
3082 */
3083 target = htmlParseName(ctxt);
3084 if (target != NULL) {
3085 if (RAW == '>') {
3086 SKIP(1);
3087
3088 /*
3089 * SAX: PI detected.
3090 */
3091 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3092 (ctxt->sax->processingInstruction != NULL))
3093 ctxt->sax->processingInstruction(ctxt->userData,
3094 target, NULL);
3095 ctxt->instate = state;
3096 return;
3097 }
3098 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3099 if (buf == NULL) {
3100 htmlErrMemory(ctxt, NULL);
3101 ctxt->instate = state;
3102 return;
3103 }
3104 cur = CUR;
3105 if (!IS_BLANK(cur)) {
3106 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3107 "ParsePI: PI %s space expected\n", target, NULL);
3108 }
3109 SKIP_BLANKS;
3110 cur = CUR_CHAR(l);
3111 while (IS_CHAR(cur) && (cur != '>')) {
3112 if (len + 5 >= size) {
3113 xmlChar *tmp;
3114
3115 size *= 2;
3116 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3117 if (tmp == NULL) {
3118 htmlErrMemory(ctxt, NULL);
3119 xmlFree(buf);
3120 ctxt->instate = state;
3121 return;
3122 }
3123 buf = tmp;
3124 }
3125 count++;
3126 if (count > 50) {
3127 GROW;
3128 count = 0;
3129 }
3130 COPY_BUF(l,buf,len,cur);
3131 NEXTL(l);
3132 cur = CUR_CHAR(l);
3133 if (cur == 0) {
3134 SHRINK;
3135 GROW;
3136 cur = CUR_CHAR(l);
3137 }
3138 }
3139 buf[len] = 0;
3140 if (cur != '>') {
3141 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3142 "ParsePI: PI %s never end ...\n", target, NULL);
3143 } else {
3144 SKIP(1);
3145
3146 /*
3147 * SAX: PI detected.
3148 */
3149 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3150 (ctxt->sax->processingInstruction != NULL))
3151 ctxt->sax->processingInstruction(ctxt->userData,
3152 target, buf);
3153 }
3154 xmlFree(buf);
3155 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003156 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003157 "PI is not started correctly", NULL, NULL);
3158 }
3159 ctxt->instate = state;
3160 }
3161}
3162
3163/**
Owen Taylor3473f882001-02-23 17:55:21 +00003164 * htmlParseComment:
3165 * @ctxt: an HTML parser context
3166 *
3167 * Parse an XML (SGML) comment <!-- .... -->
3168 *
3169 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3170 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003171static void
Owen Taylor3473f882001-02-23 17:55:21 +00003172htmlParseComment(htmlParserCtxtPtr ctxt) {
3173 xmlChar *buf = NULL;
3174 int len;
3175 int size = HTML_PARSER_BUFFER_SIZE;
3176 int q, ql;
3177 int r, rl;
3178 int cur, l;
3179 xmlParserInputState state;
3180
3181 /*
3182 * Check that there is a comment right here.
3183 */
3184 if ((RAW != '<') || (NXT(1) != '!') ||
3185 (NXT(2) != '-') || (NXT(3) != '-')) return;
3186
3187 state = ctxt->instate;
3188 ctxt->instate = XML_PARSER_COMMENT;
3189 SHRINK;
3190 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003191 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003192 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003193 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003194 ctxt->instate = state;
3195 return;
3196 }
3197 q = CUR_CHAR(ql);
3198 NEXTL(ql);
3199 r = CUR_CHAR(rl);
3200 NEXTL(rl);
3201 cur = CUR_CHAR(l);
3202 len = 0;
3203 while (IS_CHAR(cur) &&
3204 ((cur != '>') ||
3205 (r != '-') || (q != '-'))) {
3206 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003207 xmlChar *tmp;
3208
Owen Taylor3473f882001-02-23 17:55:21 +00003209 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003210 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3211 if (tmp == NULL) {
3212 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003213 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003214 ctxt->instate = state;
3215 return;
3216 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003217 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003218 }
3219 COPY_BUF(ql,buf,len,q);
3220 q = r;
3221 ql = rl;
3222 r = cur;
3223 rl = l;
3224 NEXTL(l);
3225 cur = CUR_CHAR(l);
3226 if (cur == 0) {
3227 SHRINK;
3228 GROW;
3229 cur = CUR_CHAR(l);
3230 }
3231 }
3232 buf[len] = 0;
3233 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003234 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3235 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003236 xmlFree(buf);
3237 } else {
3238 NEXT;
3239 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3240 (!ctxt->disableSAX))
3241 ctxt->sax->comment(ctxt->userData, buf);
3242 xmlFree(buf);
3243 }
3244 ctxt->instate = state;
3245}
3246
3247/**
3248 * htmlParseCharRef:
3249 * @ctxt: an HTML parser context
3250 *
3251 * parse Reference declarations
3252 *
3253 * [66] CharRef ::= '&#' [0-9]+ ';' |
3254 * '&#x' [0-9a-fA-F]+ ';'
3255 *
3256 * Returns the value parsed (as an int)
3257 */
3258int
3259htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3260 int val = 0;
3261
Daniel Veillarda03e3652004-11-02 18:45:30 +00003262 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3263 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3264 "htmlParseCharRef: context error\n",
3265 NULL, NULL);
3266 return(0);
3267 }
Owen Taylor3473f882001-02-23 17:55:21 +00003268 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003269 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003270 SKIP(3);
3271 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003272 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003273 val = val * 16 + (CUR - '0');
3274 else if ((CUR >= 'a') && (CUR <= 'f'))
3275 val = val * 16 + (CUR - 'a') + 10;
3276 else if ((CUR >= 'A') && (CUR <= 'F'))
3277 val = val * 16 + (CUR - 'A') + 10;
3278 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003279 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003280 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003281 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003282 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003283 }
3284 NEXT;
3285 }
3286 if (CUR == ';')
3287 NEXT;
3288 } else if ((CUR == '&') && (NXT(1) == '#')) {
3289 SKIP(2);
3290 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003291 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003292 val = val * 10 + (CUR - '0');
3293 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003294 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003295 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003296 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003297 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003298 }
3299 NEXT;
3300 }
3301 if (CUR == ';')
3302 NEXT;
3303 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003304 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3305 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003306 }
3307 /*
3308 * Check the value IS_CHAR ...
3309 */
3310 if (IS_CHAR(val)) {
3311 return(val);
3312 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003313 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3314 "htmlParseCharRef: invalid xmlChar value %d\n",
3315 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003316 }
3317 return(0);
3318}
3319
3320
3321/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003322 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003323 * @ctxt: an HTML parser context
3324 *
3325 * parse a DOCTYPE declaration
3326 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003327 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003328 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3329 */
3330
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003331static void
Owen Taylor3473f882001-02-23 17:55:21 +00003332htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003333 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003334 xmlChar *ExternalID = NULL;
3335 xmlChar *URI = NULL;
3336
3337 /*
3338 * We know that '<!DOCTYPE' has been detected.
3339 */
3340 SKIP(9);
3341
3342 SKIP_BLANKS;
3343
3344 /*
3345 * Parse the DOCTYPE name.
3346 */
3347 name = htmlParseName(ctxt);
3348 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003349 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3350 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3351 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003352 }
3353 /*
3354 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3355 */
3356
3357 SKIP_BLANKS;
3358
3359 /*
3360 * Check for SystemID and ExternalID
3361 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003362 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003363 SKIP_BLANKS;
3364
3365 /*
3366 * We should be at the end of the DOCTYPE declaration.
3367 */
3368 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003369 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3370 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003371 /* We shouldn't try to resynchronize ... */
3372 }
3373 NEXT;
3374
3375 /*
3376 * Create or update the document accordingly to the DOCTYPE
3377 */
3378 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3379 (!ctxt->disableSAX))
3380 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3381
3382 /*
3383 * Cleanup, since we don't use all those identifiers
3384 */
3385 if (URI != NULL) xmlFree(URI);
3386 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003387}
3388
3389/**
3390 * htmlParseAttribute:
3391 * @ctxt: an HTML parser context
3392 * @value: a xmlChar ** used to store the value of the attribute
3393 *
3394 * parse an attribute
3395 *
3396 * [41] Attribute ::= Name Eq AttValue
3397 *
3398 * [25] Eq ::= S? '=' S?
3399 *
3400 * With namespace:
3401 *
3402 * [NS 11] Attribute ::= QName Eq AttValue
3403 *
3404 * Also the case QName == xmlns:??? is handled independently as a namespace
3405 * definition.
3406 *
3407 * Returns the attribute name, and the value in *value.
3408 */
3409
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003410static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003411htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003412 const xmlChar *name;
3413 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003414
3415 *value = NULL;
3416 name = htmlParseHTMLName(ctxt);
3417 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003418 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3419 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003420 return(NULL);
3421 }
3422
3423 /*
3424 * read the value
3425 */
3426 SKIP_BLANKS;
3427 if (CUR == '=') {
3428 NEXT;
3429 SKIP_BLANKS;
3430 val = htmlParseAttValue(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003431 }
3432
3433 *value = val;
3434 return(name);
3435}
3436
3437/**
Denis Pauk868d92d2012-05-10 15:34:57 +08003438 * htmlCheckEncodingDirect:
Owen Taylor3473f882001-02-23 17:55:21 +00003439 * @ctxt: an HTML parser context
3440 * @attvalue: the attribute value
3441 *
Denis Pauk868d92d2012-05-10 15:34:57 +08003442 * Checks an attribute value to detect
Owen Taylor3473f882001-02-23 17:55:21 +00003443 * the encoding
3444 * If a new encoding is detected the parser is switched to decode
3445 * it and pass UTF8
3446 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003447static void
Denis Pauk868d92d2012-05-10 15:34:57 +08003448htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
Owen Taylor3473f882001-02-23 17:55:21 +00003449
Denis Pauk868d92d2012-05-10 15:34:57 +08003450 if ((ctxt == NULL) || (encoding == NULL) ||
Daniel Veillardc62efc82011-05-16 16:03:50 +08003451 (ctxt->options & HTML_PARSE_IGNORE_ENC))
Owen Taylor3473f882001-02-23 17:55:21 +00003452 return;
3453
Daniel Veillarde77db162009-08-22 11:32:38 +02003454 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003455 if (ctxt->input->encoding != NULL)
3456 return;
3457
Owen Taylor3473f882001-02-23 17:55:21 +00003458 if (encoding != NULL) {
3459 xmlCharEncoding enc;
3460 xmlCharEncodingHandlerPtr handler;
3461
3462 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3463
3464 if (ctxt->input->encoding != NULL)
3465 xmlFree((xmlChar *) ctxt->input->encoding);
3466 ctxt->input->encoding = xmlStrdup(encoding);
3467
3468 enc = xmlParseCharEncoding((const char *) encoding);
3469 /*
3470 * registered set of known encodings
3471 */
3472 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003473 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003474 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3475 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3476 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3477 (ctxt->input->buf != NULL) &&
3478 (ctxt->input->buf->encoder == NULL)) {
3479 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3480 "htmlCheckEncoding: wrong encoding meta\n",
3481 NULL, NULL);
3482 } else {
3483 xmlSwitchEncoding(ctxt, enc);
3484 }
Owen Taylor3473f882001-02-23 17:55:21 +00003485 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3486 } else {
3487 /*
3488 * fallback for unknown encodings
3489 */
3490 handler = xmlFindCharEncodingHandler((const char *) encoding);
3491 if (handler != NULL) {
3492 xmlSwitchToEncoding(ctxt, handler);
3493 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3494 } else {
Daniel Veillardc62efc82011-05-16 16:03:50 +08003495 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3496 "htmlCheckEncoding: unknown encoding %s\n",
3497 encoding, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003498 }
3499 }
3500
3501 if ((ctxt->input->buf != NULL) &&
3502 (ctxt->input->buf->encoder != NULL) &&
3503 (ctxt->input->buf->raw != NULL) &&
3504 (ctxt->input->buf->buffer != NULL)) {
3505 int nbchars;
3506 int processed;
3507
3508 /*
3509 * convert as much as possible to the parser reading buffer.
3510 */
3511 processed = ctxt->input->cur - ctxt->input->base;
3512 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3513 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3514 ctxt->input->buf->buffer,
3515 ctxt->input->buf->raw);
3516 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003517 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3518 "htmlCheckEncoding: encoder error\n",
3519 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003520 }
3521 ctxt->input->base =
3522 ctxt->input->cur = ctxt->input->buf->buffer->content;
Eugene Pimenov1e60fbc2010-03-10 18:10:49 +01003523 ctxt->input->end =
3524 &ctxt->input->base[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00003525 }
3526 }
3527}
3528
3529/**
Denis Pauk868d92d2012-05-10 15:34:57 +08003530 * htmlCheckEncoding:
3531 * @ctxt: an HTML parser context
3532 * @attvalue: the attribute value
3533 *
3534 * Checks an http-equiv attribute from a Meta tag to detect
3535 * the encoding
3536 * If a new encoding is detected the parser is switched to decode
3537 * it and pass UTF8
3538 */
3539static void
3540htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3541 const xmlChar *encoding;
3542
3543 if (!attvalue)
3544 return;
3545
3546 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3547 if (encoding != NULL) {
3548 encoding += 7;
3549 }
3550 /*
3551 * skip blank
3552 */
3553 if (encoding && IS_BLANK_CH(*encoding))
3554 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3555 if (encoding && *encoding == '=') {
3556 encoding ++;
3557 htmlCheckEncodingDirect(ctxt, encoding);
3558 }
3559}
3560
3561/**
Owen Taylor3473f882001-02-23 17:55:21 +00003562 * htmlCheckMeta:
3563 * @ctxt: an HTML parser context
3564 * @atts: the attributes values
3565 *
3566 * Checks an attributes from a Meta tag
3567 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003568static void
Owen Taylor3473f882001-02-23 17:55:21 +00003569htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3570 int i;
3571 const xmlChar *att, *value;
3572 int http = 0;
3573 const xmlChar *content = NULL;
3574
3575 if ((ctxt == NULL) || (atts == NULL))
3576 return;
3577
3578 i = 0;
3579 att = atts[i++];
3580 while (att != NULL) {
3581 value = atts[i++];
3582 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3583 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3584 http = 1;
Denis Pauk868d92d2012-05-10 15:34:57 +08003585 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3586 htmlCheckEncodingDirect(ctxt, value);
Owen Taylor3473f882001-02-23 17:55:21 +00003587 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3588 content = value;
3589 att = atts[i++];
3590 }
3591 if ((http) && (content != NULL))
3592 htmlCheckEncoding(ctxt, content);
3593
3594}
3595
3596/**
3597 * htmlParseStartTag:
3598 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003599 *
Owen Taylor3473f882001-02-23 17:55:21 +00003600 * parse a start of tag either for rule element or
3601 * EmptyElement. In both case we don't parse the tag closing chars.
3602 *
3603 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3604 *
3605 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3606 *
3607 * With namespace:
3608 *
3609 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3610 *
3611 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3612 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003613 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003614 */
3615
Daniel Veillard597f1c12005-07-03 23:00:18 +00003616static int
Owen Taylor3473f882001-02-23 17:55:21 +00003617htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003618 const xmlChar *name;
3619 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003620 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003621 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003622 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003623 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003624 int meta = 0;
3625 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003626 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003627
Daniel Veillarde77db162009-08-22 11:32:38 +02003628 if (ctxt->instate == XML_PARSER_EOF)
3629 return(-1);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003630 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3631 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3632 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003633 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003634 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003635 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003636 NEXT;
3637
Daniel Veillard30e76072006-03-09 14:13:55 +00003638 atts = ctxt->atts;
3639 maxatts = ctxt->maxatts;
3640
Owen Taylor3473f882001-02-23 17:55:21 +00003641 GROW;
3642 name = htmlParseHTMLName(ctxt);
3643 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003644 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3645 "htmlParseStartTag: invalid element name\n",
3646 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003647 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003648 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3649 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003650 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003651 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003652 }
3653 if (xmlStrEqual(name, BAD_CAST"meta"))
3654 meta = 1;
3655
3656 /*
3657 * Check for auto-closure of HTML elements.
3658 */
3659 htmlAutoClose(ctxt, name);
3660
3661 /*
3662 * Check for implied HTML elements.
3663 */
3664 htmlCheckImplied(ctxt, name);
3665
3666 /*
3667 * Avoid html at any level > 0, head at any level != 1
3668 * or any attempt to recurse body
3669 */
3670 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003671 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3672 "htmlParseStartTag: misplaced <html> tag\n",
3673 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003674 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003675 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003676 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003677 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003678 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003679 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3680 "htmlParseStartTag: misplaced <head> tag\n",
3681 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003682 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003683 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003684 }
3685 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003686 int indx;
3687 for (indx = 0;indx < ctxt->nameNr;indx++) {
3688 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003689 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3690 "htmlParseStartTag: misplaced <body> tag\n",
3691 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003692 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003693 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003694 }
3695 }
3696 }
3697
3698 /*
3699 * Now parse the attributes, it ends up with the ending
3700 *
3701 * (S Attribute)* S?
3702 */
3703 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003704 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003705 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003706 ((CUR != '/') || (NXT(1) != '>'))) {
3707 long cons = ctxt->nbChars;
3708
3709 GROW;
3710 attname = htmlParseAttribute(ctxt, &attvalue);
3711 if (attname != NULL) {
3712
3713 /*
3714 * Well formedness requires at most one declaration of an attribute
3715 */
3716 for (i = 0; i < nbatts;i += 2) {
3717 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003718 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3719 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003720 if (attvalue != NULL)
3721 xmlFree(attvalue);
3722 goto failed;
3723 }
3724 }
3725
3726 /*
3727 * Add the pair to atts
3728 */
3729 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003730 maxatts = 22; /* allow for 10 attrs by default */
3731 atts = (const xmlChar **)
3732 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003733 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003734 htmlErrMemory(ctxt, NULL);
3735 if (attvalue != NULL)
3736 xmlFree(attvalue);
3737 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003738 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003739 ctxt->atts = atts;
3740 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003741 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003742 const xmlChar **n;
3743
Owen Taylor3473f882001-02-23 17:55:21 +00003744 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003745 n = (const xmlChar **) xmlRealloc((void *) atts,
3746 maxatts * sizeof(const xmlChar *));
3747 if (n == NULL) {
3748 htmlErrMemory(ctxt, NULL);
3749 if (attvalue != NULL)
3750 xmlFree(attvalue);
3751 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003752 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003753 atts = n;
3754 ctxt->atts = atts;
3755 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003756 }
3757 atts[nbatts++] = attname;
3758 atts[nbatts++] = attvalue;
3759 atts[nbatts] = NULL;
3760 atts[nbatts + 1] = NULL;
3761 }
3762 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003763 if (attvalue != NULL)
3764 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003765 /* Dump the bogus attribute string up to the next blank or
3766 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003767 while ((IS_CHAR_CH(CUR)) &&
3768 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003769 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003770 NEXT;
3771 }
3772
3773failed:
3774 SKIP_BLANKS;
3775 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003776 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3777 "htmlParseStartTag: problem parsing attributes\n",
3778 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003779 break;
3780 }
3781 }
3782
3783 /*
3784 * Handle specific association to the META tag
3785 */
William M. Bracke978ae22007-03-21 06:16:02 +00003786 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003787 htmlCheckMeta(ctxt, atts);
3788
3789 /*
3790 * SAX: Start of Element !
3791 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003792 if (!discardtag) {
3793 htmlnamePush(ctxt, name);
3794 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3795 if (nbatts != 0)
3796 ctxt->sax->startElement(ctxt->userData, name, atts);
3797 else
3798 ctxt->sax->startElement(ctxt->userData, name, NULL);
3799 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003800 }
Owen Taylor3473f882001-02-23 17:55:21 +00003801
3802 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003803 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003804 if (atts[i] != NULL)
3805 xmlFree((xmlChar *) atts[i]);
3806 }
Owen Taylor3473f882001-02-23 17:55:21 +00003807 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003808
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003809 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003810}
3811
3812/**
3813 * htmlParseEndTag:
3814 * @ctxt: an HTML parser context
3815 *
3816 * parse an end of tag
3817 *
3818 * [42] ETag ::= '</' Name S? '>'
3819 *
3820 * With namespace
3821 *
3822 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003823 *
3824 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003825 */
3826
Daniel Veillardf420ac52001-07-04 16:04:09 +00003827static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003828htmlParseEndTag(htmlParserCtxtPtr ctxt)
3829{
3830 const xmlChar *name;
3831 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003832 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003833
3834 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003835 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3836 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003837 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003838 }
3839 SKIP(2);
3840
3841 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003842 if (name == NULL)
3843 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003844 /*
3845 * We should definitely be at the ending "S? '>'" part
3846 */
3847 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003848 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003849 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3850 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003851 if (ctxt->recovery) {
3852 /*
3853 * We're not at the ending > !!
3854 * Error, unless in recover mode where we search forwards
3855 * until we find a >
3856 */
3857 while (CUR != '\0' && CUR != '>') NEXT;
3858 NEXT;
3859 }
Owen Taylor3473f882001-02-23 17:55:21 +00003860 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003861 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003862
3863 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003864 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3865 * out now.
3866 */
3867 if ((ctxt->depth > 0) &&
3868 (xmlStrEqual(name, BAD_CAST "html") ||
3869 xmlStrEqual(name, BAD_CAST "body") ||
3870 xmlStrEqual(name, BAD_CAST "head"))) {
3871 ctxt->depth--;
3872 return (0);
3873 }
3874
3875 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003876 * If the name read is not one of the element in the parsing stack
3877 * then return, it's just an error.
3878 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003879 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3880 if (xmlStrEqual(name, ctxt->nameTab[i]))
3881 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003882 }
3883 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003884 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3885 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003886 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003887 }
3888
3889
3890 /*
3891 * Check for auto-closure of HTML elements.
3892 */
3893
3894 htmlAutoCloseOnClose(ctxt, name);
3895
3896 /*
3897 * Well formedness constraints, opening and closing must match.
3898 * With the exception that the autoclose may have popped stuff out
3899 * of the stack.
3900 */
3901 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003902 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003903 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3904 "Opening and ending tag mismatch: %s and %s\n",
3905 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003906 }
3907 }
3908
3909 /*
3910 * SAX: End of Tag
3911 */
3912 oldname = ctxt->name;
3913 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003914 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3915 ctxt->sax->endElement(ctxt->userData, name);
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08003916 htmlNodeInfoPop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003917 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003918 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003919 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003920 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003921 }
3922
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003923 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003924}
3925
3926
3927/**
3928 * htmlParseReference:
3929 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003930 *
Owen Taylor3473f882001-02-23 17:55:21 +00003931 * parse and handle entity references in content,
3932 * this will end-up in a call to character() since this is either a
3933 * CharRef, or a predefined entity.
3934 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003935static void
Owen Taylor3473f882001-02-23 17:55:21 +00003936htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003937 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003938 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003939 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003940 if (CUR != '&') return;
3941
3942 if (NXT(1) == '#') {
3943 unsigned int c;
3944 int bits, i = 0;
3945
3946 c = htmlParseCharRef(ctxt);
3947 if (c == 0)
3948 return;
3949
3950 if (c < 0x80) { out[i++]= c; bits= -6; }
3951 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3952 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3953 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003954
Owen Taylor3473f882001-02-23 17:55:21 +00003955 for ( ; bits >= 0; bits-= 6) {
3956 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3957 }
3958 out[i] = 0;
3959
3960 htmlCheckParagraph(ctxt);
3961 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3962 ctxt->sax->characters(ctxt->userData, out, i);
3963 } else {
3964 ent = htmlParseEntityRef(ctxt, &name);
3965 if (name == NULL) {
3966 htmlCheckParagraph(ctxt);
3967 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3968 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3969 return;
3970 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003971 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003972 htmlCheckParagraph(ctxt);
3973 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3974 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3975 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3976 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3977 }
3978 } else {
3979 unsigned int c;
3980 int bits, i = 0;
3981
3982 c = ent->value;
3983 if (c < 0x80)
3984 { out[i++]= c; bits= -6; }
3985 else if (c < 0x800)
3986 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3987 else if (c < 0x10000)
3988 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003989 else
Owen Taylor3473f882001-02-23 17:55:21 +00003990 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003991
Owen Taylor3473f882001-02-23 17:55:21 +00003992 for ( ; bits >= 0; bits-= 6) {
3993 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3994 }
3995 out[i] = 0;
3996
3997 htmlCheckParagraph(ctxt);
3998 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3999 ctxt->sax->characters(ctxt->userData, out, i);
4000 }
Owen Taylor3473f882001-02-23 17:55:21 +00004001 }
4002}
4003
4004/**
4005 * htmlParseContent:
4006 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00004007 *
4008 * Parse a content: comment, sub-element, reference or text.
Eugene Pimenov615904f2010-03-15 15:16:02 +01004009 * Kept for compatibility with old code
Owen Taylor3473f882001-02-23 17:55:21 +00004010 */
4011
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004012static void
Owen Taylor3473f882001-02-23 17:55:21 +00004013htmlParseContent(htmlParserCtxtPtr ctxt) {
4014 xmlChar *currentNode;
4015 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004016 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004017
4018 currentNode = xmlStrdup(ctxt->name);
4019 depth = ctxt->nameNr;
4020 while (1) {
4021 long cons = ctxt->nbChars;
4022
4023 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02004024
4025 if (ctxt->instate == XML_PARSER_EOF)
4026 break;
4027
Owen Taylor3473f882001-02-23 17:55:21 +00004028 /*
4029 * Our tag or one of it's parent or children is ending.
4030 */
4031 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00004032 if (htmlParseEndTag(ctxt) &&
4033 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4034 if (currentNode != NULL)
4035 xmlFree(currentNode);
4036 return;
4037 }
4038 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00004039 }
4040
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004041 else if ((CUR == '<') &&
4042 ((IS_ASCII_LETTER(NXT(1))) ||
4043 (NXT(1) == '_') || (NXT(1) == ':'))) {
4044 name = htmlParseHTMLName_nonInvasive(ctxt);
4045 if (name == NULL) {
4046 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4047 "htmlParseStartTag: invalid element name\n",
4048 NULL, NULL);
4049 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02004050 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004051 NEXT;
4052
4053 if (currentNode != NULL)
4054 xmlFree(currentNode);
4055 return;
4056 }
4057
4058 if (ctxt->name != NULL) {
4059 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4060 htmlAutoClose(ctxt, name);
4061 continue;
4062 }
Daniel Veillarde77db162009-08-22 11:32:38 +02004063 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004064 }
4065
Owen Taylor3473f882001-02-23 17:55:21 +00004066 /*
4067 * Has this node been popped out during parsing of
4068 * the next element
4069 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00004070 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4071 (!xmlStrEqual(currentNode, ctxt->name)))
4072 {
Owen Taylor3473f882001-02-23 17:55:21 +00004073 if (currentNode != NULL) xmlFree(currentNode);
4074 return;
4075 }
4076
Daniel Veillardf9533d12001-03-03 10:04:57 +00004077 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4078 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004079 /*
4080 * Handle SCRIPT/STYLE separately
4081 */
4082 htmlParseScript(ctxt);
4083 } else {
4084 /*
4085 * Sometimes DOCTYPE arrives in the middle of the document
4086 */
4087 if ((CUR == '<') && (NXT(1) == '!') &&
4088 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4089 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4090 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4091 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004092 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4093 "Misplaced DOCTYPE declaration\n",
4094 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004095 htmlParseDocTypeDecl(ctxt);
4096 }
4097
4098 /*
4099 * First case : a comment
4100 */
4101 if ((CUR == '<') && (NXT(1) == '!') &&
4102 (NXT(2) == '-') && (NXT(3) == '-')) {
4103 htmlParseComment(ctxt);
4104 }
4105
4106 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004107 * Second case : a Processing Instruction.
4108 */
4109 else if ((CUR == '<') && (NXT(1) == '?')) {
4110 htmlParsePI(ctxt);
4111 }
4112
4113 /*
4114 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004115 */
4116 else if (CUR == '<') {
4117 htmlParseElement(ctxt);
4118 }
4119
4120 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004121 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004122 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004123 */
4124 else if (CUR == '&') {
4125 htmlParseReference(ctxt);
4126 }
4127
4128 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004129 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004130 */
4131 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004132 htmlAutoCloseOnEnd(ctxt);
4133 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004134 }
4135
4136 /*
4137 * Last case, text. Note that References are handled directly.
4138 */
4139 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004140 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004141 }
4142
4143 if (cons == ctxt->nbChars) {
4144 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004145 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4146 "detected an error in element content\n",
4147 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004148 }
4149 break;
4150 }
4151 }
4152 GROW;
4153 }
4154 if (currentNode != NULL) xmlFree(currentNode);
4155}
4156
4157/**
4158 * htmlParseElement:
4159 * @ctxt: an HTML parser context
4160 *
4161 * parse an HTML element, this is highly recursive
Eugene Pimenov615904f2010-03-15 15:16:02 +01004162 * this is kept for compatibility with previous code versions
Owen Taylor3473f882001-02-23 17:55:21 +00004163 *
4164 * [39] element ::= EmptyElemTag | STag content ETag
4165 *
4166 * [41] Attribute ::= Name Eq AttValue
4167 */
4168
4169void
4170htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004171 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004172 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004173 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004174 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004175 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004176 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004177 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004178
Daniel Veillarda03e3652004-11-02 18:45:30 +00004179 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4180 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004181 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004182 return;
4183 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004184
4185 if (ctxt->instate == XML_PARSER_EOF)
4186 return;
4187
Owen Taylor3473f882001-02-23 17:55:21 +00004188 /* Capture start position */
4189 if (ctxt->record_info) {
4190 node_info.begin_pos = ctxt->input->consumed +
4191 (CUR_PTR - ctxt->input->base);
4192 node_info.begin_line = ctxt->input->line;
4193 }
4194
Daniel Veillard597f1c12005-07-03 23:00:18 +00004195 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004196 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004197 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004198 if (CUR == '>')
4199 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004200 return;
4201 }
Owen Taylor3473f882001-02-23 17:55:21 +00004202
4203 /*
4204 * Lookup the info for that element.
4205 */
4206 info = htmlTagLookup(name);
4207 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004208 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4209 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004210 }
4211
4212 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004213 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004214 */
4215 if ((CUR == '/') && (NXT(1) == '>')) {
4216 SKIP(2);
4217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4218 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004219 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004220 return;
4221 }
4222
4223 if (CUR == '>') {
4224 NEXT;
4225 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004226 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4227 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004228
4229 /*
4230 * end of parsing of this node.
4231 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004232 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004233 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004234 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004235 }
Owen Taylor3473f882001-02-23 17:55:21 +00004236
4237 /*
4238 * Capture end position and add node
4239 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004240 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004241 node_info.end_pos = ctxt->input->consumed +
4242 (CUR_PTR - ctxt->input->base);
4243 node_info.end_line = ctxt->input->line;
4244 node_info.node = ctxt->node;
4245 xmlParserAddNodeInfo(ctxt, &node_info);
4246 }
4247 return;
4248 }
4249
4250 /*
4251 * Check for an Empty Element from DTD definition
4252 */
4253 if ((info != NULL) && (info->empty)) {
4254 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4255 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004256 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004257 return;
4258 }
4259
4260 /*
4261 * Parse the content of the element:
4262 */
4263 currentNode = xmlStrdup(ctxt->name);
4264 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004265 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004266 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004267 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004268 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004269 if (ctxt->nameNr < depth) break;
4270 }
Owen Taylor3473f882001-02-23 17:55:21 +00004271
Owen Taylor3473f882001-02-23 17:55:21 +00004272 /*
4273 * Capture end position and add node
4274 */
4275 if ( currentNode != NULL && ctxt->record_info ) {
4276 node_info.end_pos = ctxt->input->consumed +
4277 (CUR_PTR - ctxt->input->base);
4278 node_info.end_line = ctxt->input->line;
4279 node_info.node = ctxt->node;
4280 xmlParserAddNodeInfo(ctxt, &node_info);
4281 }
William M. Brack76e95df2003-10-18 16:20:14 +00004282 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004283 htmlAutoCloseOnEnd(ctxt);
4284 }
4285
Owen Taylor3473f882001-02-23 17:55:21 +00004286 if (currentNode != NULL)
4287 xmlFree(currentNode);
4288}
4289
Eugene Pimenov615904f2010-03-15 15:16:02 +01004290static void
4291htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4292 /*
4293 * Capture end position and add node
4294 */
4295 if ( ctxt->node != NULL && ctxt->record_info ) {
4296 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4297 (CUR_PTR - ctxt->input->base);
4298 ctxt->nodeInfo->end_line = ctxt->input->line;
4299 ctxt->nodeInfo->node = ctxt->node;
4300 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4301 htmlNodeInfoPop(ctxt);
4302 }
4303 if (!IS_CHAR_CH(CUR)) {
4304 htmlAutoCloseOnEnd(ctxt);
4305 }
4306}
4307
4308/**
4309 * htmlParseElementInternal:
4310 * @ctxt: an HTML parser context
4311 *
4312 * parse an HTML element, new version, non recursive
4313 *
4314 * [39] element ::= EmptyElemTag | STag content ETag
4315 *
4316 * [41] Attribute ::= Name Eq AttValue
4317 */
4318
4319static void
4320htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4321 const xmlChar *name;
4322 const htmlElemDesc * info;
4323 htmlParserNodeInfo node_info;
4324 int failed;
Eugene Pimenov615904f2010-03-15 15:16:02 +01004325
4326 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4327 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4328 "htmlParseElementInternal: context error\n", NULL, NULL);
4329 return;
4330 }
4331
4332 if (ctxt->instate == XML_PARSER_EOF)
4333 return;
4334
4335 /* Capture start position */
4336 if (ctxt->record_info) {
4337 node_info.begin_pos = ctxt->input->consumed +
4338 (CUR_PTR - ctxt->input->base);
4339 node_info.begin_line = ctxt->input->line;
4340 }
4341
4342 failed = htmlParseStartTag(ctxt);
4343 name = ctxt->name;
4344 if ((failed == -1) || (name == NULL)) {
4345 if (CUR == '>')
4346 NEXT;
4347 return;
4348 }
4349
4350 /*
4351 * Lookup the info for that element.
4352 */
4353 info = htmlTagLookup(name);
4354 if (info == NULL) {
4355 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4356 "Tag %s invalid\n", name, NULL);
4357 }
4358
4359 /*
4360 * Check for an Empty Element labeled the XML/SGML way
4361 */
4362 if ((CUR == '/') && (NXT(1) == '>')) {
4363 SKIP(2);
4364 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4365 ctxt->sax->endElement(ctxt->userData, name);
4366 htmlnamePop(ctxt);
4367 return;
4368 }
4369
4370 if (CUR == '>') {
4371 NEXT;
4372 } else {
4373 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4374 "Couldn't find end of Start Tag %s\n", name, NULL);
4375
4376 /*
4377 * end of parsing of this node.
4378 */
4379 if (xmlStrEqual(name, ctxt->name)) {
4380 nodePop(ctxt);
4381 htmlnamePop(ctxt);
4382 }
4383
4384 if (ctxt->record_info)
4385 htmlNodeInfoPush(ctxt, &node_info);
4386 htmlParserFinishElementParsing(ctxt);
4387 return;
4388 }
4389
4390 /*
4391 * Check for an Empty Element from DTD definition
4392 */
4393 if ((info != NULL) && (info->empty)) {
4394 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4395 ctxt->sax->endElement(ctxt->userData, name);
4396 htmlnamePop(ctxt);
4397 return;
4398 }
4399
4400 if (ctxt->record_info)
4401 htmlNodeInfoPush(ctxt, &node_info);
4402}
4403
4404/**
4405 * htmlParseContentInternal:
4406 * @ctxt: an HTML parser context
4407 *
4408 * Parse a content: comment, sub-element, reference or text.
4409 * New version for non recursive htmlParseElementInternal
4410 */
4411
4412static void
4413htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4414 xmlChar *currentNode;
4415 int depth;
4416 const xmlChar *name;
4417
4418 currentNode = xmlStrdup(ctxt->name);
4419 depth = ctxt->nameNr;
4420 while (1) {
4421 long cons = ctxt->nbChars;
4422
4423 GROW;
4424
4425 if (ctxt->instate == XML_PARSER_EOF)
4426 break;
4427
4428 /*
4429 * Our tag or one of it's parent or children is ending.
4430 */
4431 if ((CUR == '<') && (NXT(1) == '/')) {
4432 if (htmlParseEndTag(ctxt) &&
4433 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4434 if (currentNode != NULL)
4435 xmlFree(currentNode);
4436
4437 currentNode = xmlStrdup(ctxt->name);
4438 depth = ctxt->nameNr;
4439 }
4440 continue; /* while */
4441 }
4442
4443 else if ((CUR == '<') &&
4444 ((IS_ASCII_LETTER(NXT(1))) ||
4445 (NXT(1) == '_') || (NXT(1) == ':'))) {
4446 name = htmlParseHTMLName_nonInvasive(ctxt);
4447 if (name == NULL) {
4448 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4449 "htmlParseStartTag: invalid element name\n",
4450 NULL, NULL);
4451 /* Dump the bogus tag like browsers do */
4452 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4453 NEXT;
4454
4455 htmlParserFinishElementParsing(ctxt);
4456 if (currentNode != NULL)
4457 xmlFree(currentNode);
4458
4459 currentNode = xmlStrdup(ctxt->name);
4460 depth = ctxt->nameNr;
4461 continue;
4462 }
4463
4464 if (ctxt->name != NULL) {
4465 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4466 htmlAutoClose(ctxt, name);
4467 continue;
4468 }
4469 }
4470 }
4471
4472 /*
4473 * Has this node been popped out during parsing of
4474 * the next element
4475 */
4476 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4477 (!xmlStrEqual(currentNode, ctxt->name)))
4478 {
4479 htmlParserFinishElementParsing(ctxt);
4480 if (currentNode != NULL) xmlFree(currentNode);
4481
4482 currentNode = xmlStrdup(ctxt->name);
4483 depth = ctxt->nameNr;
4484 continue;
4485 }
4486
4487 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4488 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4489 /*
4490 * Handle SCRIPT/STYLE separately
4491 */
4492 htmlParseScript(ctxt);
4493 } else {
4494 /*
4495 * Sometimes DOCTYPE arrives in the middle of the document
4496 */
4497 if ((CUR == '<') && (NXT(1) == '!') &&
4498 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4499 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4500 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4501 (UPP(8) == 'E')) {
4502 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4503 "Misplaced DOCTYPE declaration\n",
4504 BAD_CAST "DOCTYPE" , NULL);
4505 htmlParseDocTypeDecl(ctxt);
4506 }
4507
4508 /*
4509 * First case : a comment
4510 */
4511 if ((CUR == '<') && (NXT(1) == '!') &&
4512 (NXT(2) == '-') && (NXT(3) == '-')) {
4513 htmlParseComment(ctxt);
4514 }
4515
4516 /*
4517 * Second case : a Processing Instruction.
4518 */
4519 else if ((CUR == '<') && (NXT(1) == '?')) {
4520 htmlParsePI(ctxt);
4521 }
4522
4523 /*
4524 * Third case : a sub-element.
4525 */
4526 else if (CUR == '<') {
4527 htmlParseElementInternal(ctxt);
4528 if (currentNode != NULL) xmlFree(currentNode);
4529
4530 currentNode = xmlStrdup(ctxt->name);
4531 depth = ctxt->nameNr;
4532 }
4533
4534 /*
4535 * Fourth case : a reference. If if has not been resolved,
4536 * parsing returns it's Name, create the node
4537 */
4538 else if (CUR == '&') {
4539 htmlParseReference(ctxt);
4540 }
4541
4542 /*
4543 * Fifth case : end of the resource
4544 */
4545 else if (CUR == 0) {
4546 htmlAutoCloseOnEnd(ctxt);
4547 break;
4548 }
4549
4550 /*
4551 * Last case, text. Note that References are handled directly.
4552 */
4553 else {
4554 htmlParseCharData(ctxt);
4555 }
4556
4557 if (cons == ctxt->nbChars) {
4558 if (ctxt->node != NULL) {
4559 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4560 "detected an error in element content\n",
4561 NULL, NULL);
4562 }
4563 break;
4564 }
4565 }
4566 GROW;
4567 }
4568 if (currentNode != NULL) xmlFree(currentNode);
4569}
4570
4571/**
4572 * htmlParseContent:
4573 * @ctxt: an HTML parser context
4574 *
4575 * Parse a content: comment, sub-element, reference or text.
4576 * This is the entry point when called from parser.c
4577 */
4578
4579void
4580__htmlParseContent(void *ctxt) {
4581 if (ctxt != NULL)
4582 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4583}
4584
Owen Taylor3473f882001-02-23 17:55:21 +00004585/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004586 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004587 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004588 *
Owen Taylor3473f882001-02-23 17:55:21 +00004589 * parse an HTML document (and build a tree if using the standard SAX
4590 * interface).
4591 *
4592 * Returns 0, -1 in case of error. the parser context is augmented
4593 * as a result of the parsing.
4594 */
4595
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004596int
Owen Taylor3473f882001-02-23 17:55:21 +00004597htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004598 xmlChar start[4];
4599 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004600 xmlDtdPtr dtd;
4601
Daniel Veillardd0463562001-10-13 09:15:48 +00004602 xmlInitParser();
4603
Owen Taylor3473f882001-02-23 17:55:21 +00004604 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004605
Daniel Veillarda03e3652004-11-02 18:45:30 +00004606 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4607 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4608 "htmlParseDocument: context error\n", NULL, NULL);
4609 return(XML_ERR_INTERNAL_ERROR);
4610 }
4611 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004612 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004613 GROW;
4614 /*
4615 * SAX: beginning of the document processing.
4616 */
4617 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4618 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4619
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004620 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4621 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4622 /*
4623 * Get the 4 first bytes and decode the charset
4624 * if enc != XML_CHAR_ENCODING_NONE
4625 * plug some encoding conversion routines.
4626 */
4627 start[0] = RAW;
4628 start[1] = NXT(1);
4629 start[2] = NXT(2);
4630 start[3] = NXT(3);
4631 enc = xmlDetectCharEncoding(&start[0], 4);
4632 if (enc != XML_CHAR_ENCODING_NONE) {
4633 xmlSwitchEncoding(ctxt, enc);
4634 }
4635 }
4636
Owen Taylor3473f882001-02-23 17:55:21 +00004637 /*
4638 * Wipe out everything which is before the first '<'
4639 */
4640 SKIP_BLANKS;
4641 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004642 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004643 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004644 }
4645
4646 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4647 ctxt->sax->startDocument(ctxt->userData);
4648
4649
4650 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004651 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004652 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004653 while (((CUR == '<') && (NXT(1) == '!') &&
4654 (NXT(2) == '-') && (NXT(3) == '-')) ||
4655 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004656 htmlParseComment(ctxt);
4657 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004658 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004659 }
Owen Taylor3473f882001-02-23 17:55:21 +00004660
4661
4662 /*
4663 * Then possibly doc type declaration(s) and more Misc
4664 * (doctypedecl Misc*)?
4665 */
4666 if ((CUR == '<') && (NXT(1) == '!') &&
4667 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4668 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4669 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4670 (UPP(8) == 'E')) {
4671 htmlParseDocTypeDecl(ctxt);
4672 }
4673 SKIP_BLANKS;
4674
4675 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004676 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004677 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004678 while (((CUR == '<') && (NXT(1) == '!') &&
4679 (NXT(2) == '-') && (NXT(3) == '-')) ||
4680 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004681 htmlParseComment(ctxt);
4682 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004683 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004684 }
Owen Taylor3473f882001-02-23 17:55:21 +00004685
4686 /*
4687 * Time to start parsing the tree itself
4688 */
Eugene Pimenov615904f2010-03-15 15:16:02 +01004689 htmlParseContentInternal(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004690
4691 /*
4692 * autoclose
4693 */
4694 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004695 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004696
4697
4698 /*
4699 * SAX: end of the document processing.
4700 */
4701 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4702 ctxt->sax->endDocument(ctxt->userData);
4703
Daniel Veillardf1121c42010-07-26 14:02:42 +02004704 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004705 dtd = xmlGetIntSubset(ctxt->myDoc);
4706 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004707 ctxt->myDoc->intSubset =
4708 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004709 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4710 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4711 }
4712 if (! ctxt->wellFormed) return(-1);
4713 return(0);
4714}
4715
4716
4717/************************************************************************
4718 * *
4719 * Parser contexts handling *
4720 * *
4721 ************************************************************************/
4722
4723/**
William M. Brackedb65a72004-02-06 07:36:04 +00004724 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004725 * @ctxt: an HTML parser context
4726 *
4727 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004728 *
4729 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004730 */
4731
Daniel Veillardf403d292003-10-05 13:51:35 +00004732static int
Owen Taylor3473f882001-02-23 17:55:21 +00004733htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4734{
4735 htmlSAXHandler *sax;
4736
Daniel Veillardf403d292003-10-05 13:51:35 +00004737 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004738 memset(ctxt, 0, sizeof(htmlParserCtxt));
4739
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004740 ctxt->dict = xmlDictCreate();
4741 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004742 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4743 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004744 }
Owen Taylor3473f882001-02-23 17:55:21 +00004745 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4746 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004747 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4748 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004749 }
4750 else
4751 memset(sax, 0, sizeof(htmlSAXHandler));
4752
4753 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004754 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004755 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4756 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004757 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004758 ctxt->inputNr = 0;
4759 ctxt->inputMax = 0;
4760 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004761 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004762 }
4763 ctxt->inputNr = 0;
4764 ctxt->inputMax = 5;
4765 ctxt->input = NULL;
4766 ctxt->version = NULL;
4767 ctxt->encoding = NULL;
4768 ctxt->standalone = -1;
4769 ctxt->instate = XML_PARSER_START;
4770
4771 /* Allocate the Node stack */
4772 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4773 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004774 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004775 ctxt->nodeNr = 0;
4776 ctxt->nodeMax = 0;
4777 ctxt->node = NULL;
4778 ctxt->inputNr = 0;
4779 ctxt->inputMax = 0;
4780 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004781 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004782 }
4783 ctxt->nodeNr = 0;
4784 ctxt->nodeMax = 10;
4785 ctxt->node = NULL;
4786
4787 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004788 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004789 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004790 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004791 ctxt->nameNr = 0;
Eugene Pimenovef9c6362010-03-15 11:37:48 +01004792 ctxt->nameMax = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004793 ctxt->name = NULL;
4794 ctxt->nodeNr = 0;
4795 ctxt->nodeMax = 0;
4796 ctxt->node = NULL;
4797 ctxt->inputNr = 0;
4798 ctxt->inputMax = 0;
4799 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004800 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004801 }
4802 ctxt->nameNr = 0;
4803 ctxt->nameMax = 10;
4804 ctxt->name = NULL;
4805
Eugene Pimenov615904f2010-03-15 15:16:02 +01004806 ctxt->nodeInfoTab = NULL;
4807 ctxt->nodeInfoNr = 0;
4808 ctxt->nodeInfoMax = 0;
4809
Daniel Veillard092643b2003-09-25 14:29:29 +00004810 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004811 else {
4812 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004813 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004814 }
4815 ctxt->userData = ctxt;
4816 ctxt->myDoc = NULL;
4817 ctxt->wellFormed = 1;
4818 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004819 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004820 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004821 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004822 ctxt->vctxt.userData = ctxt;
4823 ctxt->vctxt.error = xmlParserValidityError;
4824 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004825 ctxt->record_info = 0;
4826 ctxt->validate = 0;
4827 ctxt->nbChars = 0;
4828 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004829 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004830 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004831 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004832}
4833
/**
 * htmlFreeParserCtxt:
 * @ctxt:  an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * This is a plain forwarder: the context is handed unchanged to
 * xmlFreeParserCtxt().
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    /* all the release work is done by the generic XML routine */
    xmlFreeParserCtxt(ctxt);
}
4847
4848/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004849 * htmlNewParserCtxt:
4850 *
4851 * Allocate and initialize a new parser context.
4852 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004853 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004854 */
4855
Daniel Veillard34c647c2006-09-21 06:53:59 +00004856htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004857htmlNewParserCtxt(void)
4858{
4859 xmlParserCtxtPtr ctxt;
4860
4861 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4862 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004863 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004864 return(NULL);
4865 }
4866 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004867 if (htmlInitParserCtxt(ctxt) < 0) {
4868 htmlFreeParserCtxt(ctxt);
4869 return(NULL);
4870 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004871 return(ctxt);
4872}
4873
4874/**
4875 * htmlCreateMemoryParserCtxt:
4876 * @buffer: a pointer to a char array
4877 * @size: the size of the array
4878 *
4879 * Create a parser context for an HTML in-memory document.
4880 *
4881 * Returns the new parser context or NULL
4882 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004883htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004884htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4885 xmlParserCtxtPtr ctxt;
4886 xmlParserInputPtr input;
4887 xmlParserInputBufferPtr buf;
4888
4889 if (buffer == NULL)
4890 return(NULL);
4891 if (size <= 0)
4892 return(NULL);
4893
4894 ctxt = htmlNewParserCtxt();
4895 if (ctxt == NULL)
4896 return(NULL);
4897
4898 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4899 if (buf == NULL) return(NULL);
4900
4901 input = xmlNewInputStream(ctxt);
4902 if (input == NULL) {
4903 xmlFreeParserCtxt(ctxt);
4904 return(NULL);
4905 }
4906
4907 input->filename = NULL;
4908 input->buf = buf;
4909 input->base = input->buf->buffer->content;
4910 input->cur = input->buf->buffer->content;
4911 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4912
4913 inputPush(ctxt, input);
4914 return(ctxt);
4915}
4916
4917/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004918 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004919 * @cur: a pointer to an array of xmlChar
4920 * @encoding: a free form C string describing the HTML document encoding, or NULL
4921 *
4922 * Create a parser context for an HTML document.
4923 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004924 * TODO: check the need to add encoding handling there
4925 *
Owen Taylor3473f882001-02-23 17:55:21 +00004926 * Returns the new parser context or NULL
4927 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004928static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004929htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004930 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004931 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004932
Daniel Veillard1d995272002-07-22 16:43:32 +00004933 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004934 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004935 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004936 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004937 if (ctxt == NULL)
4938 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004939
4940 if (encoding != NULL) {
4941 xmlCharEncoding enc;
4942 xmlCharEncodingHandlerPtr handler;
4943
4944 if (ctxt->input->encoding != NULL)
4945 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004946 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004947
4948 enc = xmlParseCharEncoding(encoding);
4949 /*
4950 * registered set of known encodings
4951 */
4952 if (enc != XML_CHAR_ENCODING_ERROR) {
4953 xmlSwitchEncoding(ctxt, enc);
4954 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004955 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02004956 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00004957 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004958 }
4959 } else {
4960 /*
4961 * fallback for unknown encodings
4962 */
4963 handler = xmlFindCharEncodingHandler((const char *) encoding);
4964 if (handler != NULL) {
4965 xmlSwitchToEncoding(ctxt, handler);
4966 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004967 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4968 "Unsupported encoding %s\n",
4969 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004970 }
4971 }
4972 }
4973 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004974}
4975
Daniel Veillard73b013f2003-09-30 12:36:01 +00004976#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004977/************************************************************************
4978 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02004979 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00004980 * *
4981 ************************************************************************/
4982
/**
 * htmlParseLookupSequence:
 * @ctxt:  an HTML parser context
 * @first:  the first char to lookup
 * @next:  the next char to lookup or zero
 * @third:  the next char to lookup or zero
 * @iscomment: flag to force checking inside comments; when non-zero the
 *             scan does not track "<!--" openings, so comment content
 *             is searched like any other input
 * @ignoreattrval: when non-zero, characters enclosed between a quote
 *             (' or ") and the next identical quote are skipped
 *
 * Try to find if a sequence (first, next, third) or just (first next) or
 * (first) is available in the input stream.
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
 * to avoid rescanning sequences of bytes, it DOES change the state of the
 * parser, do not use liberally.
 * This is basically similar to xmlParseLookupSequence()
 *
 * On success ctxt->checkIndex is reset to 0. On failure it is set to the
 * position where the scan stopped, but only if the scan did not end
 * inside a comment or a quoted value.
 *
 * Returns the offset of the sequence relative to the current parsing
 *         point if the full sequence is available, -1 otherwise.
 */
static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
                        xmlChar next, xmlChar third, int iscomment,
                        int ignoreattrval)
{
    int base, len;
    htmlParserInputPtr in;
    const xmlChar *buf;
    int incomment = 0;      /* currently between "<!--" and "-->" */
    int invalue = 0;        /* currently inside a quoted value */
    char valdellim = 0x0;   /* quote character which opened the value */

    in = ctxt->input;
    if (in == NULL)
        return (-1);

    /* start from the current parsing position ... */
    base = in->cur - in->base;
    if (base < 0)
        return (-1);

    /* ... or from where a previous unsuccessful scan stopped */
    if (ctxt->checkIndex > base)
        base = ctxt->checkIndex;

    if (in->buf == NULL) {
        buf = in->base;
        len = in->length;
    } else {
        buf = in->buf->buffer->content;
        len = in->buf->buffer->use;
    }

    /* take into account the sequence length */
    if (third)
        len -= 2;
    else if (next)
        len--;
    for (; base < len; base++) {
        /* detect a comment opening, unless told to look inside comments */
        if ((!incomment) && (base + 4 < len) && (!iscomment)) {
            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
                incomment = 1;
                /* do not increment past <! - some people use <!--> */
                base += 2;
            }
        }
        if (ignoreattrval) {
            if (buf[base] == '"' || buf[base] == '\'') {
                if (invalue) {
                    /*
                     * only the quote which opened the value closes it;
                     * the other kind of quote falls through to the
                     * checks below
                     */
                    if (buf[base] == valdellim) {
                        invalue = 0;
                        continue;
                    }
                } else {
                    valdellim = buf[base];
                    invalue = 1;
                    continue;
                }
            } else if (invalue) {
                /* ordinary character inside a quoted value: skip it */
                continue;
            }
        }
        if (incomment) {
            /* not enough room left for a closing "-->" */
            if (base + 3 > len)
                return (-1);
            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
                (buf[base + 2] == '>')) {
                incomment = 0;
                base += 2;
            }
            continue;
        }
        if (buf[base] == first) {
            if (third != 0) {
                if ((buf[base + 1] != next) || (buf[base + 2] != third))
                    continue;
            } else if (next != 0) {
                if (buf[base + 1] != next)
                    continue;
            }
            /* full match: the saved scan position is no longer needed */
            ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
            if (next == 0)
                xmlGenericError(xmlGenericErrorContext,
                                "HPP: lookup '%c' found at %d\n",
                                first, base);
            else if (third == 0)
                xmlGenericError(xmlGenericErrorContext,
                                "HPP: lookup '%c%c' found at %d\n",
                                first, next, base);
            else
                xmlGenericError(xmlGenericErrorContext,
                                "HPP: lookup '%c%c%c' found at %d\n",
                                first, next, third, base);
#endif
            return (base - (in->cur - in->base));
        }
    }
    /*
     * Remember how far the scan went so the next call can resume there,
     * except when it stopped inside a comment or a quoted value.
     */
    if ((!incomment) && (!invalue))
        ctxt->checkIndex = base;
#ifdef DEBUG_PUSH
    if (next == 0)
        xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c' failed\n", first);
    else if (third == 0)
        xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c' failed\n", first, next);
    else
        xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c%c' failed\n", first, next,
                        third);
#endif
    return (-1);
}
5114
5115/**
Markus Kull56a03032009-08-24 19:00:23 +02005116 * htmlParseLookupChars:
5117 * @ctxt: an HTML parser context
5118 * @stop: Array of chars, which stop the lookup.
5119 * @stopLen: Length of stop-Array
5120 *
5121 * Try to find if any char of the stop-Array is available in the input
5122 * stream.
5123 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5124 * to avoid rescanning sequences of bytes, it DOES change the state of the
5125 * parser, do not use liberally.
5126 *
5127 * Returns the index to the current parsing point if a stopChar
5128 * is available, -1 otherwise.
5129 */
5130static int
5131htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5132 int stopLen)
5133{
5134 int base, len;
5135 htmlParserInputPtr in;
5136 const xmlChar *buf;
5137 int incomment = 0;
5138 int i;
5139
5140 in = ctxt->input;
5141 if (in == NULL)
5142 return (-1);
5143
5144 base = in->cur - in->base;
5145 if (base < 0)
5146 return (-1);
5147
5148 if (ctxt->checkIndex > base)
5149 base = ctxt->checkIndex;
5150
5151 if (in->buf == NULL) {
5152 buf = in->base;
5153 len = in->length;
5154 } else {
5155 buf = in->buf->buffer->content;
5156 len = in->buf->buffer->use;
5157 }
5158
5159 for (; base < len; base++) {
5160 if (!incomment && (base + 4 < len)) {
5161 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5162 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5163 incomment = 1;
5164 /* do not increment past <! - some people use <!--> */
5165 base += 2;
5166 }
5167 }
5168 if (incomment) {
5169 if (base + 3 > len)
5170 return (-1);
5171 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5172 (buf[base + 2] == '>')) {
5173 incomment = 0;
5174 base += 2;
5175 }
5176 continue;
5177 }
5178 for (i = 0; i < stopLen; ++i) {
5179 if (buf[base] == stop[i]) {
5180 ctxt->checkIndex = 0;
5181 return (base - (in->cur - in->base));
5182 }
5183 }
5184 }
5185 ctxt->checkIndex = base;
5186 return (-1);
5187}
5188
5189/**
Owen Taylor3473f882001-02-23 17:55:21 +00005190 * htmlParseTryOrFinish:
5191 * @ctxt: an HTML parser context
5192 * @terminate: last chunk indicator
5193 *
5194 * Try to progress on parsing
5195 *
5196 * Returns zero if no parsing was possible
5197 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005198static int
Owen Taylor3473f882001-02-23 17:55:21 +00005199htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5200 int ret = 0;
5201 htmlParserInputPtr in;
5202 int avail = 0;
5203 xmlChar cur, next;
5204
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005205 htmlParserNodeInfo node_info;
5206
Owen Taylor3473f882001-02-23 17:55:21 +00005207#ifdef DEBUG_PUSH
5208 switch (ctxt->instate) {
5209 case XML_PARSER_EOF:
5210 xmlGenericError(xmlGenericErrorContext,
5211 "HPP: try EOF\n"); break;
5212 case XML_PARSER_START:
5213 xmlGenericError(xmlGenericErrorContext,
5214 "HPP: try START\n"); break;
5215 case XML_PARSER_MISC:
5216 xmlGenericError(xmlGenericErrorContext,
5217 "HPP: try MISC\n");break;
5218 case XML_PARSER_COMMENT:
5219 xmlGenericError(xmlGenericErrorContext,
5220 "HPP: try COMMENT\n");break;
5221 case XML_PARSER_PROLOG:
5222 xmlGenericError(xmlGenericErrorContext,
5223 "HPP: try PROLOG\n");break;
5224 case XML_PARSER_START_TAG:
5225 xmlGenericError(xmlGenericErrorContext,
5226 "HPP: try START_TAG\n");break;
5227 case XML_PARSER_CONTENT:
5228 xmlGenericError(xmlGenericErrorContext,
5229 "HPP: try CONTENT\n");break;
5230 case XML_PARSER_CDATA_SECTION:
5231 xmlGenericError(xmlGenericErrorContext,
5232 "HPP: try CDATA_SECTION\n");break;
5233 case XML_PARSER_END_TAG:
5234 xmlGenericError(xmlGenericErrorContext,
5235 "HPP: try END_TAG\n");break;
5236 case XML_PARSER_ENTITY_DECL:
5237 xmlGenericError(xmlGenericErrorContext,
5238 "HPP: try ENTITY_DECL\n");break;
5239 case XML_PARSER_ENTITY_VALUE:
5240 xmlGenericError(xmlGenericErrorContext,
5241 "HPP: try ENTITY_VALUE\n");break;
5242 case XML_PARSER_ATTRIBUTE_VALUE:
5243 xmlGenericError(xmlGenericErrorContext,
5244 "HPP: try ATTRIBUTE_VALUE\n");break;
5245 case XML_PARSER_DTD:
5246 xmlGenericError(xmlGenericErrorContext,
5247 "HPP: try DTD\n");break;
5248 case XML_PARSER_EPILOG:
5249 xmlGenericError(xmlGenericErrorContext,
5250 "HPP: try EPILOG\n");break;
5251 case XML_PARSER_PI:
5252 xmlGenericError(xmlGenericErrorContext,
5253 "HPP: try PI\n");break;
5254 case XML_PARSER_SYSTEM_LITERAL:
5255 xmlGenericError(xmlGenericErrorContext,
5256 "HPP: try SYSTEM_LITERAL\n");break;
5257 }
5258#endif
5259
5260 while (1) {
5261
5262 in = ctxt->input;
5263 if (in == NULL) break;
5264 if (in->buf == NULL)
5265 avail = in->length - (in->cur - in->base);
5266 else
5267 avail = in->buf->buffer->use - (in->cur - in->base);
5268 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005269 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005270 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005271 /*
5272 * SAX: end of the document processing.
5273 */
5274 ctxt->instate = XML_PARSER_EOF;
5275 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5276 ctxt->sax->endDocument(ctxt->userData);
5277 }
5278 }
5279 if (avail < 1)
5280 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00005281 cur = in->cur[0];
5282 if (cur == 0) {
5283 SKIP(1);
5284 continue;
5285 }
5286
Owen Taylor3473f882001-02-23 17:55:21 +00005287 switch (ctxt->instate) {
5288 case XML_PARSER_EOF:
5289 /*
5290 * Document parsing is done !
5291 */
5292 goto done;
5293 case XML_PARSER_START:
5294 /*
5295 * Very first chars read from the document flow.
5296 */
5297 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005298 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005299 SKIP_BLANKS;
5300 if (in->buf == NULL)
5301 avail = in->length - (in->cur - in->base);
5302 else
5303 avail = in->buf->buffer->use - (in->cur - in->base);
5304 }
5305 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5306 ctxt->sax->setDocumentLocator(ctxt->userData,
5307 &xmlDefaultSAXLocator);
5308 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5309 (!ctxt->disableSAX))
5310 ctxt->sax->startDocument(ctxt->userData);
5311
5312 cur = in->cur[0];
5313 next = in->cur[1];
5314 if ((cur == '<') && (next == '!') &&
5315 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5316 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5317 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5318 (UPP(8) == 'E')) {
5319 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005320 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005321 goto done;
5322#ifdef DEBUG_PUSH
5323 xmlGenericError(xmlGenericErrorContext,
5324 "HPP: Parsing internal subset\n");
5325#endif
5326 htmlParseDocTypeDecl(ctxt);
5327 ctxt->instate = XML_PARSER_PROLOG;
5328#ifdef DEBUG_PUSH
5329 xmlGenericError(xmlGenericErrorContext,
5330 "HPP: entering PROLOG\n");
5331#endif
5332 } else {
5333 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005334#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00005335 xmlGenericError(xmlGenericErrorContext,
5336 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005337#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00005338 }
Owen Taylor3473f882001-02-23 17:55:21 +00005339 break;
5340 case XML_PARSER_MISC:
5341 SKIP_BLANKS;
5342 if (in->buf == NULL)
5343 avail = in->length - (in->cur - in->base);
5344 else
5345 avail = in->buf->buffer->use - (in->cur - in->base);
Denis Paukfdf990c2012-05-10 20:40:49 +08005346 /*
5347 * no chars in buffer
5348 */
5349 if (avail < 1)
Owen Taylor3473f882001-02-23 17:55:21 +00005350 goto done;
Denis Paukfdf990c2012-05-10 20:40:49 +08005351 /*
5352 * not enouth chars in buffer
5353 */
5354 if (avail < 2) {
5355 if (!terminate)
5356 goto done;
5357 else
5358 next = ' ';
5359 } else {
5360 next = in->cur[1];
5361 }
Owen Taylor3473f882001-02-23 17:55:21 +00005362 cur = in->cur[0];
Owen Taylor3473f882001-02-23 17:55:21 +00005363 if ((cur == '<') && (next == '!') &&
5364 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5365 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005366 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005367 goto done;
5368#ifdef DEBUG_PUSH
5369 xmlGenericError(xmlGenericErrorContext,
5370 "HPP: Parsing Comment\n");
5371#endif
5372 htmlParseComment(ctxt);
5373 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005374 } else if ((cur == '<') && (next == '?')) {
5375 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005376 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005377 goto done;
5378#ifdef DEBUG_PUSH
5379 xmlGenericError(xmlGenericErrorContext,
5380 "HPP: Parsing PI\n");
5381#endif
5382 htmlParsePI(ctxt);
5383 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005384 } else if ((cur == '<') && (next == '!') &&
5385 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5386 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5387 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5388 (UPP(8) == 'E')) {
5389 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005390 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005391 goto done;
5392#ifdef DEBUG_PUSH
5393 xmlGenericError(xmlGenericErrorContext,
5394 "HPP: Parsing internal subset\n");
5395#endif
5396 htmlParseDocTypeDecl(ctxt);
5397 ctxt->instate = XML_PARSER_PROLOG;
5398#ifdef DEBUG_PUSH
5399 xmlGenericError(xmlGenericErrorContext,
5400 "HPP: entering PROLOG\n");
5401#endif
5402 } else if ((cur == '<') && (next == '!') &&
5403 (avail < 9)) {
5404 goto done;
5405 } else {
5406 ctxt->instate = XML_PARSER_START_TAG;
5407#ifdef DEBUG_PUSH
5408 xmlGenericError(xmlGenericErrorContext,
5409 "HPP: entering START_TAG\n");
5410#endif
5411 }
5412 break;
5413 case XML_PARSER_PROLOG:
5414 SKIP_BLANKS;
5415 if (in->buf == NULL)
5416 avail = in->length - (in->cur - in->base);
5417 else
5418 avail = in->buf->buffer->use - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02005419 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00005420 goto done;
5421 cur = in->cur[0];
5422 next = in->cur[1];
5423 if ((cur == '<') && (next == '!') &&
5424 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5425 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005426 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005427 goto done;
5428#ifdef DEBUG_PUSH
5429 xmlGenericError(xmlGenericErrorContext,
5430 "HPP: Parsing Comment\n");
5431#endif
5432 htmlParseComment(ctxt);
5433 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005434 } else if ((cur == '<') && (next == '?')) {
5435 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005436 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005437 goto done;
5438#ifdef DEBUG_PUSH
5439 xmlGenericError(xmlGenericErrorContext,
5440 "HPP: Parsing PI\n");
5441#endif
5442 htmlParsePI(ctxt);
5443 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005444 } else if ((cur == '<') && (next == '!') &&
5445 (avail < 4)) {
5446 goto done;
5447 } else {
5448 ctxt->instate = XML_PARSER_START_TAG;
5449#ifdef DEBUG_PUSH
5450 xmlGenericError(xmlGenericErrorContext,
5451 "HPP: entering START_TAG\n");
5452#endif
5453 }
5454 break;
5455 case XML_PARSER_EPILOG:
5456 if (in->buf == NULL)
5457 avail = in->length - (in->cur - in->base);
5458 else
5459 avail = in->buf->buffer->use - (in->cur - in->base);
5460 if (avail < 1)
5461 goto done;
5462 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005463 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005464 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005465 goto done;
5466 }
5467 if (avail < 2)
5468 goto done;
5469 next = in->cur[1];
5470 if ((cur == '<') && (next == '!') &&
5471 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5472 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005473 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005474 goto done;
5475#ifdef DEBUG_PUSH
5476 xmlGenericError(xmlGenericErrorContext,
5477 "HPP: Parsing Comment\n");
5478#endif
5479 htmlParseComment(ctxt);
5480 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005481 } else if ((cur == '<') && (next == '?')) {
5482 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005483 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005484 goto done;
5485#ifdef DEBUG_PUSH
5486 xmlGenericError(xmlGenericErrorContext,
5487 "HPP: Parsing PI\n");
5488#endif
5489 htmlParsePI(ctxt);
5490 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005491 } else if ((cur == '<') && (next == '!') &&
5492 (avail < 4)) {
5493 goto done;
5494 } else {
5495 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005496 ctxt->wellFormed = 0;
5497 ctxt->instate = XML_PARSER_EOF;
5498#ifdef DEBUG_PUSH
5499 xmlGenericError(xmlGenericErrorContext,
5500 "HPP: entering EOF\n");
5501#endif
5502 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5503 ctxt->sax->endDocument(ctxt->userData);
5504 goto done;
5505 }
5506 break;
5507 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005508 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005509 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005510 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005511
Denis Paukfdf990c2012-05-10 20:40:49 +08005512 /*
5513 * no chars in buffer
5514 */
5515 if (avail < 1)
Owen Taylor3473f882001-02-23 17:55:21 +00005516 goto done;
Denis Paukfdf990c2012-05-10 20:40:49 +08005517 /*
5518 * not enough chars in buffer
5519 */
5520 if (avail < 2) {
5521 if (!terminate)
5522 goto done;
5523 else
5524 next = ' ';
5525 } else {
5526 next = in->cur[1];
5527 }
Owen Taylor3473f882001-02-23 17:55:21 +00005528 cur = in->cur[0];
5529 if (cur != '<') {
5530 ctxt->instate = XML_PARSER_CONTENT;
5531#ifdef DEBUG_PUSH
5532 xmlGenericError(xmlGenericErrorContext,
5533 "HPP: entering CONTENT\n");
5534#endif
5535 break;
5536 }
Denis Paukfdf990c2012-05-10 20:40:49 +08005537 if (next == '/') {
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005538 ctxt->instate = XML_PARSER_END_TAG;
5539 ctxt->checkIndex = 0;
5540#ifdef DEBUG_PUSH
5541 xmlGenericError(xmlGenericErrorContext,
5542 "HPP: entering END_TAG\n");
5543#endif
5544 break;
5545 }
Owen Taylor3473f882001-02-23 17:55:21 +00005546 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005547 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005548 goto done;
5549
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005550 /* Capture start position */
5551 if (ctxt->record_info) {
5552 node_info.begin_pos = ctxt->input->consumed +
5553 (CUR_PTR - ctxt->input->base);
5554 node_info.begin_line = ctxt->input->line;
5555 }
5556
5557
Daniel Veillard597f1c12005-07-03 23:00:18 +00005558 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005559 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005560 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005561 (name == NULL)) {
5562 if (CUR == '>')
5563 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005564 break;
5565 }
Owen Taylor3473f882001-02-23 17:55:21 +00005566
5567 /*
5568 * Lookup the info for that element.
5569 */
5570 info = htmlTagLookup(name);
5571 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005572 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5573 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005574 }
5575
5576 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005577 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005578 */
5579 if ((CUR == '/') && (NXT(1) == '>')) {
5580 SKIP(2);
5581 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5582 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005583 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005584 ctxt->instate = XML_PARSER_CONTENT;
5585#ifdef DEBUG_PUSH
5586 xmlGenericError(xmlGenericErrorContext,
5587 "HPP: entering CONTENT\n");
5588#endif
5589 break;
5590 }
5591
5592 if (CUR == '>') {
5593 NEXT;
5594 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005595 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5596 "Couldn't find end of Start Tag %s\n",
5597 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005598
5599 /*
5600 * end of parsing of this node.
5601 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005602 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005603 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005604 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005605 }
Owen Taylor3473f882001-02-23 17:55:21 +00005606
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005607 if (ctxt->record_info)
5608 htmlNodeInfoPush(ctxt, &node_info);
5609
Owen Taylor3473f882001-02-23 17:55:21 +00005610 ctxt->instate = XML_PARSER_CONTENT;
5611#ifdef DEBUG_PUSH
5612 xmlGenericError(xmlGenericErrorContext,
5613 "HPP: entering CONTENT\n");
5614#endif
5615 break;
5616 }
5617
5618 /*
5619 * Check for an Empty Element from DTD definition
5620 */
5621 if ((info != NULL) && (info->empty)) {
5622 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5623 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005624 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005625 }
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005626
5627 if (ctxt->record_info)
5628 htmlNodeInfoPush(ctxt, &node_info);
5629
Owen Taylor3473f882001-02-23 17:55:21 +00005630 ctxt->instate = XML_PARSER_CONTENT;
5631#ifdef DEBUG_PUSH
5632 xmlGenericError(xmlGenericErrorContext,
5633 "HPP: entering CONTENT\n");
5634#endif
5635 break;
5636 }
5637 case XML_PARSER_CONTENT: {
5638 long cons;
5639 /*
5640 * Handle preparsed entities and charRef
5641 */
5642 if (ctxt->token != 0) {
5643 xmlChar chr[2] = { 0 , 0 } ;
5644
5645 chr[0] = (xmlChar) ctxt->token;
5646 htmlCheckParagraph(ctxt);
5647 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5648 ctxt->sax->characters(ctxt->userData, chr, 1);
5649 ctxt->token = 0;
5650 ctxt->checkIndex = 0;
5651 }
5652 if ((avail == 1) && (terminate)) {
5653 cur = in->cur[0];
5654 if ((cur != '<') && (cur != '&')) {
5655 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005656 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005657 if (ctxt->sax->ignorableWhitespace != NULL)
5658 ctxt->sax->ignorableWhitespace(
5659 ctxt->userData, &cur, 1);
5660 } else {
5661 htmlCheckParagraph(ctxt);
5662 if (ctxt->sax->characters != NULL)
5663 ctxt->sax->characters(
5664 ctxt->userData, &cur, 1);
5665 }
5666 }
5667 ctxt->token = 0;
5668 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005669 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005670 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005671 }
Owen Taylor3473f882001-02-23 17:55:21 +00005672 }
5673 if (avail < 2)
5674 goto done;
5675 cur = in->cur[0];
5676 next = in->cur[1];
5677 cons = ctxt->nbChars;
5678 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5679 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5680 /*
5681 * Handle SCRIPT/STYLE separately
5682 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005683 if (!terminate) {
5684 int idx;
5685 xmlChar val;
5686
Denis Pauk91d239c2010-11-04 12:39:18 +01005687 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
Daniel Veillard68716a72006-10-16 09:32:17 +00005688 if (idx < 0)
5689 goto done;
5690 val = in->cur[idx + 2];
5691 if (val == 0) /* bad cut of input */
5692 goto done;
5693 }
Owen Taylor3473f882001-02-23 17:55:21 +00005694 htmlParseScript(ctxt);
5695 if ((cur == '<') && (next == '/')) {
5696 ctxt->instate = XML_PARSER_END_TAG;
5697 ctxt->checkIndex = 0;
5698#ifdef DEBUG_PUSH
5699 xmlGenericError(xmlGenericErrorContext,
5700 "HPP: entering END_TAG\n");
5701#endif
5702 break;
5703 }
5704 } else {
5705 /*
5706 * Sometimes DOCTYPE arrives in the middle of the document
5707 */
5708 if ((cur == '<') && (next == '!') &&
5709 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5710 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5711 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5712 (UPP(8) == 'E')) {
5713 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005714 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005715 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005716 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5717 "Misplaced DOCTYPE declaration\n",
5718 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005719 htmlParseDocTypeDecl(ctxt);
5720 } else if ((cur == '<') && (next == '!') &&
5721 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5722 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005723 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005724 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005725 goto done;
5726#ifdef DEBUG_PUSH
5727 xmlGenericError(xmlGenericErrorContext,
5728 "HPP: Parsing Comment\n");
5729#endif
5730 htmlParseComment(ctxt);
5731 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005732 } else if ((cur == '<') && (next == '?')) {
5733 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005734 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005735 goto done;
5736#ifdef DEBUG_PUSH
5737 xmlGenericError(xmlGenericErrorContext,
5738 "HPP: Parsing PI\n");
5739#endif
5740 htmlParsePI(ctxt);
5741 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005742 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5743 goto done;
5744 } else if ((cur == '<') && (next == '/')) {
5745 ctxt->instate = XML_PARSER_END_TAG;
5746 ctxt->checkIndex = 0;
5747#ifdef DEBUG_PUSH
5748 xmlGenericError(xmlGenericErrorContext,
5749 "HPP: entering END_TAG\n");
5750#endif
5751 break;
5752 } else if (cur == '<') {
5753 ctxt->instate = XML_PARSER_START_TAG;
5754 ctxt->checkIndex = 0;
5755#ifdef DEBUG_PUSH
5756 xmlGenericError(xmlGenericErrorContext,
5757 "HPP: entering START_TAG\n");
5758#endif
5759 break;
5760 } else if (cur == '&') {
5761 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005762 (htmlParseLookupChars(ctxt,
5763 BAD_CAST "; >/", 4) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005764 goto done;
5765#ifdef DEBUG_PUSH
5766 xmlGenericError(xmlGenericErrorContext,
5767 "HPP: Parsing Reference\n");
5768#endif
5769 /* TODO: check generation of subtrees if noent !!! */
5770 htmlParseReference(ctxt);
5771 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005772 /*
5773 * check that the text sequence is complete
5774 * before handing out the data to the parser
5775 * to avoid problems with erroneous end of
5776 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005777 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005778 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005779 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005780 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005781 ctxt->checkIndex = 0;
5782#ifdef DEBUG_PUSH
5783 xmlGenericError(xmlGenericErrorContext,
5784 "HPP: Parsing char data\n");
5785#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005786 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005787 }
5788 }
5789 if (cons == ctxt->nbChars) {
5790 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005791 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5792 "detected an error in element content\n",
5793 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005794 }
5795 NEXT;
5796 break;
5797 }
5798
5799 break;
5800 }
5801 case XML_PARSER_END_TAG:
5802 if (avail < 2)
5803 goto done;
5804 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005805 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005806 goto done;
5807 htmlParseEndTag(ctxt);
5808 if (ctxt->nameNr == 0) {
5809 ctxt->instate = XML_PARSER_EPILOG;
5810 } else {
5811 ctxt->instate = XML_PARSER_CONTENT;
5812 }
5813 ctxt->checkIndex = 0;
5814#ifdef DEBUG_PUSH
5815 xmlGenericError(xmlGenericErrorContext,
5816 "HPP: entering CONTENT\n");
5817#endif
5818 break;
5819 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005820 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5821 "HPP: internal error, state == CDATA\n",
5822 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005823 ctxt->instate = XML_PARSER_CONTENT;
5824 ctxt->checkIndex = 0;
5825#ifdef DEBUG_PUSH
5826 xmlGenericError(xmlGenericErrorContext,
5827 "HPP: entering CONTENT\n");
5828#endif
5829 break;
5830 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005831 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5832 "HPP: internal error, state == DTD\n",
5833 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005834 ctxt->instate = XML_PARSER_CONTENT;
5835 ctxt->checkIndex = 0;
5836#ifdef DEBUG_PUSH
5837 xmlGenericError(xmlGenericErrorContext,
5838 "HPP: entering CONTENT\n");
5839#endif
5840 break;
5841 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005842 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5843 "HPP: internal error, state == COMMENT\n",
5844 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005845 ctxt->instate = XML_PARSER_CONTENT;
5846 ctxt->checkIndex = 0;
5847#ifdef DEBUG_PUSH
5848 xmlGenericError(xmlGenericErrorContext,
5849 "HPP: entering CONTENT\n");
5850#endif
5851 break;
5852 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005853 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5854 "HPP: internal error, state == PI\n",
5855 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005856 ctxt->instate = XML_PARSER_CONTENT;
5857 ctxt->checkIndex = 0;
5858#ifdef DEBUG_PUSH
5859 xmlGenericError(xmlGenericErrorContext,
5860 "HPP: entering CONTENT\n");
5861#endif
5862 break;
5863 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005864 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5865 "HPP: internal error, state == ENTITY_DECL\n",
5866 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005867 ctxt->instate = XML_PARSER_CONTENT;
5868 ctxt->checkIndex = 0;
5869#ifdef DEBUG_PUSH
5870 xmlGenericError(xmlGenericErrorContext,
5871 "HPP: entering CONTENT\n");
5872#endif
5873 break;
5874 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005875 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5876 "HPP: internal error, state == ENTITY_VALUE\n",
5877 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005878 ctxt->instate = XML_PARSER_CONTENT;
5879 ctxt->checkIndex = 0;
5880#ifdef DEBUG_PUSH
5881 xmlGenericError(xmlGenericErrorContext,
5882 "HPP: entering DTD\n");
5883#endif
5884 break;
5885 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005886 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5887 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5888 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005889 ctxt->instate = XML_PARSER_START_TAG;
5890 ctxt->checkIndex = 0;
5891#ifdef DEBUG_PUSH
5892 xmlGenericError(xmlGenericErrorContext,
5893 "HPP: entering START_TAG\n");
5894#endif
5895 break;
5896 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005897 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5898 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5899 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005900 ctxt->instate = XML_PARSER_CONTENT;
5901 ctxt->checkIndex = 0;
5902#ifdef DEBUG_PUSH
5903 xmlGenericError(xmlGenericErrorContext,
5904 "HPP: entering CONTENT\n");
5905#endif
5906 break;
5907 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005908 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5909 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5910 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005911 ctxt->instate = XML_PARSER_CONTENT;
5912 ctxt->checkIndex = 0;
5913#ifdef DEBUG_PUSH
5914 xmlGenericError(xmlGenericErrorContext,
5915 "HPP: entering CONTENT\n");
5916#endif
5917 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005918 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005919 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5920 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5921 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005922 ctxt->instate = XML_PARSER_CONTENT;
5923 ctxt->checkIndex = 0;
5924#ifdef DEBUG_PUSH
5925 xmlGenericError(xmlGenericErrorContext,
5926 "HPP: entering CONTENT\n");
5927#endif
5928 break;
5929
Owen Taylor3473f882001-02-23 17:55:21 +00005930 }
5931 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005932done:
Owen Taylor3473f882001-02-23 17:55:21 +00005933 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005934 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005935 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005936 /*
5937 * SAX: end of the document processing.
5938 */
5939 ctxt->instate = XML_PARSER_EOF;
5940 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5941 ctxt->sax->endDocument(ctxt->userData);
5942 }
5943 }
5944 if ((ctxt->myDoc != NULL) &&
5945 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5946 (ctxt->instate == XML_PARSER_EPILOG))) {
5947 xmlDtdPtr dtd;
5948 dtd = xmlGetIntSubset(ctxt->myDoc);
5949 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02005950 ctxt->myDoc->intSubset =
5951 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005952 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5953 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5954 }
5955#ifdef DEBUG_PUSH
5956 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5957#endif
5958 return(ret);
5959}
5960
5961/**
Owen Taylor3473f882001-02-23 17:55:21 +00005962 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005963 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005964 * @chunk: an char array
5965 * @size: the size in byte of the chunk
5966 * @terminate: last chunk indicator
5967 *
5968 * Parse a Chunk of memory
5969 *
5970 * Returns zero if no error, the xmlParserErrors otherwise.
5971 */
5972int
5973htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5974 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005975 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5976 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5977 "htmlParseChunk: context error\n", NULL, NULL);
5978 return(XML_ERR_INTERNAL_ERROR);
5979 }
Owen Taylor3473f882001-02-23 17:55:21 +00005980 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5981 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5982 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5983 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005984 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02005985
5986 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00005987 if (res < 0) {
5988 ctxt->errNo = XML_PARSER_EOF;
5989 ctxt->disableSAX = 1;
5990 return (XML_PARSER_EOF);
5991 }
Owen Taylor3473f882001-02-23 17:55:21 +00005992 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5993 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005994 ctxt->input->end =
5995 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005996#ifdef DEBUG_PUSH
5997 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5998#endif
5999
Daniel Veillard14f752c2003-08-09 11:44:50 +00006000#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00006001 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6002 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006003#endif
Owen Taylor3473f882001-02-23 17:55:21 +00006004 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00006005 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6006 xmlParserInputBufferPtr in = ctxt->input->buf;
6007 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6008 (in->raw != NULL)) {
6009 int nbchars;
Daniel Veillarde77db162009-08-22 11:32:38 +02006010
Daniel Veillard14f752c2003-08-09 11:44:50 +00006011 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
6012 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00006013 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6014 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006015 return(XML_ERR_INVALID_ENCODING);
6016 }
6017 }
6018 }
Owen Taylor3473f882001-02-23 17:55:21 +00006019 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00006020 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00006021 if (terminate) {
6022 if ((ctxt->instate != XML_PARSER_EOF) &&
6023 (ctxt->instate != XML_PARSER_EPILOG) &&
6024 (ctxt->instate != XML_PARSER_MISC)) {
6025 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00006026 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02006027 }
Owen Taylor3473f882001-02-23 17:55:21 +00006028 if (ctxt->instate != XML_PARSER_EOF) {
6029 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6030 ctxt->sax->endDocument(ctxt->userData);
6031 }
6032 ctxt->instate = XML_PARSER_EOF;
6033 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006034 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00006035}
6036
6037/************************************************************************
6038 * *
6039 * User entry points *
6040 * *
6041 ************************************************************************/
6042
6043/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006044 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006045 * @sax: a SAX handler
6046 * @user_data: The user data returned on SAX callbacks
6047 * @chunk: a pointer to an array of chars
6048 * @size: number of chars in the array
6049 * @filename: an optional file name or URI
6050 * @enc: an optional encoding
6051 *
6052 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00006053 * The value of @filename is used for fetching external entities
6054 * and error/warning reports.
6055 *
6056 * Returns the new parser context or NULL
6057 */
6058htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006059htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00006060 const char *chunk, int size, const char *filename,
6061 xmlCharEncoding enc) {
6062 htmlParserCtxtPtr ctxt;
6063 htmlParserInputPtr inputStream;
6064 xmlParserInputBufferPtr buf;
6065
Daniel Veillardd0463562001-10-13 09:15:48 +00006066 xmlInitParser();
6067
Owen Taylor3473f882001-02-23 17:55:21 +00006068 buf = xmlAllocParserInputBuffer(enc);
6069 if (buf == NULL) return(NULL);
6070
Daniel Veillardf403d292003-10-05 13:51:35 +00006071 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006072 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00006073 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006074 return(NULL);
6075 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00006076 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6077 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00006078 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00006079 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00006080 xmlFree(ctxt->sax);
6081 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6082 if (ctxt->sax == NULL) {
6083 xmlFree(buf);
6084 xmlFree(ctxt);
6085 return(NULL);
6086 }
6087 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6088 if (user_data != NULL)
6089 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02006090 }
Owen Taylor3473f882001-02-23 17:55:21 +00006091 if (filename == NULL) {
6092 ctxt->directory = NULL;
6093 } else {
6094 ctxt->directory = xmlParserGetDirectory(filename);
6095 }
6096
6097 inputStream = htmlNewInputStream(ctxt);
6098 if (inputStream == NULL) {
6099 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00006100 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006101 return(NULL);
6102 }
6103
6104 if (filename == NULL)
6105 inputStream->filename = NULL;
6106 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00006107 inputStream->filename = (char *)
6108 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00006109 inputStream->buf = buf;
6110 inputStream->base = inputStream->buf->buffer->content;
6111 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillarde77db162009-08-22 11:32:38 +02006112 inputStream->end =
Daniel Veillard5f704af2003-03-05 10:01:43 +00006113 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00006114
6115 inputPush(ctxt, inputStream);
6116
6117 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02006118 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00006119 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
6120 int cur = ctxt->input->cur - ctxt->input->base;
6121
Daniel Veillarde77db162009-08-22 11:32:38 +02006122 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00006123
6124 ctxt->input->base = ctxt->input->buf->buffer->content + base;
6125 ctxt->input->cur = ctxt->input->base + cur;
6126 ctxt->input->end =
6127 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00006128#ifdef DEBUG_PUSH
6129 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6130#endif
6131 }
Daniel Veillard68716a72006-10-16 09:32:17 +00006132 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00006133
6134 return(ctxt);
6135}
William M. Brack21e4ef22005-01-02 09:53:13 +00006136#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00006137
6138/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006139 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006140 * @cur: a pointer to an array of xmlChar
6141 * @encoding: a free form C string describing the HTML document encoding, or NULL
6142 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006143 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006144 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006145 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6146 * to handle parse events. If sax is NULL, fallback to the default DOM
6147 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006148 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006149 * Returns the resulting document tree unless SAX is NULL or the document is
6150 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006151 */
6152
6153htmlDocPtr
6154htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6155 htmlDocPtr ret;
6156 htmlParserCtxtPtr ctxt;
6157
Daniel Veillardd0463562001-10-13 09:15:48 +00006158 xmlInitParser();
6159
Owen Taylor3473f882001-02-23 17:55:21 +00006160 if (cur == NULL) return(NULL);
6161
6162
6163 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6164 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02006165 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00006166 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00006167 ctxt->sax = sax;
6168 ctxt->userData = userData;
6169 }
6170
6171 htmlParseDocument(ctxt);
6172 ret = ctxt->myDoc;
6173 if (sax != NULL) {
6174 ctxt->sax = NULL;
6175 ctxt->userData = NULL;
6176 }
6177 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006178
Owen Taylor3473f882001-02-23 17:55:21 +00006179 return(ret);
6180}
6181
6182/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006183 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006184 * @cur: a pointer to an array of xmlChar
6185 * @encoding: a free form C string describing the HTML document encoding, or NULL
6186 *
6187 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006188 *
Owen Taylor3473f882001-02-23 17:55:21 +00006189 * Returns the resulting document tree
6190 */
6191
6192htmlDocPtr
6193htmlParseDoc(xmlChar *cur, const char *encoding) {
6194 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6195}
6196
6197
6198/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006199 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006200 * @filename: the filename
6201 * @encoding: a free form C string describing the HTML document encoding, or NULL
6202 *
Daniel Veillarde77db162009-08-22 11:32:38 +02006203 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00006204 * Automatic support for ZLIB/Compress compressed document is provided
6205 * by default if found at compile-time.
6206 *
6207 * Returns the new parser context or NULL
6208 */
6209htmlParserCtxtPtr
6210htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6211{
6212 htmlParserCtxtPtr ctxt;
6213 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006214 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00006215 /* htmlCharEncoding enc; */
6216 xmlChar *content, *content_line = (xmlChar *) "charset=";
6217
Daniel Veillarda03e3652004-11-02 18:45:30 +00006218 if (filename == NULL)
6219 return(NULL);
6220
Daniel Veillardf403d292003-10-05 13:51:35 +00006221 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006222 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00006223 return(NULL);
6224 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006225 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6226 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00006227#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006228 if (xmlDefaultSAXHandler.error != NULL) {
6229 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6230 }
Daniel Veillard87247e82004-01-13 20:42:02 +00006231#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00006232 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00006233 return(NULL);
6234 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006235
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006236 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6237 xmlFree(canonicFilename);
6238 if (inputStream == NULL) {
6239 xmlFreeParserCtxt(ctxt);
6240 return(NULL);
6241 }
Owen Taylor3473f882001-02-23 17:55:21 +00006242
6243 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006244
Owen Taylor3473f882001-02-23 17:55:21 +00006245 /* set encoding */
6246 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00006247 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Daniel Veillarde77db162009-08-22 11:32:38 +02006248 if (content) {
Owen Taylor3473f882001-02-23 17:55:21 +00006249 strcpy ((char *)content, (char *)content_line);
6250 strcat ((char *)content, (char *)encoding);
6251 htmlCheckEncoding (ctxt, content);
6252 xmlFree (content);
6253 }
6254 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006255
Owen Taylor3473f882001-02-23 17:55:21 +00006256 return(ctxt);
6257}
6258
6259/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006260 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006261 * @filename: the filename
6262 * @encoding: a free form C string describing the HTML document encoding, or NULL
6263 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006264 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006265 *
6266 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6267 * compressed document is provided by default if found at compile-time.
6268 * It use the given SAX function block to handle the parsing callback.
6269 * If sax is NULL, fallback to the default DOM tree building routines.
6270 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006271 * Returns the resulting document tree unless SAX is NULL or the document is
6272 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006273 */
6274
6275htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006276htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00006277 void *userData) {
6278 htmlDocPtr ret;
6279 htmlParserCtxtPtr ctxt;
6280 htmlSAXHandlerPtr oldsax = NULL;
6281
Daniel Veillardd0463562001-10-13 09:15:48 +00006282 xmlInitParser();
6283
Owen Taylor3473f882001-02-23 17:55:21 +00006284 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6285 if (ctxt == NULL) return(NULL);
6286 if (sax != NULL) {
6287 oldsax = ctxt->sax;
6288 ctxt->sax = sax;
6289 ctxt->userData = userData;
6290 }
6291
6292 htmlParseDocument(ctxt);
6293
6294 ret = ctxt->myDoc;
6295 if (sax != NULL) {
6296 ctxt->sax = oldsax;
6297 ctxt->userData = NULL;
6298 }
6299 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006300
Owen Taylor3473f882001-02-23 17:55:21 +00006301 return(ret);
6302}
6303
6304/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006305 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006306 * @filename: the filename
6307 * @encoding: a free form C string describing the HTML document encoding, or NULL
6308 *
6309 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6310 * compressed document is provided by default if found at compile-time.
6311 *
6312 * Returns the resulting document tree
6313 */
6314
6315htmlDocPtr
6316htmlParseFile(const char *filename, const char *encoding) {
6317 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6318}
6319
6320/**
6321 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02006322 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00006323 *
6324 * Set and return the previous value for handling HTML omitted tags.
6325 *
6326 * Returns the last value for 0 for no handling, 1 for auto insertion.
6327 */
6328
6329int
6330htmlHandleOmittedElem(int val) {
6331 int old = htmlOmittedDefaultValue;
6332
6333 htmlOmittedDefaultValue = val;
6334 return(old);
6335}
6336
Daniel Veillard930dfb62003-02-05 10:17:38 +00006337/**
6338 * htmlElementAllowedHere:
6339 * @parent: HTML parent element
6340 * @elt: HTML element
6341 *
6342 * Checks whether an HTML element may be a direct child of a parent element.
6343 * Note - doesn't check for deprecated elements
6344 *
6345 * Returns 1 if allowed; 0 otherwise.
6346 */
6347int
6348htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6349 const char** p ;
6350
6351 if ( ! elt || ! parent || ! parent->subelts )
6352 return 0 ;
6353
6354 for ( p = parent->subelts; *p; ++p )
6355 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6356 return 1 ;
6357
6358 return 0 ;
6359}
6360/**
6361 * htmlElementStatusHere:
6362 * @parent: HTML parent element
6363 * @elt: HTML element
6364 *
6365 * Checks whether an HTML element may be a direct child of a parent element.
6366 * and if so whether it is valid or deprecated.
6367 *
6368 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6369 */
6370htmlStatus
6371htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6372 if ( ! parent || ! elt )
6373 return HTML_INVALID ;
6374 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6375 return HTML_INVALID ;
6376
6377 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6378}
6379/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006380 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00006381 * @elt: HTML element
6382 * @attr: HTML attribute
6383 * @legacy: whether to allow deprecated attributes
6384 *
6385 * Checks whether an attribute is valid for an element
6386 * Has full knowledge of Required and Deprecated attributes
6387 *
6388 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6389 */
6390htmlStatus
6391htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6392 const char** p ;
6393
6394 if ( !elt || ! attr )
6395 return HTML_INVALID ;
6396
6397 if ( elt->attrs_req )
6398 for ( p = elt->attrs_req; *p; ++p)
6399 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6400 return HTML_REQUIRED ;
6401
6402 if ( elt->attrs_opt )
6403 for ( p = elt->attrs_opt; *p; ++p)
6404 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6405 return HTML_VALID ;
6406
6407 if ( legacy && elt->attrs_depr )
6408 for ( p = elt->attrs_depr; *p; ++p)
6409 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6410 return HTML_DEPRECATED ;
6411
6412 return HTML_INVALID ;
6413}
6414/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006415 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00006416 * @node: an htmlNodePtr in a tree
6417 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00006418 * for Element nodes)
6419 *
6420 * Checks whether the tree node is valid. Experimental (the author
6421 * only uses the HTML enhancements in a SAX parser)
6422 *
6423 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6424 * legacy allowed) or htmlElementStatusHere (otherwise).
6425 * for Attribute nodes, a return from htmlAttrAllowed
6426 * for other nodes, HTML_NA (no checks performed)
6427 */
6428htmlStatus
6429htmlNodeStatus(const htmlNodePtr node, int legacy) {
6430 if ( ! node )
6431 return HTML_INVALID ;
6432
6433 switch ( node->type ) {
6434 case XML_ELEMENT_NODE:
6435 return legacy
6436 ? ( htmlElementAllowedHere (
6437 htmlTagLookup(node->parent->name) , node->name
6438 ) ? HTML_VALID : HTML_INVALID )
6439 : htmlElementStatusHere(
6440 htmlTagLookup(node->parent->name) ,
6441 htmlTagLookup(node->name) )
6442 ;
6443 case XML_ATTRIBUTE_NODE:
6444 return htmlAttrAllowed(
6445 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6446 default: return HTML_NA ;
6447 }
6448}
Daniel Veillard9475a352003-09-26 12:47:50 +00006449/************************************************************************
6450 * *
6451 * New set (2.6.0) of simpler and more flexible APIs *
6452 * *
6453 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope (a variable named "dict" must be visible at the point
 * of use).  A NULL @str is ignored.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement and stays safe inside unbraced if/else; callers supply
 * the terminating semicolon.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
	        (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	        xmlFree((char *)(str));				\
	} while (0)
6465
6466/**
6467 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00006468 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00006469 *
6470 * Reset a parser context
6471 */
6472void
6473htmlCtxtReset(htmlParserCtxtPtr ctxt)
6474{
6475 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00006476 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02006477
Daniel Veillarda03e3652004-11-02 18:45:30 +00006478 if (ctxt == NULL)
6479 return;
6480
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006481 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00006482 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00006483
6484 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6485 xmlFreeInputStream(input);
6486 }
6487 ctxt->inputNr = 0;
6488 ctxt->input = NULL;
6489
6490 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00006491 if (ctxt->spaceTab != NULL) {
6492 ctxt->spaceTab[0] = -1;
6493 ctxt->space = &ctxt->spaceTab[0];
6494 } else {
6495 ctxt->space = NULL;
6496 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006497
6498
6499 ctxt->nodeNr = 0;
6500 ctxt->node = NULL;
6501
6502 ctxt->nameNr = 0;
6503 ctxt->name = NULL;
6504
6505 DICT_FREE(ctxt->version);
6506 ctxt->version = NULL;
6507 DICT_FREE(ctxt->encoding);
6508 ctxt->encoding = NULL;
6509 DICT_FREE(ctxt->directory);
6510 ctxt->directory = NULL;
6511 DICT_FREE(ctxt->extSubURI);
6512 ctxt->extSubURI = NULL;
6513 DICT_FREE(ctxt->extSubSystem);
6514 ctxt->extSubSystem = NULL;
6515 if (ctxt->myDoc != NULL)
6516 xmlFreeDoc(ctxt->myDoc);
6517 ctxt->myDoc = NULL;
6518
6519 ctxt->standalone = -1;
6520 ctxt->hasExternalSubset = 0;
6521 ctxt->hasPErefs = 0;
6522 ctxt->html = 1;
6523 ctxt->external = 0;
6524 ctxt->instate = XML_PARSER_START;
6525 ctxt->token = 0;
6526
6527 ctxt->wellFormed = 1;
6528 ctxt->nsWellFormed = 1;
Daniel Veillard8ad29302010-10-28 11:51:22 +02006529 ctxt->disableSAX = 0;
Daniel Veillard9475a352003-09-26 12:47:50 +00006530 ctxt->valid = 1;
6531 ctxt->vctxt.userData = ctxt;
6532 ctxt->vctxt.error = xmlParserValidityError;
6533 ctxt->vctxt.warning = xmlParserValidityWarning;
6534 ctxt->record_info = 0;
6535 ctxt->nbChars = 0;
6536 ctxt->checkIndex = 0;
6537 ctxt->inSubset = 0;
6538 ctxt->errNo = XML_ERR_OK;
6539 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006540 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006541 ctxt->catalogs = NULL;
6542 xmlInitNodeInfoSeq(&ctxt->node_seq);
6543
6544 if (ctxt->attsDefault != NULL) {
6545 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6546 ctxt->attsDefault = NULL;
6547 }
6548 if (ctxt->attsSpecial != NULL) {
6549 xmlHashFree(ctxt->attsSpecial, NULL);
6550 ctxt->attsSpecial = NULL;
6551 }
6552}
6553
6554/**
6555 * htmlCtxtUseOptions:
6556 * @ctxt: an HTML parser context
6557 * @options: a combination of htmlParserOption(s)
6558 *
6559 * Applies the options to the parser context
6560 *
6561 * Returns 0 in case of success, the set of unknown or unimplemented options
6562 * in case of error.
6563 */
6564int
6565htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6566{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006567 if (ctxt == NULL)
6568 return(-1);
6569
Daniel Veillard9475a352003-09-26 12:47:50 +00006570 if (options & HTML_PARSE_NOWARNING) {
6571 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006572 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006573 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006574 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006575 }
6576 if (options & HTML_PARSE_NOERROR) {
6577 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006578 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006579 ctxt->sax->fatalError = NULL;
6580 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006581 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006582 }
6583 if (options & HTML_PARSE_PEDANTIC) {
6584 ctxt->pedantic = 1;
6585 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006586 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006587 } else
6588 ctxt->pedantic = 0;
6589 if (options & XML_PARSE_NOBLANKS) {
6590 ctxt->keepBlanks = 0;
6591 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6592 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006593 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006594 } else
6595 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006596 if (options & HTML_PARSE_RECOVER) {
6597 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006598 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006599 } else
6600 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006601 if (options & HTML_PARSE_COMPACT) {
6602 ctxt->options |= HTML_PARSE_COMPACT;
6603 options -= HTML_PARSE_COMPACT;
6604 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006605 if (options & XML_PARSE_HUGE) {
6606 ctxt->options |= XML_PARSE_HUGE;
6607 options -= XML_PARSE_HUGE;
6608 }
Daniel Veillardf1121c42010-07-26 14:02:42 +02006609 if (options & HTML_PARSE_NODEFDTD) {
6610 ctxt->options |= HTML_PARSE_NODEFDTD;
6611 options -= HTML_PARSE_NODEFDTD;
6612 }
Daniel Veillardc62efc82011-05-16 16:03:50 +08006613 if (options & HTML_PARSE_IGNORE_ENC) {
6614 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6615 options -= HTML_PARSE_IGNORE_ENC;
6616 }
Martin Schröderb91111b2012-05-10 18:52:37 +08006617 if (options & HTML_PARSE_NOIMPLIED) {
6618 ctxt->options |= HTML_PARSE_NOIMPLIED;
6619 options -= HTML_PARSE_NOIMPLIED;
6620 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006621 ctxt->dictNames = 0;
6622 return (options);
6623}
6624
6625/**
6626 * htmlDoRead:
6627 * @ctxt: an HTML parser context
6628 * @URL: the base URL to use for the document
6629 * @encoding: the document encoding, or NULL
6630 * @options: a combination of htmlParserOption(s)
6631 * @reuse: keep the context for reuse
6632 *
6633 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006634 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006635 * Returns the resulting document tree or NULL
6636 */
6637static htmlDocPtr
6638htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6639 int options, int reuse)
6640{
6641 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006642
Daniel Veillard9475a352003-09-26 12:47:50 +00006643 htmlCtxtUseOptions(ctxt, options);
6644 ctxt->html = 1;
6645 if (encoding != NULL) {
6646 xmlCharEncodingHandlerPtr hdlr;
6647
6648 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006649 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006650 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006651 if (ctxt->input->encoding != NULL)
6652 xmlFree((xmlChar *) ctxt->input->encoding);
6653 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6654 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006655 }
6656 if ((URL != NULL) && (ctxt->input != NULL) &&
6657 (ctxt->input->filename == NULL))
6658 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6659 htmlParseDocument(ctxt);
6660 ret = ctxt->myDoc;
6661 ctxt->myDoc = NULL;
6662 if (!reuse) {
6663 if ((ctxt->dictNames) &&
6664 (ret != NULL) &&
6665 (ret->dict == ctxt->dict))
6666 ctxt->dict = NULL;
6667 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006668 }
6669 return (ret);
6670}
6671
6672/**
6673 * htmlReadDoc:
6674 * @cur: a pointer to a zero terminated string
6675 * @URL: the base URL to use for the document
6676 * @encoding: the document encoding, or NULL
6677 * @options: a combination of htmlParserOption(s)
6678 *
6679 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006680 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006681 * Returns the resulting document tree
6682 */
6683htmlDocPtr
6684htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6685{
6686 htmlParserCtxtPtr ctxt;
6687
6688 if (cur == NULL)
6689 return (NULL);
6690
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006691 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006692 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006693 if (ctxt == NULL)
6694 return (NULL);
6695 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6696}
6697
6698/**
6699 * htmlReadFile:
6700 * @filename: a file or URL
6701 * @encoding: the document encoding, or NULL
6702 * @options: a combination of htmlParserOption(s)
6703 *
6704 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006705 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006706 * Returns the resulting document tree
6707 */
6708htmlDocPtr
6709htmlReadFile(const char *filename, const char *encoding, int options)
6710{
6711 htmlParserCtxtPtr ctxt;
6712
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006713 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006714 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6715 if (ctxt == NULL)
6716 return (NULL);
6717 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6718}
6719
6720/**
6721 * htmlReadMemory:
6722 * @buffer: a pointer to a char array
6723 * @size: the size of the array
6724 * @URL: the base URL to use for the document
6725 * @encoding: the document encoding, or NULL
6726 * @options: a combination of htmlParserOption(s)
6727 *
6728 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006729 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006730 * Returns the resulting document tree
6731 */
6732htmlDocPtr
6733htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6734{
6735 htmlParserCtxtPtr ctxt;
6736
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006737 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006738 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6739 if (ctxt == NULL)
6740 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006741 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006742 if (ctxt->sax != NULL)
6743 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006744 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6745}
6746
6747/**
6748 * htmlReadFd:
6749 * @fd: an open file descriptor
6750 * @URL: the base URL to use for the document
6751 * @encoding: the document encoding, or NULL
6752 * @options: a combination of htmlParserOption(s)
6753 *
6754 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006755 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006756 * Returns the resulting document tree
6757 */
6758htmlDocPtr
6759htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6760{
6761 htmlParserCtxtPtr ctxt;
6762 xmlParserInputBufferPtr input;
6763 xmlParserInputPtr stream;
6764
6765 if (fd < 0)
6766 return (NULL);
6767
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006768 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006769 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6770 if (input == NULL)
6771 return (NULL);
6772 ctxt = xmlNewParserCtxt();
6773 if (ctxt == NULL) {
6774 xmlFreeParserInputBuffer(input);
6775 return (NULL);
6776 }
6777 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6778 if (stream == NULL) {
6779 xmlFreeParserInputBuffer(input);
6780 xmlFreeParserCtxt(ctxt);
6781 return (NULL);
6782 }
6783 inputPush(ctxt, stream);
6784 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6785}
6786
6787/**
6788 * htmlReadIO:
6789 * @ioread: an I/O read function
6790 * @ioclose: an I/O close function
6791 * @ioctx: an I/O handler
6792 * @URL: the base URL to use for the document
6793 * @encoding: the document encoding, or NULL
6794 * @options: a combination of htmlParserOption(s)
6795 *
6796 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006797 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006798 * Returns the resulting document tree
6799 */
6800htmlDocPtr
6801htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6802 void *ioctx, const char *URL, const char *encoding, int options)
6803{
6804 htmlParserCtxtPtr ctxt;
6805 xmlParserInputBufferPtr input;
6806 xmlParserInputPtr stream;
6807
6808 if (ioread == NULL)
6809 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006810 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006811
6812 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6813 XML_CHAR_ENCODING_NONE);
Lin Yi-Li24464be2012-05-10 16:14:55 +08006814 if (input == NULL) {
6815 if (ioclose != NULL)
6816 ioclose(ioctx);
Daniel Veillard9475a352003-09-26 12:47:50 +00006817 return (NULL);
Lin Yi-Li24464be2012-05-10 16:14:55 +08006818 }
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006819 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006820 if (ctxt == NULL) {
6821 xmlFreeParserInputBuffer(input);
6822 return (NULL);
6823 }
6824 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6825 if (stream == NULL) {
6826 xmlFreeParserInputBuffer(input);
6827 xmlFreeParserCtxt(ctxt);
6828 return (NULL);
6829 }
6830 inputPush(ctxt, stream);
6831 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6832}
6833
6834/**
6835 * htmlCtxtReadDoc:
6836 * @ctxt: an HTML parser context
6837 * @cur: a pointer to a zero terminated string
6838 * @URL: the base URL to use for the document
6839 * @encoding: the document encoding, or NULL
6840 * @options: a combination of htmlParserOption(s)
6841 *
6842 * parse an XML in-memory document and build a tree.
6843 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006844 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006845 * Returns the resulting document tree
6846 */
6847htmlDocPtr
6848htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6849 const char *URL, const char *encoding, int options)
6850{
6851 xmlParserInputPtr stream;
6852
6853 if (cur == NULL)
6854 return (NULL);
6855 if (ctxt == NULL)
6856 return (NULL);
6857
6858 htmlCtxtReset(ctxt);
6859
6860 stream = xmlNewStringInputStream(ctxt, cur);
6861 if (stream == NULL) {
6862 return (NULL);
6863 }
6864 inputPush(ctxt, stream);
6865 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6866}
6867
6868/**
6869 * htmlCtxtReadFile:
6870 * @ctxt: an HTML parser context
6871 * @filename: a file or URL
6872 * @encoding: the document encoding, or NULL
6873 * @options: a combination of htmlParserOption(s)
6874 *
6875 * parse an XML file from the filesystem or the network.
6876 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006877 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006878 * Returns the resulting document tree
6879 */
6880htmlDocPtr
6881htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6882 const char *encoding, int options)
6883{
6884 xmlParserInputPtr stream;
6885
6886 if (filename == NULL)
6887 return (NULL);
6888 if (ctxt == NULL)
6889 return (NULL);
6890
6891 htmlCtxtReset(ctxt);
6892
Daniel Veillard29614c72004-11-26 10:47:26 +00006893 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006894 if (stream == NULL) {
6895 return (NULL);
6896 }
6897 inputPush(ctxt, stream);
6898 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6899}
6900
6901/**
6902 * htmlCtxtReadMemory:
6903 * @ctxt: an HTML parser context
6904 * @buffer: a pointer to a char array
6905 * @size: the size of the array
6906 * @URL: the base URL to use for the document
6907 * @encoding: the document encoding, or NULL
6908 * @options: a combination of htmlParserOption(s)
6909 *
6910 * parse an XML in-memory document and build a tree.
6911 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006912 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006913 * Returns the resulting document tree
6914 */
6915htmlDocPtr
6916htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6917 const char *URL, const char *encoding, int options)
6918{
6919 xmlParserInputBufferPtr input;
6920 xmlParserInputPtr stream;
6921
6922 if (ctxt == NULL)
6923 return (NULL);
6924 if (buffer == NULL)
6925 return (NULL);
6926
6927 htmlCtxtReset(ctxt);
6928
6929 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6930 if (input == NULL) {
6931 return(NULL);
6932 }
6933
6934 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6935 if (stream == NULL) {
6936 xmlFreeParserInputBuffer(input);
6937 return(NULL);
6938 }
6939
6940 inputPush(ctxt, stream);
6941 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6942}
6943
6944/**
6945 * htmlCtxtReadFd:
6946 * @ctxt: an HTML parser context
6947 * @fd: an open file descriptor
6948 * @URL: the base URL to use for the document
6949 * @encoding: the document encoding, or NULL
6950 * @options: a combination of htmlParserOption(s)
6951 *
6952 * parse an XML from a file descriptor and build a tree.
6953 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006954 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006955 * Returns the resulting document tree
6956 */
6957htmlDocPtr
6958htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6959 const char *URL, const char *encoding, int options)
6960{
6961 xmlParserInputBufferPtr input;
6962 xmlParserInputPtr stream;
6963
6964 if (fd < 0)
6965 return (NULL);
6966 if (ctxt == NULL)
6967 return (NULL);
6968
6969 htmlCtxtReset(ctxt);
6970
6971
6972 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6973 if (input == NULL)
6974 return (NULL);
6975 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6976 if (stream == NULL) {
6977 xmlFreeParserInputBuffer(input);
6978 return (NULL);
6979 }
6980 inputPush(ctxt, stream);
6981 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6982}
6983
6984/**
6985 * htmlCtxtReadIO:
6986 * @ctxt: an HTML parser context
6987 * @ioread: an I/O read function
6988 * @ioclose: an I/O close function
6989 * @ioctx: an I/O handler
6990 * @URL: the base URL to use for the document
6991 * @encoding: the document encoding, or NULL
6992 * @options: a combination of htmlParserOption(s)
6993 *
6994 * parse an HTML document from I/O functions and source and build a tree.
6995 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006996 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006997 * Returns the resulting document tree
6998 */
6999htmlDocPtr
7000htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7001 xmlInputCloseCallback ioclose, void *ioctx,
7002 const char *URL,
7003 const char *encoding, int options)
7004{
7005 xmlParserInputBufferPtr input;
7006 xmlParserInputPtr stream;
7007
7008 if (ioread == NULL)
7009 return (NULL);
7010 if (ctxt == NULL)
7011 return (NULL);
7012
7013 htmlCtxtReset(ctxt);
7014
7015 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7016 XML_CHAR_ENCODING_NONE);
Lin Yi-Li24464be2012-05-10 16:14:55 +08007017 if (input == NULL) {
7018 if (ioclose != NULL)
7019 ioclose(ioctx);
Daniel Veillard9475a352003-09-26 12:47:50 +00007020 return (NULL);
Lin Yi-Li24464be2012-05-10 16:14:55 +08007021 }
Daniel Veillard9475a352003-09-26 12:47:50 +00007022 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7023 if (stream == NULL) {
7024 xmlFreeParserInputBuffer(input);
7025 return (NULL);
7026 }
7027 inputPush(ctxt, stream);
7028 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7029}
7030
Daniel Veillard5d4644e2005-04-01 13:11:58 +00007031#define bottom_HTMLparser
7032#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00007033#endif /* LIBXML_HTML_ENABLED */