blob: b7291972ef874b78ead87b6882825434ed9ae0a5 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
Daniel Veillarda78d8032012-07-16 14:56:50 +080047#include "buf.h"
48#include "enc.h"
49
Owen Taylor3473f882001-02-23 17:55:21 +000050#define HTML_MAX_NAMELEN 1000
51#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52#define HTML_PARSER_BUFFER_SIZE 100
53
54/* #define DEBUG */
55/* #define DEBUG_PUSH */
56
Daniel Veillard22090732001-07-16 00:06:07 +000057static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000058
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000061static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000062
63/************************************************************************
64 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020065 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000066 * *
67 ************************************************************************/
68
69/**
William M. Brackedb65a72004-02-06 07:36:04 +000070 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000071 * @ctxt: an HTML parser context
72 * @extra: extra informations
73 *
74 * Handle a redefinition of attribute error
75 */
76static void
77htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78{
Daniel Veillard157fee02003-10-31 10:36:03 +000079 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000082 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000088 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000089 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000093 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000094 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96}
97
98/**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
108static void
109htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111{
Daniel Veillard157fee02003-10-31 10:36:03 +0000112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000115 if (ctxt != NULL)
116 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000124}
125
126/**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
135static void
136htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138{
Daniel Veillard157fee02003-10-31 10:36:03 +0000139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000142 if (ctxt != NULL)
143 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000149}
150
151/************************************************************************
152 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200153 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000154 * *
155 ************************************************************************/
156
Daniel Veillard1c732d22002-11-30 11:22:59 +0000157/**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000165 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000166static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000167htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000168{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000175 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000176 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000180 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187}
188/**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000197htmlnamePop(htmlParserCtxtPtr ctxt)
198{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000199 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000200
Daniel Veillard1c732d22002-11-30 11:22:59 +0000201 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000205 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000211 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000212 return (ret);
213}
Owen Taylor3473f882001-02-23 17:55:21 +0000214
Eugene Pimenov615904f2010-03-15 15:16:02 +0100215/**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224static int
225htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226{
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243}
244
245/**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253static htmlParserNodeInfo *
254htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255{
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266}
267
Owen Taylor3473f882001-02-23 17:55:21 +0000268/*
269 * Macros for accessing the content. Those should be used only by the parser,
270 * and not exported.
271 *
272 * Dirty macros, i.e. one need to make assumption on the context to use them
273 *
274 * CUR_PTR return the current pointer to the xmlChar to be parsed.
275 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
276 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
277 * in UNICODE mode. This should be used internally by the parser
278 * only to compare to ASCII values otherwise it would break when
279 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare against ASCII-based substrings.
282 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
283 * it should be used only to compare on ASCII based substring.
284 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
Daniel Veillard77a90a72003-03-22 00:04:05 +0000285 * strings without newlines within the parser.
Owen Taylor3473f882001-02-23 17:55:21 +0000286 *
287 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
288 *
289 * CURRENT Returns the current char value, with the full decoding of
290 * UTF-8 if we are using this mode. It returns an int.
291 * NEXT Skip to the next character, this does the proper decoding
292 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard77a90a72003-03-22 00:04:05 +0000293 * NEXTL(l) Skip the current unicode character of l xmlChars long.
Owen Taylor3473f882001-02-23 17:55:21 +0000294 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
295 */
296
297#define UPPER (toupper(*ctxt->input->cur))
298
Daniel Veillard77a90a72003-03-22 00:04:05 +0000299#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
Owen Taylor3473f882001-02-23 17:55:21 +0000300
301#define NXT(val) ctxt->input->cur[(val)]
302
303#define UPP(val) (toupper(ctxt->input->cur[(val)]))
304
305#define CUR_PTR ctxt->input->cur
306
Daniel Veillard652f9aa2003-10-28 22:04:45 +0000307#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
308 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
309 xmlParserInputShrink(ctxt->input)
Owen Taylor3473f882001-02-23 17:55:21 +0000310
Daniel Veillard652f9aa2003-10-28 22:04:45 +0000311#define GROW if ((ctxt->progressive == 0) && \
312 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
313 xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
Owen Taylor3473f882001-02-23 17:55:21 +0000314
315#define CURRENT ((int) (*ctxt->input->cur))
316
317#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
318
/* Imported from XML */
320
Daniel Veillard561b7f82002-03-20 21:55:57 +0000321/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
322#define CUR ((int) (*ctxt->input->cur))
Daniel Veillard77a90a72003-03-22 00:04:05 +0000323#define NEXT xmlNextChar(ctxt)
Owen Taylor3473f882001-02-23 17:55:21 +0000324
Daniel Veillard561b7f82002-03-20 21:55:57 +0000325#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
Owen Taylor3473f882001-02-23 17:55:21 +0000326
327
328#define NEXTL(l) do { \
329 if (*(ctxt->input->cur) == '\n') { \
330 ctxt->input->line++; ctxt->input->col = 1; \
331 } else ctxt->input->col++; \
332 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
333 } while (0)
Daniel Veillarde77db162009-08-22 11:32:38 +0200334
Owen Taylor3473f882001-02-23 17:55:21 +0000335/************
336 \
337 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
338 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
339 ************/
340
341#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
342#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
343
344#define COPY_BUF(l,b,i,v) \
345 if (l == 1) b[i++] = (xmlChar) v; \
346 else i += xmlCopyChar(l,&b[i],v)
347
348/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200349 * htmlFindEncoding:
350 * @the HTML parser context
351 *
352 * Ty to find and encoding in the current data available in the input
353 * buffer this is needed to try to switch to the proper encoding when
354 * one face a character error.
355 * That's an heuristic, since it's operating outside of parsing it could
356 * try to use a meta which had been commented out, that's the reason it
357 * should only be used in case of error, not as a default.
358 *
359 * Returns an encoding string or NULL if not found, the string need to
360 * be freed
361 */
362static xmlChar *
363htmlFindEncoding(xmlParserCtxtPtr ctxt) {
364 const xmlChar *start, *cur, *end;
365
366 if ((ctxt == NULL) || (ctxt->input == NULL) ||
367 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
368 (ctxt->input->buf->encoder != NULL))
369 return(NULL);
370 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
371 return(NULL);
372
373 start = ctxt->input->cur;
374 end = ctxt->input->end;
375 /* we also expect the input buffer to be zero terminated */
376 if (*end != 0)
377 return(NULL);
378
379 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
380 if (cur == NULL)
381 return(NULL);
382 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
383 if (cur == NULL)
384 return(NULL);
385 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
386 if (cur == NULL)
387 return(NULL);
388 cur += 8;
389 start = cur;
390 while (((*cur >= 'A') && (*cur <= 'Z')) ||
391 ((*cur >= 'a') && (*cur <= 'z')) ||
392 ((*cur >= '0') && (*cur <= '9')) ||
393 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
394 cur++;
395 if (cur == start)
396 return(NULL);
397 return(xmlStrndup(start, cur - start));
398}
399
400/**
Owen Taylor3473f882001-02-23 17:55:21 +0000401 * htmlCurrentChar:
402 * @ctxt: the HTML parser context
403 * @len: pointer to the length of the char read
404 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000405 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000406 * bytes in the input buffer. Implement the end of line normalization:
407 * 2.11 End-of-Line Handling
408 * If the encoding is unspecified, in the case we find an ISO-Latin-1
409 * char, then the encoding converter is plugged in automatically.
410 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000411 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000412 */
413
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000414static int
Owen Taylor3473f882001-02-23 17:55:21 +0000415htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
416 if (ctxt->instate == XML_PARSER_EOF)
417 return(0);
418
419 if (ctxt->token != 0) {
420 *len = 0;
421 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200422 }
Owen Taylor3473f882001-02-23 17:55:21 +0000423 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
424 /*
425 * We are supposed to handle UTF8, check it's valid
426 * From rfc2044: encoding of the Unicode values on UTF-8:
427 *
428 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
429 * 0000 0000-0000 007F 0xxxxxxx
430 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200431 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000432 *
433 * Check for the 0x110000 limit too
434 */
435 const unsigned char *cur = ctxt->input->cur;
436 unsigned char c;
437 unsigned int val;
438
439 c = *cur;
440 if (c & 0x80) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200441 if (cur[1] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000442 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200443 cur = ctxt->input->cur;
444 }
Owen Taylor3473f882001-02-23 17:55:21 +0000445 if ((cur[1] & 0xc0) != 0x80)
446 goto encoding_error;
447 if ((c & 0xe0) == 0xe0) {
448
Adiel Mittmann8a103792009-08-25 11:27:13 +0200449 if (cur[2] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000450 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200451 cur = ctxt->input->cur;
452 }
Owen Taylor3473f882001-02-23 17:55:21 +0000453 if ((cur[2] & 0xc0) != 0x80)
454 goto encoding_error;
455 if ((c & 0xf0) == 0xf0) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200456 if (cur[3] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000457 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200458 cur = ctxt->input->cur;
459 }
Owen Taylor3473f882001-02-23 17:55:21 +0000460 if (((c & 0xf8) != 0xf0) ||
461 ((cur[3] & 0xc0) != 0x80))
462 goto encoding_error;
463 /* 4-byte code */
464 *len = 4;
465 val = (cur[0] & 0x7) << 18;
466 val |= (cur[1] & 0x3f) << 12;
467 val |= (cur[2] & 0x3f) << 6;
468 val |= cur[3] & 0x3f;
469 } else {
470 /* 3-byte code */
471 *len = 3;
472 val = (cur[0] & 0xf) << 12;
473 val |= (cur[1] & 0x3f) << 6;
474 val |= cur[2] & 0x3f;
475 }
476 } else {
477 /* 2-byte code */
478 *len = 2;
479 val = (cur[0] & 0x1f) << 6;
480 val |= cur[1] & 0x3f;
481 }
482 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000483 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
484 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200485 }
Owen Taylor3473f882001-02-23 17:55:21 +0000486 return(val);
487 } else {
Daniel Veillard856c6682009-08-24 18:16:56 +0200488 if ((*ctxt->input->cur == 0) &&
489 (ctxt->input->cur < ctxt->input->end)) {
490 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
491 "Char 0x%X out of allowed range\n", 0);
492 *len = 1;
493 return(' ');
494 }
Owen Taylor3473f882001-02-23 17:55:21 +0000495 /* 1-byte code */
496 *len = 1;
497 return((int) *ctxt->input->cur);
498 }
499 }
500 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000501 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000502 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000503 * XML constructs only use < 128 chars
504 */
505 *len = 1;
506 if ((int) *ctxt->input->cur < 0x80)
507 return((int) *ctxt->input->cur);
508
509 /*
510 * Humm this is bad, do an automatic flow conversion
511 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200512 {
513 xmlChar * guess;
514 xmlCharEncodingHandlerPtr handler;
515
516 guess = htmlFindEncoding(ctxt);
517 if (guess == NULL) {
518 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
519 } else {
520 if (ctxt->input->encoding != NULL)
521 xmlFree((xmlChar *) ctxt->input->encoding);
522 ctxt->input->encoding = guess;
523 handler = xmlFindCharEncodingHandler((const char *) guess);
524 if (handler != NULL) {
525 xmlSwitchToEncoding(ctxt, handler);
526 } else {
527 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
528 "Unsupported encoding %s", guess, NULL);
529 }
530 }
531 ctxt->charset = XML_CHAR_ENCODING_UTF8;
532 }
533
Owen Taylor3473f882001-02-23 17:55:21 +0000534 return(xmlCurrentChar(ctxt, len));
535
536encoding_error:
537 /*
538 * If we detect an UTF8 error that probably mean that the
539 * input encoding didn't get properly advertized in the
540 * declaration header. Report the error and switch the encoding
541 * to ISO-Latin-1 (if you don't like this policy, just declare the
542 * encoding !)
543 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000544 {
545 char buffer[150];
546
Daniel Veillard861101d2007-06-12 08:38:57 +0000547 if (ctxt->input->end - ctxt->input->cur >= 4) {
548 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
549 ctxt->input->cur[0], ctxt->input->cur[1],
550 ctxt->input->cur[2], ctxt->input->cur[3]);
551 } else {
552 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
553 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000554 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
555 "Input is not proper UTF-8, indicate encoding !\n",
556 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000557 }
558
Daniel Veillarde77db162009-08-22 11:32:38 +0200559 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000560 *len = 1;
561 return((int) *ctxt->input->cur);
562}
563
564/**
Owen Taylor3473f882001-02-23 17:55:21 +0000565 * htmlSkipBlankChars:
566 * @ctxt: the HTML parser context
567 *
568 * skip all blanks character found at that point in the input streams.
569 *
570 * Returns the number of space chars skipped
571 */
572
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000573static int
Owen Taylor3473f882001-02-23 17:55:21 +0000574htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
575 int res = 0;
576
William M. Brack76e95df2003-10-18 16:20:14 +0000577 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000578 if ((*ctxt->input->cur == 0) &&
579 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
580 xmlPopInput(ctxt);
581 } else {
582 if (*(ctxt->input->cur) == '\n') {
583 ctxt->input->line++; ctxt->input->col = 1;
584 } else ctxt->input->col++;
585 ctxt->input->cur++;
586 ctxt->nbChars++;
587 if (*ctxt->input->cur == 0)
588 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
589 }
590 res++;
591 }
592 return(res);
593}
594
595
596
597/************************************************************************
598 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200599 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000600 * *
601 ************************************************************************/
602
603/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
606 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000607 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000608 * Depr: this element is deprecated
609 * DTD: 1 means that this element is valid only in the Loose DTD
610 * 2 means that this element is valid only in the Frameset DTD
611 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000612 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000613 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000614 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000615
616/* Definitions and a couple of vars for HTML Elements */
617
618#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000619#define NB_FONTSTYLE 8
Daniel Veillard930dfb62003-02-05 10:17:38 +0000620#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000621#define NB_PHRASE 10
Daniel Veillard491e58e2007-05-02 16:15:18 +0000622#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
623#define NB_SPECIAL 16
Eugene Pimenov4b41f152010-01-20 14:25:59 +0100624#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000625#define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
Eugene Pimenov4b41f152010-01-20 14:25:59 +0100626#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000627#define NB_BLOCK NB_HEADING + NB_LIST + 14
Daniel Veillard930dfb62003-02-05 10:17:38 +0000628#define FORMCTRL "input", "select", "textarea", "label", "button"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000629#define NB_FORMCTRL 5
Daniel Veillard930dfb62003-02-05 10:17:38 +0000630#define PCDATA
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000631#define NB_PCDATA 0
Daniel Veillard930dfb62003-02-05 10:17:38 +0000632#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000633#define NB_HEADING 6
Daniel Veillard930dfb62003-02-05 10:17:38 +0000634#define LIST "ul", "ol", "dir", "menu"
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000635#define NB_LIST 4
Daniel Veillard930dfb62003-02-05 10:17:38 +0000636#define MODIFIER
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000637#define NB_MODIFIER 0
Daniel Veillard930dfb62003-02-05 10:17:38 +0000638#define FLOW BLOCK,INLINE
Daniel Veillard7a5e0dd2004-09-17 08:45:25 +0000639#define NB_FLOW NB_BLOCK + NB_INLINE
Daniel Veillard930dfb62003-02-05 10:17:38 +0000640#define EMPTY NULL
641
642
Daniel Veillard065abe82006-07-03 08:55:04 +0000643static const char* const html_flow[] = { FLOW, NULL } ;
644static const char* const html_inline[] = { INLINE, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000645
646/* placeholders: elts with content but no subelements */
Daniel Veillard065abe82006-07-03 08:55:04 +0000647static const char* const html_pcdata[] = { NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000648#define html_cdata html_pcdata
649
650
651/* ... and for HTML Attributes */
652
/*
 * Attribute name groups shared by the per-element attribute lists, each
 * paired with an NB_* macro giving the number of names in the group.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* Sum of the three groups above (was referencing an undefined macro). */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
Daniel Veillard930dfb62003-02-05 10:17:38 +0000665
Daniel Veillard065abe82006-07-03 08:55:04 +0000666static const char* const html_attrs[] = { ATTRS, NULL } ;
667static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
668static const char* const core_attrs[] = { COREATTRS, NULL } ;
669static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000670
671
672/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000673static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000674 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
675 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000676static const char* const target_attr[] = { "target", NULL } ;
677static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
678static const char* const alt_attr[] = { "alt", NULL } ;
679static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
680static const char* const href_attrs[] = { "href", NULL } ;
681static const char* const clear_attrs[] = { "clear", NULL } ;
682static const char* const inline_p[] = { INLINE, "p", NULL } ;
683
684static const char* const flow_param[] = { FLOW, "param", NULL } ;
685static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000686 "archive", "alt", "name", "height", "width", "align",
687 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000688static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000689 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000690static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000691 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000692static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
693static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
694static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
695static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000696 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000697static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000698 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
699
700
Daniel Veillard065abe82006-07-03 08:55:04 +0000701static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
702static const char* const col_elt[] = { "col", NULL } ;
703static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
704static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
705static const char* const dl_contents[] = { "dt", "dd", NULL } ;
706static const char* const compact_attr[] = { "compact", NULL } ;
707static const char* const label_attr[] = { "label", NULL } ;
708static const char* const fieldset_contents[] = { FLOW, "legend" } ;
709static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
710static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
711static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
712static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
713static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
714static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
715static const char* const head_attrs[] = { I18N, "profile", NULL } ;
716static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
717static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
718static const char* const version_attr[] = { "version", NULL } ;
719static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
720static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
721static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000722static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000723static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
724static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
725static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
726static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
727static const char* const align_attr[] = { "align", NULL } ;
728static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
729static const char* const map_contents[] = { BLOCK, "area", NULL } ;
730static const char* const name_attr[] = { "name", NULL } ;
731static const char* const action_attr[] = { "action", NULL } ;
732static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
Denis Pauk868d92d2012-05-10 15:34:57 +0800733static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000734static const char* const content_attr[] = { "content", NULL } ;
735static const char* const type_attr[] = { "type", NULL } ;
736static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
737static const char* const object_contents[] = { FLOW, "param", NULL } ;
738static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
739static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
740static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
741static const char* const option_elt[] = { "option", NULL } ;
742static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
743static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
744static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
745static const char* const width_attr[] = { "width", NULL } ;
746static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
747static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
748static const char* const language_attr[] = { "language", NULL } ;
749static const char* const select_content[] = { "optgroup", "option", NULL } ;
750static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
751static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200752static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000753static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
754static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
755static const char* const tr_elt[] = { "tr", NULL } ;
756static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
757static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
758static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
759static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
760static const char* const tr_contents[] = { "th", "td", NULL } ;
761static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
762static const char* const li_elt[] = { "li", NULL } ;
763static const char* const ul_depr[] = { "type", "compact", NULL} ;
764static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000765
/*
 * Cast applied to the "const char * const" name tables above when they are
 * stored in the table below.
 * NOTE(review): presumably needed because the htmlElemDesc pointer fields are
 * declared as plain "const char **" - confirm against libxml/HTMLparser.h.
 */
#define DECL (const char**)

/*
 * Description of the known HTML elements, one initializer per element.
 * htmlTagLookup() scans this table linearly and compares the first field
 * (the element name) case-insensitively.
 *
 * Each entry holds the name, seven integer flags, a description string and
 * five trailing fields (name tables or a single element name, or NULL).
 * NOTE(review): the field order comes from htmlElemDesc, which is declared
 * outside this file section; presumably name, startTag, endTag, saveEndTag,
 * empty, depr, dtd, isinline, desc, subelts, defaultsubelt, attrs_opt,
 * attrs_depr, attrs_req - confirm against the structure declaration.
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
	DECL inline_p , NULL , DECL html_attrs, NULL, NULL
},
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
	EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
	EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
	EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
	EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
	DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
	DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
},
{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "table", 0, 0, 0, 0, 0, 0, 0, "",
	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
},
{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
},
{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
},
{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};
1047
/*
 * start tags that imply the end of current element
 *
 * The table is a flat sequence of NULL-terminated groups.  The first string
 * of a group is a start tag name; the remaining strings of that group are
 * the names of open elements which that start tag closes (this is how
 * htmlCheckAutoClose() reads it).  A lone NULL after the last group ends the
 * table.  htmlInitAutoClose() records the address of each group's first
 * entry in htmlStartCloseIndex.
 */
static const char * const htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"frameset",	"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div",		"p", "head", NULL,
"noscript",	"p", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", "head", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"option",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
"tt",		"head", NULL,
"i",		"head", NULL,
"b",		"head", NULL,
"u",		"head", NULL,
"s",		"head", NULL,
"strike",	"head", NULL,
"big",		"head", NULL,
"small",	"head", NULL,

"em",		"head", NULL,
"strong",	"head", NULL,
"dfn",		"head", NULL,
"code",		"head", NULL,
"samp",		"head", NULL,
"kbd",		"head", NULL,
"var",		"head", NULL,
"cite",		"head", NULL,
"abbr",		"head", NULL,
"acronym",	"head", NULL,

/* "a" */
"img",		"head", NULL,
/* "applet" */
/* "embed" */
/* "object" */
"font",		"head", NULL,
/* "basefont" */
"br",		"head", NULL,
/* "script" */
"map",		"head", NULL,
"q",		"head", NULL,
"sub",		"head", NULL,
"sup",		"head", NULL,
"span",		"head", NULL,
"bdo",		"head", NULL,
"iframe",	"head", NULL,
NULL
};
1145
/*
 * Names of the HTML elements which are not supposed to hold CDATA
 * content directly and where a p element will be implied instead.
 * The list ends with a NULL entry.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1158
/*
 * Names of the HTML attributes whose content is of type %Script;
 * The array has no NULL end marker, its length comes from sizeof.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document, focus and form events */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1184
/*
 * Priorities used by the HTML parser when it meets broken pages.
 * Every element gets a priority, which lets the parser decide what to
 * do with an unexpected end tag: an end tag may only close elements
 * whose priority is lower than or equal to its own.
 */

/* One (element name, priority) pair of the table below. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Elements with a non-default priority.  The last entry has a NULL name:
 * it ends the table and carries the priority used for any other element.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001212
/* Start of each group of htmlStartClose, filled in by htmlInitAutoClose() */
static const char **htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index above has been built */
static int htmlStartCloseIndexinitialized = 0;
1215
1216/************************************************************************
1217 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001218 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001219 * *
1220 ************************************************************************/
1221
1222/**
1223 * htmlInitAutoClose:
1224 *
1225 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1226 * This is not reentrant. Call xmlInitParser() once before processing in
1227 * case of use in multithreaded programs.
1228 */
1229void
1230htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001231 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001232
1233 if (htmlStartCloseIndexinitialized) return;
1234
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001235 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1236 indx = 0;
1237 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001238 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001239 while (htmlStartClose[i] != NULL) i++;
1240 i++;
1241 }
1242 htmlStartCloseIndexinitialized = 1;
1243}
1244
1245/**
1246 * htmlTagLookup:
1247 * @tag: The tag name in lowercase
1248 *
1249 * Lookup the HTML tag in the ElementTable
1250 *
1251 * Returns the related htmlElemDescPtr or NULL if not found.
1252 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001253const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001254htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001255 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001256
1257 for (i = 0; i < (sizeof(html40ElementTable) /
1258 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001259 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001260 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001261 }
1262 return(NULL);
1263}
1264
1265/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001266 * htmlGetEndPriority:
1267 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001268 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001269 * Return value: The "endtag" priority.
1270 **/
1271static int
1272htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001273 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001274
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001275 while ((htmlEndPriority[i].name != NULL) &&
1276 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1277 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001278
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001279 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001280}
1281
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001282
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001283/**
Owen Taylor3473f882001-02-23 17:55:21 +00001284 * htmlCheckAutoClose:
1285 * @newtag: The new tag name
1286 * @oldtag: The old tag name
1287 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001288 * Checks whether the new tag is one of the registered valid tags for
1289 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001290 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1291 *
1292 * Returns 0 if no, 1 if yes.
1293 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001294static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001295htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1296{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001297 int i, indx;
1298 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001299
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001300 if (htmlStartCloseIndexinitialized == 0)
1301 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001302
1303 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001304 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001305 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001306 if (closed == NULL)
1307 return (0);
1308 if (xmlStrEqual(BAD_CAST * closed, newtag))
1309 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001310 }
1311
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001312 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001313 i++;
1314 while (htmlStartClose[i] != NULL) {
1315 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001316 return (1);
1317 }
1318 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001319 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001320 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001321}
1322
1323/**
1324 * htmlAutoCloseOnClose:
1325 * @ctxt: an HTML parser context
1326 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001327 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001328 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001329 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001330 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001331static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001332htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1333{
1334 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001335 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001336
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001337 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001338
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001339 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001340
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001341 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1342 break;
1343 /*
1344 * A missplaced endtag can only close elements with lower
1345 * or equal priority, so if we find an element with higher
1346 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001347 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001348 */
1349 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1350 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001351 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001352 if (i < 0)
1353 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001354
1355 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001356 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001357 if ((info != NULL) && (info->endTag == 3)) {
1358 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1359 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001360 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001361 }
1362 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1363 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001364 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001365 }
1366}
1367
1368/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001369 * htmlAutoCloseOnEnd:
1370 * @ctxt: an HTML parser context
1371 *
1372 * Close all remaining tags at the end of the stream
1373 */
1374static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001375htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1376{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001377 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001378
William M. Brack899e64a2003-09-26 18:03:42 +00001379 if (ctxt->nameNr == 0)
1380 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001381 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001382 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1383 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001384 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001385 }
1386}
1387
1388/**
Owen Taylor3473f882001-02-23 17:55:21 +00001389 * htmlAutoClose:
1390 * @ctxt: an HTML parser context
1391 * @newtag: The new tag name or NULL
1392 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001393 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001394 * The list is kept in htmlStartClose array. This function is
1395 * called when a new tag has been detected and generates the
1396 * appropriates closes if possible/needed.
1397 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001398 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001399 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001400static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001401htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1402{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001403 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001404 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001405 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1406 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001407 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001408 }
1409 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001410 htmlAutoCloseOnEnd(ctxt);
1411 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001412 }
1413 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001414 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1415 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1416 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001417 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1418 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001419 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001420 }
Owen Taylor3473f882001-02-23 17:55:21 +00001421}
1422
1423/**
1424 * htmlAutoCloseTag:
1425 * @doc: the HTML document
1426 * @name: The tag name
1427 * @elem: the HTML element
1428 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001429 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001430 * The list is kept in htmlStartClose array. This function checks
1431 * if the element or one of it's children would autoclose the
1432 * given tag.
1433 *
1434 * Returns 1 if autoclose, 0 otherwise
1435 */
1436int
1437htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1438 htmlNodePtr child;
1439
1440 if (elem == NULL) return(1);
1441 if (xmlStrEqual(name, elem->name)) return(0);
1442 if (htmlCheckAutoClose(elem->name, name)) return(1);
1443 child = elem->children;
1444 while (child != NULL) {
1445 if (htmlAutoCloseTag(doc, name, child)) return(1);
1446 child = child->next;
1447 }
1448 return(0);
1449}
1450
1451/**
1452 * htmlIsAutoClosed:
1453 * @doc: the HTML document
1454 * @elem: the HTML element
1455 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001456 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001457 * The list is kept in htmlStartClose array. This function checks
1458 * if a tag is autoclosed by one of it's child
1459 *
1460 * Returns 1 if autoclosed, 0 otherwise
1461 */
1462int
1463htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1464 htmlNodePtr child;
1465
1466 if (elem == NULL) return(1);
1467 child = elem->children;
1468 while (child != NULL) {
1469 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1470 child = child->next;
1471 }
1472 return(0);
1473}
1474
1475/**
1476 * htmlCheckImplied:
1477 * @ctxt: an HTML parser context
1478 * @newtag: The new tag name
1479 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001480 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001481 * called when a new tag has been detected and generates the
1482 * appropriates implicit tags if missing
1483 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001484static void
Owen Taylor3473f882001-02-23 17:55:21 +00001485htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardb468f742009-08-24 18:45:33 +02001486 int i;
1487
Daniel Veillarde20fb5a2010-01-29 20:47:08 +01001488 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1489 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001490 if (!htmlOmittedDefaultValue)
1491 return;
1492 if (xmlStrEqual(newtag, BAD_CAST"html"))
1493 return;
1494 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001495 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001496 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1498 }
1499 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1500 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001501 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001502 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1503 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1504 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1505 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1506 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1507 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001508 if (ctxt->html >= 3) {
1509 /* we already saw or generated an <head> before */
1510 return;
1511 }
1512 /*
1513 * dropped OBJECT ... i you put it first BODY will be
1514 * assumed !
1515 */
1516 htmlnamePush(ctxt, BAD_CAST"head");
1517 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1518 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001519 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1520 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1521 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001522 if (ctxt->html >= 10) {
1523 /* we already saw or generated a <body> before */
1524 return;
1525 }
Owen Taylor3473f882001-02-23 17:55:21 +00001526 for (i = 0;i < ctxt->nameNr;i++) {
1527 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1528 return;
1529 }
1530 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1531 return;
1532 }
1533 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001534
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001535 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001536 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1537 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1538 }
1539}
1540
1541/**
1542 * htmlCheckParagraph
1543 * @ctxt: an HTML parser context
1544 *
1545 * Check whether a p element need to be implied before inserting
1546 * characters in the current element.
1547 *
1548 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1549 * in case of error.
1550 */
1551
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001552static int
Owen Taylor3473f882001-02-23 17:55:21 +00001553htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1554 const xmlChar *tag;
1555 int i;
1556
1557 if (ctxt == NULL)
1558 return(-1);
1559 tag = ctxt->name;
1560 if (tag == NULL) {
1561 htmlAutoClose(ctxt, BAD_CAST"p");
1562 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001563 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001564 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1565 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1566 return(1);
1567 }
1568 if (!htmlOmittedDefaultValue)
1569 return(0);
1570 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1571 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001572 htmlAutoClose(ctxt, BAD_CAST"p");
1573 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001574 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001575 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1576 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1577 return(1);
1578 }
1579 }
1580 return(0);
1581}
1582
1583/**
1584 * htmlIsScriptAttribute:
1585 * @name: an attribute name
1586 *
1587 * Check if an attribute is of content type Script
1588 *
1589 * Returns 1 is the attribute is a script 0 otherwise
1590 */
1591int
1592htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001593 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001594
1595 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001596 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001597 /*
1598 * all script attributes start with 'on'
1599 */
1600 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001601 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001602 for (i = 0;
1603 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1604 i++) {
1605 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1606 return(1);
1607 }
1608 return(0);
1609}
1610
1611/************************************************************************
1612 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001613 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001614 * *
1615 ************************************************************************/
1616
1617
/*
 * Table of the predefined HTML entities. Each row holds, in order, the
 * numeric value of the character, the entity name and a description
 * string.
 *
 * The rows are listed in ascending order of value. htmlEntityValueLookup()
 * stops scanning at the first row whose value is greater than the one it
 * is looking for, so any row added here must keep that ordering.
 */
static const htmlEntityDesc html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range
 * Replacing them depend really on the charset used.
 */
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound","pound sign, U+00A3 ISOnum" },
{ 164, "curren","currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro","micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide","division sign, U+00F7 ISOnum" },
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entities references
 */
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde","small tilde, U+02DC ISOdia" },

{ 913, "Alpha","greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa","greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron","greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
1900
1901/************************************************************************
1902 * *
1903 * Commodity functions to handle entities *
1904 * *
1905 ************************************************************************/
1906
/*
 * Macro used to grow the current buffer.
 *
 * Doubles the variable named <buffer>_size and reallocates <buffer> to
 * that many xmlChar. The macro is not self-contained; the expansion
 * site must provide:
 *   - a variable called ctxt, which is passed to htmlErrMemory();
 *   - a variable called <buffer>_size next to <buffer> itself;
 *   - an enclosing function that can return a pointer, because on
 *     allocation failure the macro reports the error, frees the old
 *     buffer and executes return(NULL) from that function.
 * Any pointer into the old buffer must be recomputed after the call,
 * since <buffer> may have been reassigned.
 * The doubling of <buffer>_size is not checked for overflow.
 * The expansion is a plain brace block, not a do { } while (0) statement.
 */
#define growBuffer(buffer) { \
    xmlChar *tmp; \
    buffer##_size *= 2; \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) { \
        htmlErrMemory(ctxt, "growing buffer\n"); \
        xmlFree(buffer); \
        return(NULL); \
    } \
    buffer = tmp; \
}
1921
1922/**
1923 * htmlEntityLookup:
1924 * @name: the entity name
1925 *
1926 * Lookup the given entity in EntitiesTable
1927 *
1928 * TODO: the linear scan is really ugly, an hash table is really needed.
1929 *
1930 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1931 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001932const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001933htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001934 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001935
1936 for (i = 0;i < (sizeof(html40EntitiesTable)/
1937 sizeof(html40EntitiesTable[0]));i++) {
1938 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001939 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001940 }
1941 }
1942 return(NULL);
1943}
1944
1945/**
1946 * htmlEntityValueLookup:
1947 * @value: the entity's unicode value
1948 *
1949 * Lookup the given entity in EntitiesTable
1950 *
1951 * TODO: the linear scan is really ugly, an hash table is really needed.
1952 *
1953 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1954 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001955const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001956htmlEntityValueLookup(unsigned int value) {
1957 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001958
1959 for (i = 0;i < (sizeof(html40EntitiesTable)/
1960 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001961 if (html40EntitiesTable[i].value >= value) {
1962 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001963 break;
William M. Brack78637da2003-07-31 14:47:38 +00001964 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001965 }
Owen Taylor3473f882001-02-23 17:55:21 +00001966 }
1967 return(NULL);
1968}
1969
/**
 * UTF8ToHtml:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * plus HTML entities block of chars out.
 *
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
 * The value of @inlen after return is the number of octets consumed
 *     as the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 */
1985int
1986UTF8ToHtml(unsigned char* out, int *outlen,
1987 const unsigned char* in, int *inlen) {
1988 const unsigned char* processed = in;
1989 const unsigned char* outend;
1990 const unsigned char* outstart = out;
1991 const unsigned char* instart = in;
1992 const unsigned char* inend;
1993 unsigned int c, d;
1994 int trailing;
1995
Daniel Veillardce682bc2004-11-05 17:22:25 +00001996 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001997 if (in == NULL) {
1998 /*
1999 * initialization nothing to do
2000 */
2001 *outlen = 0;
2002 *inlen = 0;
2003 return(0);
2004 }
2005 inend = in + (*inlen);
2006 outend = out + (*outlen);
2007 while (in < inend) {
2008 d = *in++;
2009 if (d < 0x80) { c= d; trailing= 0; }
2010 else if (d < 0xC0) {
2011 /* trailing byte in leading position */
2012 *outlen = out - outstart;
2013 *inlen = processed - instart;
2014 return(-2);
2015 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2016 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2017 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2018 else {
2019 /* no chance for this in Ascii */
2020 *outlen = out - outstart;
2021 *inlen = processed - instart;
2022 return(-2);
2023 }
2024
2025 if (inend - in < trailing) {
2026 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02002027 }
Owen Taylor3473f882001-02-23 17:55:21 +00002028
2029 for ( ; trailing; trailing--) {
2030 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2031 break;
2032 c <<= 6;
2033 c |= d & 0x3F;
2034 }
2035
2036 /* assertion: c is a single UTF-4 value */
2037 if (c < 0x80) {
2038 if (out + 1 >= outend)
2039 break;
2040 *out++ = c;
2041 } else {
2042 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00002043 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00002044 const char *cp;
2045 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00002046
2047 /*
2048 * Try to lookup a predefined HTML entity for it
2049 */
2050
2051 ent = htmlEntityValueLookup(c);
2052 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00002053 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2054 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00002055 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00002056 else
2057 cp = ent->name;
2058 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00002059 if (out + 2 + len >= outend)
2060 break;
2061 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00002062 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00002063 out += len;
2064 *out++ = ';';
2065 }
2066 processed = in;
2067 }
2068 *outlen = out - outstart;
2069 *inlen = processed - instart;
2070 return(0);
2071}
2072
2073/**
2074 * htmlEncodeEntities:
2075 * @out: a pointer to an array of bytes to store the result
2076 * @outlen: the length of @out
2077 * @in: a pointer to an array of UTF-8 chars
2078 * @inlen: the length of @in
2079 * @quoteChar: the quote character to escape (' or ") or zero.
2080 *
2081 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2082 * plus HTML entities block of chars out.
2083 *
2084 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2085 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002086 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00002087 * The value of @outlen after return is the number of octets consumed.
2088 */
2089int
2090htmlEncodeEntities(unsigned char* out, int *outlen,
2091 const unsigned char* in, int *inlen, int quoteChar) {
2092 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002093 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00002094 const unsigned char* outstart = out;
2095 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002096 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00002097 unsigned int c, d;
2098 int trailing;
2099
Daniel Veillardce682bc2004-11-05 17:22:25 +00002100 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2101 return(-1);
2102 outend = out + (*outlen);
2103 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00002104 while (in < inend) {
2105 d = *in++;
2106 if (d < 0x80) { c= d; trailing= 0; }
2107 else if (d < 0xC0) {
2108 /* trailing byte in leading position */
2109 *outlen = out - outstart;
2110 *inlen = processed - instart;
2111 return(-2);
2112 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2113 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2114 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2115 else {
2116 /* no chance for this in Ascii */
2117 *outlen = out - outstart;
2118 *inlen = processed - instart;
2119 return(-2);
2120 }
2121
2122 if (inend - in < trailing)
2123 break;
2124
2125 while (trailing--) {
2126 if (((d= *in++) & 0xC0) != 0x80) {
2127 *outlen = out - outstart;
2128 *inlen = processed - instart;
2129 return(-2);
2130 }
2131 c <<= 6;
2132 c |= d & 0x3F;
2133 }
2134
2135 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002136 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2137 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002138 if (out >= outend)
2139 break;
2140 *out++ = c;
2141 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002142 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002143 const char *cp;
2144 char nbuf[16];
2145 int len;
2146
2147 /*
2148 * Try to lookup a predefined HTML entity for it
2149 */
2150 ent = htmlEntityValueLookup(c);
2151 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002152 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002153 cp = nbuf;
2154 }
2155 else
2156 cp = ent->name;
2157 len = strlen(cp);
2158 if (out + 2 + len > outend)
2159 break;
2160 *out++ = '&';
2161 memcpy(out, cp, len);
2162 out += len;
2163 *out++ = ';';
2164 }
2165 processed = in;
2166 }
2167 *outlen = out - outstart;
2168 *inlen = processed - instart;
2169 return(0);
2170}
2171
Owen Taylor3473f882001-02-23 17:55:21 +00002172/************************************************************************
2173 * *
2174 * Commodity functions to handle streams *
2175 * *
2176 ************************************************************************/
2177
2178/**
Owen Taylor3473f882001-02-23 17:55:21 +00002179 * htmlNewInputStream:
2180 * @ctxt: an HTML parser context
2181 *
2182 * Create a new input stream structure
2183 * Returns the new input stream or NULL
2184 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002185static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002186htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2187 htmlParserInputPtr input;
2188
2189 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2190 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002191 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002192 return(NULL);
2193 }
2194 memset(input, 0, sizeof(htmlParserInput));
2195 input->filename = NULL;
2196 input->directory = NULL;
2197 input->base = NULL;
2198 input->cur = NULL;
2199 input->buf = NULL;
2200 input->line = 1;
2201 input->col = 1;
2202 input->buf = NULL;
2203 input->free = NULL;
2204 input->version = NULL;
2205 input->consumed = 0;
2206 input->length = 0;
2207 return(input);
2208}
2209
2210
2211/************************************************************************
2212 * *
2213 * Commodity functions, cleanup needed ? *
2214 * *
2215 ************************************************************************/
/*
 * All tags allowing PCDATA according to the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that is avoided so as
 *       not to risk any binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002230
2231/**
2232 * areBlanks:
2233 * @ctxt: an HTML parser context
2234 * @str: a xmlChar *
2235 * @len: the size of @str
2236 *
2237 * Is this a sequence of blank chars that one can ignore ?
2238 *
2239 * Returns 1 if ignorable 0 otherwise.
2240 */
2241
2242static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002243 unsigned int i;
2244 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002245 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002246 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002247
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002248 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002249 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002250
2251 if (CUR == 0) return(1);
2252 if (CUR != '<') return(0);
2253 if (ctxt->name == NULL)
2254 return(1);
2255 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2256 return(1);
2257 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2258 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002259
2260 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2261 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2262 dtd = xmlGetIntSubset(ctxt->myDoc);
2263 if (dtd != NULL && dtd->ExternalID != NULL) {
2264 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2265 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2266 return(1);
2267 }
2268 }
2269
Owen Taylor3473f882001-02-23 17:55:21 +00002270 if (ctxt->node == NULL) return(0);
2271 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002272 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2273 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002274 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002275 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2276 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002277 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002278 for all tags "b" allowing PCDATA */
2279 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2280 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2281 return(0);
2282 }
2283 }
Owen Taylor3473f882001-02-23 17:55:21 +00002284 } else if (xmlNodeIsText(lastChild)) {
2285 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002286 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002287 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002288 for all tags "p" allowing PCDATA */
2289 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2290 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2291 return(0);
2292 }
2293 }
Owen Taylor3473f882001-02-23 17:55:21 +00002294 }
2295 return(1);
2296}
2297
2298/**
Owen Taylor3473f882001-02-23 17:55:21 +00002299 * htmlNewDocNoDtD:
2300 * @URI: URI for the dtd, or NULL
2301 * @ExternalID: the external ID of the DTD, or NULL
2302 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002303 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2304 * are NULL
2305 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002306 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002307 */
2308htmlDocPtr
2309htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2310 xmlDocPtr cur;
2311
2312 /*
2313 * Allocate a new document and fill the fields.
2314 */
2315 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2316 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002317 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002318 return(NULL);
2319 }
2320 memset(cur, 0, sizeof(xmlDoc));
2321
2322 cur->type = XML_HTML_DOCUMENT_NODE;
2323 cur->version = NULL;
2324 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002325 cur->doc = cur;
2326 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002327 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002328 cur->extSubset = NULL;
2329 cur->oldNs = NULL;
2330 cur->encoding = NULL;
2331 cur->standalone = 1;
2332 cur->compression = 0;
2333 cur->ids = NULL;
2334 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002335 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002336 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002337 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002338 if ((ExternalID != NULL) ||
2339 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002340 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002341 return(cur);
2342}
2343
2344/**
2345 * htmlNewDoc:
2346 * @URI: URI for the dtd, or NULL
2347 * @ExternalID: the external ID of the DTD, or NULL
2348 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002349 * Creates a new HTML document
2350 *
Owen Taylor3473f882001-02-23 17:55:21 +00002351 * Returns a new document
2352 */
2353htmlDocPtr
2354htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2355 if ((URI == NULL) && (ExternalID == NULL))
2356 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002357 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2358 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002359
2360 return(htmlNewDocNoDtD(URI, ExternalID));
2361}
2362
2363
2364/************************************************************************
2365 * *
2366 * The parser itself *
2367 * Relates to http://www.w3.org/TR/html40 *
2368 * *
2369 ************************************************************************/
2370
2371/************************************************************************
2372 * *
2373 * The parser itself *
2374 * *
2375 ************************************************************************/
2376
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002377static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002378
Owen Taylor3473f882001-02-23 17:55:21 +00002379/**
2380 * htmlParseHTMLName:
2381 * @ctxt: an HTML parser context
2382 *
2383 * parse an HTML tag or attribute name, note that we convert it to lowercase
2384 * since HTML names are not case-sensitive.
2385 *
2386 * Returns the Tag Name parsed or NULL
2387 */
2388
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002389static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002390htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002391 int i = 0;
2392 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2393
William M. Brackd1757ab2004-10-02 22:07:48 +00002394 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002395 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002396
2397 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002398 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002399 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2400 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002401 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2402 else loc[i] = CUR;
2403 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002404
Owen Taylor3473f882001-02-23 17:55:21 +00002405 NEXT;
2406 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002407
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002408 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002409}
2410
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002411
2412/**
2413 * htmlParseHTMLName_nonInvasive:
2414 * @ctxt: an HTML parser context
2415 *
2416 * parse an HTML tag or attribute name, note that we convert it to lowercase
2417 * since HTML names are not case-sensitive, this doesn't consume the data
2418 * from the stream, it's a look-ahead
2419 *
2420 * Returns the Tag Name parsed or NULL
2421 */
2422
2423static const xmlChar *
2424htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2425 int i = 0;
2426 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2427
2428 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2429 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002430
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002431 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2432 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2433 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2434 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2435 else loc[i] = NXT(1+i);
2436 i++;
2437 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002438
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002439 return(xmlDictLookup(ctxt->dict, loc, i));
2440}
2441
2442
Owen Taylor3473f882001-02-23 17:55:21 +00002443/**
2444 * htmlParseName:
2445 * @ctxt: an HTML parser context
2446 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002447 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002448 *
2449 * Returns the Name parsed or NULL
2450 */
2451
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002452static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002453htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002454 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002455 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002456 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002457
2458 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002459
2460 /*
2461 * Accelerator for simple ASCII names
2462 */
2463 in = ctxt->input->cur;
2464 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2465 ((*in >= 0x41) && (*in <= 0x5A)) ||
2466 (*in == '_') || (*in == ':')) {
2467 in++;
2468 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2469 ((*in >= 0x41) && (*in <= 0x5A)) ||
2470 ((*in >= 0x30) && (*in <= 0x39)) ||
2471 (*in == '_') || (*in == '-') ||
2472 (*in == ':') || (*in == '.'))
2473 in++;
2474 if ((*in > 0) && (*in < 0x80)) {
2475 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002476 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002477 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002478 ctxt->nbChars += count;
2479 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002480 return(ret);
2481 }
2482 }
2483 return(htmlParseNameComplex(ctxt));
2484}
2485
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002486static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002487htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002488 int len = 0, l;
2489 int c;
2490 int count = 0;
2491
2492 /*
2493 * Handler for more complex cases
2494 */
2495 GROW;
2496 c = CUR_CHAR(l);
2497 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2498 (!IS_LETTER(c) && (c != '_') &&
2499 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002500 return(NULL);
2501 }
2502
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002503 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2504 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2505 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002506 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002507 (IS_COMBINING(c)) ||
2508 (IS_EXTENDER(c)))) {
2509 if (count++ > 100) {
2510 count = 0;
2511 GROW;
2512 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002513 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002514 NEXTL(l);
2515 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002516 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002517 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002518}
2519
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002520
Owen Taylor3473f882001-02-23 17:55:21 +00002521/**
2522 * htmlParseHTMLAttribute:
2523 * @ctxt: an HTML parser context
2524 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002525 *
Owen Taylor3473f882001-02-23 17:55:21 +00002526 * parse an HTML attribute value till the stop (quote), if
2527 * stop is 0 then it stops at the first space
2528 *
2529 * Returns the attribute parsed or NULL
2530 */
2531
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002532static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002533htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2534 xmlChar *buffer = NULL;
2535 int buffer_size = 0;
2536 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002537 const xmlChar *name = NULL;
2538 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002539 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002540
2541 /*
2542 * allocate a translation buffer.
2543 */
2544 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002545 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002546 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002547 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002548 return(NULL);
2549 }
2550 out = buffer;
2551
2552 /*
2553 * Ok loop until we reach one of the ending chars
2554 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002555 while ((CUR != 0) && (CUR != stop)) {
2556 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002557 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002558 if (CUR == '&') {
2559 if (NXT(1) == '#') {
2560 unsigned int c;
2561 int bits;
2562
2563 c = htmlParseCharRef(ctxt);
2564 if (c < 0x80)
2565 { *out++ = c; bits= -6; }
2566 else if (c < 0x800)
2567 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2568 else if (c < 0x10000)
2569 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002570 else
Owen Taylor3473f882001-02-23 17:55:21 +00002571 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002572
Owen Taylor3473f882001-02-23 17:55:21 +00002573 for ( ; bits >= 0; bits-= 6) {
2574 *out++ = ((c >> bits) & 0x3F) | 0x80;
2575 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002576
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002577 if (out - buffer > buffer_size - 100) {
2578 int indx = out - buffer;
2579
2580 growBuffer(buffer);
2581 out = &buffer[indx];
2582 }
Owen Taylor3473f882001-02-23 17:55:21 +00002583 } else {
2584 ent = htmlParseEntityRef(ctxt, &name);
2585 if (name == NULL) {
2586 *out++ = '&';
2587 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002588 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002589
2590 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002591 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002592 }
2593 } else if (ent == NULL) {
2594 *out++ = '&';
2595 cur = name;
2596 while (*cur != 0) {
2597 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002598 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002599
2600 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002601 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002602 }
2603 *out++ = *cur++;
2604 }
Owen Taylor3473f882001-02-23 17:55:21 +00002605 } else {
2606 unsigned int c;
2607 int bits;
2608
2609 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002610 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002611
2612 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002613 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002614 }
Daniel Veillard48519092006-10-17 15:56:35 +00002615 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002616 if (c < 0x80)
2617 { *out++ = c; bits= -6; }
2618 else if (c < 0x800)
2619 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2620 else if (c < 0x10000)
2621 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002622 else
Owen Taylor3473f882001-02-23 17:55:21 +00002623 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002624
Owen Taylor3473f882001-02-23 17:55:21 +00002625 for ( ; bits >= 0; bits-= 6) {
2626 *out++ = ((c >> bits) & 0x3F) | 0x80;
2627 }
Owen Taylor3473f882001-02-23 17:55:21 +00002628 }
2629 }
2630 } else {
2631 unsigned int c;
2632 int bits, l;
2633
2634 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002635 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002636
2637 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002638 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002639 }
2640 c = CUR_CHAR(l);
2641 if (c < 0x80)
2642 { *out++ = c; bits= -6; }
2643 else if (c < 0x800)
2644 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2645 else if (c < 0x10000)
2646 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002647 else
Owen Taylor3473f882001-02-23 17:55:21 +00002648 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002649
Owen Taylor3473f882001-02-23 17:55:21 +00002650 for ( ; bits >= 0; bits-= 6) {
2651 *out++ = ((c >> bits) & 0x3F) | 0x80;
2652 }
2653 NEXT;
2654 }
2655 }
Daniel Veillard13cee4e2009-09-05 14:52:55 +02002656 *out = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002657 return(buffer);
2658}
2659
2660/**
Owen Taylor3473f882001-02-23 17:55:21 +00002661 * htmlParseEntityRef:
2662 * @ctxt: an HTML parser context
2663 * @str: location to store the entity name
2664 *
2665 * parse an HTML ENTITY references
2666 *
2667 * [68] EntityRef ::= '&' Name ';'
2668 *
2669 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2670 * if non-NULL *str will have to be freed by the caller.
2671 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002672const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002673htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2674 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002675 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002676
2677 if (str != NULL) *str = NULL;
2678 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002679
2680 if (CUR == '&') {
2681 NEXT;
2682 name = htmlParseName(ctxt);
2683 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002684 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2685 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002686 } else {
2687 GROW;
2688 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002689 if (str != NULL)
2690 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002691
2692 /*
2693 * Lookup the entity in the table.
2694 */
2695 ent = htmlEntityLookup(name);
2696 if (ent != NULL) /* OK that's ugly !!! */
2697 NEXT;
2698 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002699 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2700 "htmlParseEntityRef: expecting ';'\n",
2701 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002702 if (str != NULL)
2703 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002704 }
2705 }
2706 }
2707 return(ent);
2708}
2709
2710/**
2711 * htmlParseAttValue:
2712 * @ctxt: an HTML parser context
2713 *
2714 * parse a value for an attribute
2715 * Note: the parser won't do substitution of entities here, this
2716 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002717 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002718 *
2719 * Returns the AttValue parsed or NULL.
2720 */
2721
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002722static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002723htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2724 xmlChar *ret = NULL;
2725
2726 if (CUR == '"') {
2727 NEXT;
2728 ret = htmlParseHTMLAttribute(ctxt, '"');
2729 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002730 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2731 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002732 } else
2733 NEXT;
2734 } else if (CUR == '\'') {
2735 NEXT;
2736 ret = htmlParseHTMLAttribute(ctxt, '\'');
2737 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002738 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2739 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002740 } else
2741 NEXT;
2742 } else {
2743 /*
2744 * That's an HTMLism, the attribute value may not be quoted
2745 */
2746 ret = htmlParseHTMLAttribute(ctxt, 0);
2747 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002748 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2749 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002750 }
2751 }
2752 return(ret);
2753}
2754
2755/**
2756 * htmlParseSystemLiteral:
2757 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002758 *
Owen Taylor3473f882001-02-23 17:55:21 +00002759 * parse an HTML Literal
2760 *
2761 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2762 *
2763 * Returns the SystemLiteral parsed or NULL
2764 */
2765
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002766static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002767htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2768 const xmlChar *q;
2769 xmlChar *ret = NULL;
2770
2771 if (CUR == '"') {
2772 NEXT;
2773 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002774 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002775 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002776 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002777 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2778 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002779 } else {
2780 ret = xmlStrndup(q, CUR_PTR - q);
2781 NEXT;
2782 }
2783 } else if (CUR == '\'') {
2784 NEXT;
2785 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002786 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002787 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002788 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002789 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2790 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002791 } else {
2792 ret = xmlStrndup(q, CUR_PTR - q);
2793 NEXT;
2794 }
2795 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002796 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2797 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002798 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002799
Owen Taylor3473f882001-02-23 17:55:21 +00002800 return(ret);
2801}
2802
2803/**
2804 * htmlParsePubidLiteral:
2805 * @ctxt: an HTML parser context
2806 *
2807 * parse an HTML public literal
2808 *
2809 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2810 *
2811 * Returns the PubidLiteral parsed or NULL.
2812 */
2813
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002814static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002815htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2816 const xmlChar *q;
2817 xmlChar *ret = NULL;
2818 /*
2819 * Name ::= (Letter | '_') (NameChar)*
2820 */
2821 if (CUR == '"') {
2822 NEXT;
2823 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002824 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002825 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002826 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2827 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002828 } else {
2829 ret = xmlStrndup(q, CUR_PTR - q);
2830 NEXT;
2831 }
2832 } else if (CUR == '\'') {
2833 NEXT;
2834 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002835 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002836 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002837 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002838 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2839 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002840 } else {
2841 ret = xmlStrndup(q, CUR_PTR - q);
2842 NEXT;
2843 }
2844 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002845 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2846 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002847 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002848
Owen Taylor3473f882001-02-23 17:55:21 +00002849 return(ret);
2850}
2851
2852/**
2853 * htmlParseScript:
2854 * @ctxt: an HTML parser context
2855 *
2856 * parse the content of an HTML SCRIPT or STYLE element
2857 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2858 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2859 * http://www.w3.org/TR/html4/types.html#type-script
2860 * http://www.w3.org/TR/html4/types.html#h-6.15
2861 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2862 *
2863 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2864 * element and the value of intrinsic event attributes. User agents must
2865 * not evaluate script data as HTML markup but instead must pass it on as
2866 * data to a script engine.
2867 * NOTES:
2868 * - The content is passed like CDATA
2869 * - the attributes for style and scripting "onXXX" are also described
2870 * as CDATA but SGML allows entities references in attributes so their
2871 * processing is identical as other attributes
2872 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002873static void
Owen Taylor3473f882001-02-23 17:55:21 +00002874htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002875 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002876 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002877 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002878
2879 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002880 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002881 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002882 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002883 /*
2884 * One should break here, the specification is clear:
2885 * Authors should therefore escape "</" within the content.
2886 * Escape mechanisms are specific to each scripting or
2887 * style sheet language.
2888 *
2889 * In recovery mode, only break if end tag match the
2890 * current tag, effectively ignoring all tags inside the
2891 * script/style block and treating the entire block as
2892 * CDATA.
2893 */
2894 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002895 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2896 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002897 {
2898 break; /* while */
2899 } else {
2900 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002901 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002902 ctxt->name, NULL);
2903 }
2904 } else {
2905 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002906 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002907 {
2908 break; /* while */
2909 }
2910 }
Owen Taylor3473f882001-02-23 17:55:21 +00002911 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002912 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002913 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2914 if (ctxt->sax->cdataBlock!= NULL) {
2915 /*
2916 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2917 */
2918 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002919 } else if (ctxt->sax->characters != NULL) {
2920 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002921 }
2922 nbchar = 0;
2923 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002924 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002925 NEXTL(l);
2926 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002927 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002928
Daniel Veillard68716a72006-10-16 09:32:17 +00002929 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Pierre Belziled4b54472010-11-04 10:18:17 +01002930 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2931 "Invalid char in CDATA 0x%X\n", cur);
2932 if (ctxt->input->cur < ctxt->input->end) {
2933 NEXT;
2934 }
Owen Taylor3473f882001-02-23 17:55:21 +00002935 }
2936
2937 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2938 if (ctxt->sax->cdataBlock!= NULL) {
2939 /*
2940 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2941 */
2942 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002943 } else if (ctxt->sax->characters != NULL) {
2944 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002945 }
2946 }
2947}
2948
2949
2950/**
Daniel Veillard140c2512015-06-30 11:36:28 +08002951 * htmlParseCharDataInternal:
Owen Taylor3473f882001-02-23 17:55:21 +00002952 * @ctxt: an HTML parser context
Daniel Veillard140c2512015-06-30 11:36:28 +08002953 * @readahead: optional read ahead character in ascii range
Owen Taylor3473f882001-02-23 17:55:21 +00002954 *
2955 * parse a CharData section.
2956 * if we are within a CDATA section ']]>' marks an end of section.
2957 *
2958 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2959 */
2960
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002961static void
Daniel Veillard140c2512015-06-30 11:36:28 +08002962htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
2963 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
Owen Taylor3473f882001-02-23 17:55:21 +00002964 int nbchar = 0;
2965 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002966 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002967
Daniel Veillard140c2512015-06-30 11:36:28 +08002968 if (readahead)
2969 buf[nbchar++] = readahead;
2970
Owen Taylor3473f882001-02-23 17:55:21 +00002971 SHRINK;
2972 cur = CUR_CHAR(l);
2973 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002974 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002975 (cur != 0)) {
2976 if (!(IS_CHAR(cur))) {
2977 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2978 "Invalid char in CDATA 0x%X\n", cur);
2979 } else {
2980 COPY_BUF(l,buf,nbchar,cur);
2981 }
Owen Taylor3473f882001-02-23 17:55:21 +00002982 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2983 /*
2984 * Ok the segment is to be consumed as chars.
2985 */
2986 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2987 if (areBlanks(ctxt, buf, nbchar)) {
Daniel Veillardf933c892012-09-07 19:32:12 +08002988 if (ctxt->keepBlanks) {
2989 if (ctxt->sax->characters != NULL)
2990 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2991 } else {
2992 if (ctxt->sax->ignorableWhitespace != NULL)
2993 ctxt->sax->ignorableWhitespace(ctxt->userData,
2994 buf, nbchar);
2995 }
Owen Taylor3473f882001-02-23 17:55:21 +00002996 } else {
2997 htmlCheckParagraph(ctxt);
2998 if (ctxt->sax->characters != NULL)
2999 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3000 }
3001 }
3002 nbchar = 0;
3003 }
3004 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00003005 chunk++;
3006 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3007 chunk = 0;
3008 SHRINK;
3009 GROW;
3010 }
Owen Taylor3473f882001-02-23 17:55:21 +00003011 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00003012 if (cur == 0) {
3013 SHRINK;
3014 GROW;
3015 cur = CUR_CHAR(l);
3016 }
Owen Taylor3473f882001-02-23 17:55:21 +00003017 }
3018 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00003019 buf[nbchar] = 0;
3020
Owen Taylor3473f882001-02-23 17:55:21 +00003021 /*
3022 * Ok the segment is to be consumed as chars.
3023 */
3024 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3025 if (areBlanks(ctxt, buf, nbchar)) {
Daniel Veillardf933c892012-09-07 19:32:12 +08003026 if (ctxt->keepBlanks) {
3027 if (ctxt->sax->characters != NULL)
3028 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3029 } else {
3030 if (ctxt->sax->ignorableWhitespace != NULL)
3031 ctxt->sax->ignorableWhitespace(ctxt->userData,
3032 buf, nbchar);
3033 }
Owen Taylor3473f882001-02-23 17:55:21 +00003034 } else {
3035 htmlCheckParagraph(ctxt);
3036 if (ctxt->sax->characters != NULL)
3037 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3038 }
3039 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00003040 } else {
3041 /*
3042 * Loop detection
3043 */
3044 if (cur == 0)
3045 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00003046 }
3047}
3048
3049/**
Daniel Veillard140c2512015-06-30 11:36:28 +08003050 * htmlParseCharData:
3051 * @ctxt: an HTML parser context
3052 *
3053 * parse a CharData section.
3054 * if we are within a CDATA section ']]>' marks an end of section.
3055 *
3056 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3057 */
3058
3059static void
3060htmlParseCharData(htmlParserCtxtPtr ctxt) {
3061 htmlParseCharDataInternal(ctxt, 0);
3062}
3063
3064/**
Owen Taylor3473f882001-02-23 17:55:21 +00003065 * htmlParseExternalID:
3066 * @ctxt: an HTML parser context
3067 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00003068 *
3069 * Parse an External ID or a Public ID
3070 *
Owen Taylor3473f882001-02-23 17:55:21 +00003071 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3072 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3073 *
3074 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3075 *
3076 * Returns the function returns SystemLiteral and in the second
3077 * case publicID receives PubidLiteral, is strict is off
3078 * it is possible to return NULL and have publicID set.
3079 */
3080
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003081static xmlChar *
3082htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00003083 xmlChar *URI = NULL;
3084
3085 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3086 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3087 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3088 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003089 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003090 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3091 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003092 }
3093 SKIP_BLANKS;
3094 URI = htmlParseSystemLiteral(ctxt);
3095 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003096 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3097 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003098 }
3099 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3100 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3101 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3102 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003103 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003104 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3105 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003106 }
3107 SKIP_BLANKS;
3108 *publicID = htmlParsePubidLiteral(ctxt);
3109 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003110 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3111 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3112 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003113 }
3114 SKIP_BLANKS;
3115 if ((CUR == '"') || (CUR == '\'')) {
3116 URI = htmlParseSystemLiteral(ctxt);
3117 }
3118 }
3119 return(URI);
3120}
3121
3122/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003123 * xmlParsePI:
3124 * @ctxt: an XML parser context
3125 *
3126 * parse an XML Processing Instruction.
3127 *
3128 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3129 */
3130static void
3131htmlParsePI(htmlParserCtxtPtr ctxt) {
3132 xmlChar *buf = NULL;
3133 int len = 0;
3134 int size = HTML_PARSER_BUFFER_SIZE;
3135 int cur, l;
3136 const xmlChar *target;
3137 xmlParserInputState state;
3138 int count = 0;
3139
3140 if ((RAW == '<') && (NXT(1) == '?')) {
3141 state = ctxt->instate;
3142 ctxt->instate = XML_PARSER_PI;
3143 /*
3144 * this is a Processing Instruction.
3145 */
3146 SKIP(2);
3147 SHRINK;
3148
3149 /*
3150 * Parse the target name and check for special support like
3151 * namespace.
3152 */
3153 target = htmlParseName(ctxt);
3154 if (target != NULL) {
3155 if (RAW == '>') {
3156 SKIP(1);
3157
3158 /*
3159 * SAX: PI detected.
3160 */
3161 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3162 (ctxt->sax->processingInstruction != NULL))
3163 ctxt->sax->processingInstruction(ctxt->userData,
3164 target, NULL);
3165 ctxt->instate = state;
3166 return;
3167 }
3168 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3169 if (buf == NULL) {
3170 htmlErrMemory(ctxt, NULL);
3171 ctxt->instate = state;
3172 return;
3173 }
3174 cur = CUR;
3175 if (!IS_BLANK(cur)) {
3176 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3177 "ParsePI: PI %s space expected\n", target, NULL);
3178 }
3179 SKIP_BLANKS;
3180 cur = CUR_CHAR(l);
3181 while (IS_CHAR(cur) && (cur != '>')) {
3182 if (len + 5 >= size) {
3183 xmlChar *tmp;
3184
3185 size *= 2;
3186 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3187 if (tmp == NULL) {
3188 htmlErrMemory(ctxt, NULL);
3189 xmlFree(buf);
3190 ctxt->instate = state;
3191 return;
3192 }
3193 buf = tmp;
3194 }
3195 count++;
3196 if (count > 50) {
3197 GROW;
3198 count = 0;
3199 }
3200 COPY_BUF(l,buf,len,cur);
3201 NEXTL(l);
3202 cur = CUR_CHAR(l);
3203 if (cur == 0) {
3204 SHRINK;
3205 GROW;
3206 cur = CUR_CHAR(l);
3207 }
3208 }
3209 buf[len] = 0;
3210 if (cur != '>') {
3211 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3212 "ParsePI: PI %s never end ...\n", target, NULL);
3213 } else {
3214 SKIP(1);
3215
3216 /*
3217 * SAX: PI detected.
3218 */
3219 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3220 (ctxt->sax->processingInstruction != NULL))
3221 ctxt->sax->processingInstruction(ctxt->userData,
3222 target, buf);
3223 }
3224 xmlFree(buf);
3225 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003226 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003227 "PI is not started correctly", NULL, NULL);
3228 }
3229 ctxt->instate = state;
3230 }
3231}
3232
3233/**
Owen Taylor3473f882001-02-23 17:55:21 +00003234 * htmlParseComment:
3235 * @ctxt: an HTML parser context
3236 *
3237 * Parse an XML (SGML) comment <!-- .... -->
3238 *
3239 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3240 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003241static void
Owen Taylor3473f882001-02-23 17:55:21 +00003242htmlParseComment(htmlParserCtxtPtr ctxt) {
3243 xmlChar *buf = NULL;
3244 int len;
3245 int size = HTML_PARSER_BUFFER_SIZE;
3246 int q, ql;
3247 int r, rl;
3248 int cur, l;
3249 xmlParserInputState state;
3250
3251 /*
3252 * Check that there is a comment right here.
3253 */
3254 if ((RAW != '<') || (NXT(1) != '!') ||
3255 (NXT(2) != '-') || (NXT(3) != '-')) return;
3256
3257 state = ctxt->instate;
3258 ctxt->instate = XML_PARSER_COMMENT;
3259 SHRINK;
3260 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003261 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003262 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003263 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003264 ctxt->instate = state;
3265 return;
3266 }
Daniel Veillarde7248792015-10-30 21:14:55 +08003267 len = 0;
3268 buf[len] = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003269 q = CUR_CHAR(ql);
Daniel Veillarde7248792015-10-30 21:14:55 +08003270 if (!IS_CHAR(q))
3271 goto unfinished;
Owen Taylor3473f882001-02-23 17:55:21 +00003272 NEXTL(ql);
3273 r = CUR_CHAR(rl);
Daniel Veillarde7248792015-10-30 21:14:55 +08003274 if (!IS_CHAR(r))
3275 goto unfinished;
Owen Taylor3473f882001-02-23 17:55:21 +00003276 NEXTL(rl);
3277 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00003278 while (IS_CHAR(cur) &&
3279 ((cur != '>') ||
3280 (r != '-') || (q != '-'))) {
3281 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003282 xmlChar *tmp;
3283
Owen Taylor3473f882001-02-23 17:55:21 +00003284 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003285 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3286 if (tmp == NULL) {
3287 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003288 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003289 ctxt->instate = state;
3290 return;
3291 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003292 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003293 }
3294 COPY_BUF(ql,buf,len,q);
3295 q = r;
3296 ql = rl;
3297 r = cur;
3298 rl = l;
3299 NEXTL(l);
3300 cur = CUR_CHAR(l);
3301 if (cur == 0) {
3302 SHRINK;
3303 GROW;
3304 cur = CUR_CHAR(l);
3305 }
3306 }
3307 buf[len] = 0;
Daniel Veillarde7248792015-10-30 21:14:55 +08003308 if (IS_CHAR(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003309 NEXT;
3310 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3311 (!ctxt->disableSAX))
3312 ctxt->sax->comment(ctxt->userData, buf);
3313 xmlFree(buf);
Daniel Veillarde7248792015-10-30 21:14:55 +08003314 ctxt->instate = state;
3315 return;
Owen Taylor3473f882001-02-23 17:55:21 +00003316 }
Daniel Veillarde7248792015-10-30 21:14:55 +08003317
3318unfinished:
3319 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3320 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3321 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00003322}
3323
3324/**
3325 * htmlParseCharRef:
3326 * @ctxt: an HTML parser context
3327 *
3328 * parse Reference declarations
3329 *
3330 * [66] CharRef ::= '&#' [0-9]+ ';' |
3331 * '&#x' [0-9a-fA-F]+ ';'
3332 *
3333 * Returns the value parsed (as an int)
3334 */
3335int
3336htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3337 int val = 0;
3338
Daniel Veillarda03e3652004-11-02 18:45:30 +00003339 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3340 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3341 "htmlParseCharRef: context error\n",
3342 NULL, NULL);
3343 return(0);
3344 }
Owen Taylor3473f882001-02-23 17:55:21 +00003345 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003346 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003347 SKIP(3);
3348 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003349 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003350 val = val * 16 + (CUR - '0');
3351 else if ((CUR >= 'a') && (CUR <= 'f'))
3352 val = val * 16 + (CUR - 'a') + 10;
3353 else if ((CUR >= 'A') && (CUR <= 'F'))
3354 val = val * 16 + (CUR - 'A') + 10;
3355 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003356 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003357 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003358 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003359 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003360 }
3361 NEXT;
3362 }
3363 if (CUR == ';')
3364 NEXT;
3365 } else if ((CUR == '&') && (NXT(1) == '#')) {
3366 SKIP(2);
3367 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003368 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003369 val = val * 10 + (CUR - '0');
3370 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003371 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003372 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003373 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003374 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003375 }
3376 NEXT;
3377 }
3378 if (CUR == ';')
3379 NEXT;
3380 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003381 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3382 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003383 }
3384 /*
3385 * Check the value IS_CHAR ...
3386 */
3387 if (IS_CHAR(val)) {
3388 return(val);
3389 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003390 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3391 "htmlParseCharRef: invalid xmlChar value %d\n",
3392 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003393 }
3394 return(0);
3395}
3396
3397
3398/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003399 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003400 * @ctxt: an HTML parser context
3401 *
3402 * parse a DOCTYPE declaration
3403 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003404 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003405 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3406 */
3407
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003408static void
Owen Taylor3473f882001-02-23 17:55:21 +00003409htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003410 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003411 xmlChar *ExternalID = NULL;
3412 xmlChar *URI = NULL;
3413
3414 /*
3415 * We know that '<!DOCTYPE' has been detected.
3416 */
3417 SKIP(9);
3418
3419 SKIP_BLANKS;
3420
3421 /*
3422 * Parse the DOCTYPE name.
3423 */
3424 name = htmlParseName(ctxt);
3425 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003426 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3427 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3428 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003429 }
3430 /*
3431 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3432 */
3433
3434 SKIP_BLANKS;
3435
3436 /*
3437 * Check for SystemID and ExternalID
3438 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003439 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003440 SKIP_BLANKS;
3441
3442 /*
3443 * We should be at the end of the DOCTYPE declaration.
3444 */
3445 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003446 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3447 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003448 /* We shouldn't try to resynchronize ... */
3449 }
3450 NEXT;
3451
3452 /*
3453 * Create or update the document accordingly to the DOCTYPE
3454 */
3455 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3456 (!ctxt->disableSAX))
3457 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3458
3459 /*
3460 * Cleanup, since we don't use all those identifiers
3461 */
3462 if (URI != NULL) xmlFree(URI);
3463 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003464}
3465
3466/**
3467 * htmlParseAttribute:
3468 * @ctxt: an HTML parser context
3469 * @value: a xmlChar ** used to store the value of the attribute
3470 *
3471 * parse an attribute
3472 *
3473 * [41] Attribute ::= Name Eq AttValue
3474 *
3475 * [25] Eq ::= S? '=' S?
3476 *
3477 * With namespace:
3478 *
3479 * [NS 11] Attribute ::= QName Eq AttValue
3480 *
3481 * Also the case QName == xmlns:??? is handled independently as a namespace
3482 * definition.
3483 *
3484 * Returns the attribute name, and the value in *value.
3485 */
3486
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003487static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003488htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003489 const xmlChar *name;
3490 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003491
3492 *value = NULL;
3493 name = htmlParseHTMLName(ctxt);
3494 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003495 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3496 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003497 return(NULL);
3498 }
3499
3500 /*
3501 * read the value
3502 */
3503 SKIP_BLANKS;
3504 if (CUR == '=') {
3505 NEXT;
3506 SKIP_BLANKS;
3507 val = htmlParseAttValue(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003508 }
3509
3510 *value = val;
3511 return(name);
3512}
3513
3514/**
Denis Pauk868d92d2012-05-10 15:34:57 +08003515 * htmlCheckEncodingDirect:
Owen Taylor3473f882001-02-23 17:55:21 +00003516 * @ctxt: an HTML parser context
3517 * @attvalue: the attribute value
3518 *
Denis Pauk868d92d2012-05-10 15:34:57 +08003519 * Checks an attribute value to detect
Owen Taylor3473f882001-02-23 17:55:21 +00003520 * the encoding
3521 * If a new encoding is detected the parser is switched to decode
3522 * it and pass UTF8
3523 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003524static void
Denis Pauk868d92d2012-05-10 15:34:57 +08003525htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
Owen Taylor3473f882001-02-23 17:55:21 +00003526
Denis Pauk868d92d2012-05-10 15:34:57 +08003527 if ((ctxt == NULL) || (encoding == NULL) ||
Daniel Veillardc62efc82011-05-16 16:03:50 +08003528 (ctxt->options & HTML_PARSE_IGNORE_ENC))
Owen Taylor3473f882001-02-23 17:55:21 +00003529 return;
3530
Daniel Veillarde77db162009-08-22 11:32:38 +02003531 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003532 if (ctxt->input->encoding != NULL)
3533 return;
3534
Owen Taylor3473f882001-02-23 17:55:21 +00003535 if (encoding != NULL) {
3536 xmlCharEncoding enc;
3537 xmlCharEncodingHandlerPtr handler;
3538
3539 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3540
3541 if (ctxt->input->encoding != NULL)
3542 xmlFree((xmlChar *) ctxt->input->encoding);
3543 ctxt->input->encoding = xmlStrdup(encoding);
3544
3545 enc = xmlParseCharEncoding((const char *) encoding);
3546 /*
3547 * registered set of known encodings
3548 */
3549 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003550 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003551 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3552 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3553 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3554 (ctxt->input->buf != NULL) &&
3555 (ctxt->input->buf->encoder == NULL)) {
3556 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3557 "htmlCheckEncoding: wrong encoding meta\n",
3558 NULL, NULL);
3559 } else {
3560 xmlSwitchEncoding(ctxt, enc);
3561 }
Owen Taylor3473f882001-02-23 17:55:21 +00003562 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3563 } else {
3564 /*
3565 * fallback for unknown encodings
3566 */
3567 handler = xmlFindCharEncodingHandler((const char *) encoding);
3568 if (handler != NULL) {
3569 xmlSwitchToEncoding(ctxt, handler);
3570 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3571 } else {
Daniel Veillardc62efc82011-05-16 16:03:50 +08003572 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3573 "htmlCheckEncoding: unknown encoding %s\n",
3574 encoding, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003575 }
3576 }
3577
3578 if ((ctxt->input->buf != NULL) &&
3579 (ctxt->input->buf->encoder != NULL) &&
3580 (ctxt->input->buf->raw != NULL) &&
3581 (ctxt->input->buf->buffer != NULL)) {
3582 int nbchars;
3583 int processed;
3584
3585 /*
3586 * convert as much as possible to the parser reading buffer.
3587 */
3588 processed = ctxt->input->cur - ctxt->input->base;
Daniel Veillarda78d8032012-07-16 14:56:50 +08003589 xmlBufShrink(ctxt->input->buf->buffer, processed);
Daniel Veillardbf058dc2013-02-13 18:19:42 +08003590 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
Owen Taylor3473f882001-02-23 17:55:21 +00003591 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003592 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3593 "htmlCheckEncoding: encoder error\n",
3594 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003595 }
Daniel Veillard61551a12012-07-16 16:28:47 +08003596 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
Owen Taylor3473f882001-02-23 17:55:21 +00003597 }
3598 }
3599}
3600
3601/**
Denis Pauk868d92d2012-05-10 15:34:57 +08003602 * htmlCheckEncoding:
3603 * @ctxt: an HTML parser context
3604 * @attvalue: the attribute value
3605 *
3606 * Checks an http-equiv attribute from a Meta tag to detect
3607 * the encoding
3608 * If a new encoding is detected the parser is switched to decode
3609 * it and pass UTF8
3610 */
3611static void
3612htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3613 const xmlChar *encoding;
3614
3615 if (!attvalue)
3616 return;
3617
3618 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3619 if (encoding != NULL) {
3620 encoding += 7;
3621 }
3622 /*
3623 * skip blank
3624 */
3625 if (encoding && IS_BLANK_CH(*encoding))
3626 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3627 if (encoding && *encoding == '=') {
3628 encoding ++;
3629 htmlCheckEncodingDirect(ctxt, encoding);
3630 }
3631}
3632
3633/**
Owen Taylor3473f882001-02-23 17:55:21 +00003634 * htmlCheckMeta:
3635 * @ctxt: an HTML parser context
3636 * @atts: the attributes values
3637 *
3638 * Checks an attributes from a Meta tag
3639 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003640static void
Owen Taylor3473f882001-02-23 17:55:21 +00003641htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3642 int i;
3643 const xmlChar *att, *value;
3644 int http = 0;
3645 const xmlChar *content = NULL;
3646
3647 if ((ctxt == NULL) || (atts == NULL))
3648 return;
3649
3650 i = 0;
3651 att = atts[i++];
3652 while (att != NULL) {
3653 value = atts[i++];
3654 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3655 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3656 http = 1;
Denis Pauk868d92d2012-05-10 15:34:57 +08003657 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3658 htmlCheckEncodingDirect(ctxt, value);
Owen Taylor3473f882001-02-23 17:55:21 +00003659 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3660 content = value;
3661 att = atts[i++];
3662 }
3663 if ((http) && (content != NULL))
3664 htmlCheckEncoding(ctxt, content);
3665
3666}
3667
3668/**
3669 * htmlParseStartTag:
3670 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003671 *
Owen Taylor3473f882001-02-23 17:55:21 +00003672 * parse a start of tag either for rule element or
3673 * EmptyElement. In both case we don't parse the tag closing chars.
3674 *
3675 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3676 *
3677 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3678 *
3679 * With namespace:
3680 *
3681 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3682 *
3683 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3684 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003685 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003686 */
3687
Daniel Veillard597f1c12005-07-03 23:00:18 +00003688static int
Owen Taylor3473f882001-02-23 17:55:21 +00003689htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003690 const xmlChar *name;
3691 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003692 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003693 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003694 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003695 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003696 int meta = 0;
3697 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003698 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003699
Daniel Veillarda03e3652004-11-02 18:45:30 +00003700 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3701 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3702 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003703 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003704 }
Gaurav3e0eec42014-06-13 14:45:20 +08003705 if (ctxt->instate == XML_PARSER_EOF)
3706 return(-1);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003707 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003708 NEXT;
3709
Daniel Veillard30e76072006-03-09 14:13:55 +00003710 atts = ctxt->atts;
3711 maxatts = ctxt->maxatts;
3712
Owen Taylor3473f882001-02-23 17:55:21 +00003713 GROW;
3714 name = htmlParseHTMLName(ctxt);
3715 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003716 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3717 "htmlParseStartTag: invalid element name\n",
3718 NULL, NULL);
Daniel Veillard140c2512015-06-30 11:36:28 +08003719 /* if recover preserve text on classic misconstructs */
3720 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3721 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3722 htmlParseCharDataInternal(ctxt, '<');
3723 return(-1);
3724 }
3725
3726
Owen Taylor3473f882001-02-23 17:55:21 +00003727 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003728 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3729 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003730 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003731 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003732 }
3733 if (xmlStrEqual(name, BAD_CAST"meta"))
3734 meta = 1;
3735
3736 /*
3737 * Check for auto-closure of HTML elements.
3738 */
3739 htmlAutoClose(ctxt, name);
3740
3741 /*
3742 * Check for implied HTML elements.
3743 */
3744 htmlCheckImplied(ctxt, name);
3745
3746 /*
3747 * Avoid html at any level > 0, head at any level != 1
3748 * or any attempt to recurse body
3749 */
3750 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003751 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3752 "htmlParseStartTag: misplaced <html> tag\n",
3753 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003754 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003755 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003756 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003757 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003758 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003759 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3760 "htmlParseStartTag: misplaced <head> tag\n",
3761 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003762 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003763 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003764 }
3765 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003766 int indx;
3767 for (indx = 0;indx < ctxt->nameNr;indx++) {
3768 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003769 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3770 "htmlParseStartTag: misplaced <body> tag\n",
3771 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003772 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003773 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003774 }
3775 }
3776 }
3777
3778 /*
3779 * Now parse the attributes, it ends up with the ending
3780 *
3781 * (S Attribute)* S?
3782 */
3783 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003784 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003785 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003786 ((CUR != '/') || (NXT(1) != '>'))) {
3787 long cons = ctxt->nbChars;
3788
3789 GROW;
3790 attname = htmlParseAttribute(ctxt, &attvalue);
3791 if (attname != NULL) {
3792
3793 /*
3794 * Well formedness requires at most one declaration of an attribute
3795 */
3796 for (i = 0; i < nbatts;i += 2) {
3797 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003798 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3799 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003800 if (attvalue != NULL)
3801 xmlFree(attvalue);
3802 goto failed;
3803 }
3804 }
3805
3806 /*
3807 * Add the pair to atts
3808 */
3809 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003810 maxatts = 22; /* allow for 10 attrs by default */
3811 atts = (const xmlChar **)
3812 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003813 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003814 htmlErrMemory(ctxt, NULL);
3815 if (attvalue != NULL)
3816 xmlFree(attvalue);
3817 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003818 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003819 ctxt->atts = atts;
3820 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003821 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003822 const xmlChar **n;
3823
Owen Taylor3473f882001-02-23 17:55:21 +00003824 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003825 n = (const xmlChar **) xmlRealloc((void *) atts,
3826 maxatts * sizeof(const xmlChar *));
3827 if (n == NULL) {
3828 htmlErrMemory(ctxt, NULL);
3829 if (attvalue != NULL)
3830 xmlFree(attvalue);
3831 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003832 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003833 atts = n;
3834 ctxt->atts = atts;
3835 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003836 }
3837 atts[nbatts++] = attname;
3838 atts[nbatts++] = attvalue;
3839 atts[nbatts] = NULL;
3840 atts[nbatts + 1] = NULL;
3841 }
3842 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003843 if (attvalue != NULL)
3844 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003845 /* Dump the bogus attribute string up to the next blank or
3846 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003847 while ((IS_CHAR_CH(CUR)) &&
3848 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003849 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003850 NEXT;
3851 }
3852
3853failed:
3854 SKIP_BLANKS;
3855 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003856 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3857 "htmlParseStartTag: problem parsing attributes\n",
3858 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003859 break;
3860 }
3861 }
3862
3863 /*
3864 * Handle specific association to the META tag
3865 */
William M. Bracke978ae22007-03-21 06:16:02 +00003866 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003867 htmlCheckMeta(ctxt, atts);
3868
3869 /*
3870 * SAX: Start of Element !
3871 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003872 if (!discardtag) {
3873 htmlnamePush(ctxt, name);
3874 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3875 if (nbatts != 0)
3876 ctxt->sax->startElement(ctxt->userData, name, atts);
3877 else
3878 ctxt->sax->startElement(ctxt->userData, name, NULL);
3879 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003880 }
Owen Taylor3473f882001-02-23 17:55:21 +00003881
3882 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003883 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003884 if (atts[i] != NULL)
3885 xmlFree((xmlChar *) atts[i]);
3886 }
Owen Taylor3473f882001-02-23 17:55:21 +00003887 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003888
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003889 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003890}
3891
3892/**
3893 * htmlParseEndTag:
3894 * @ctxt: an HTML parser context
3895 *
3896 * parse an end of tag
3897 *
3898 * [42] ETag ::= '</' Name S? '>'
3899 *
3900 * With namespace
3901 *
3902 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003903 *
3904 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003905 */
3906
Daniel Veillardf420ac52001-07-04 16:04:09 +00003907static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003908htmlParseEndTag(htmlParserCtxtPtr ctxt)
3909{
3910 const xmlChar *name;
3911 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003912 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003913
3914 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003915 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3916 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003917 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003918 }
3919 SKIP(2);
3920
3921 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003922 if (name == NULL)
3923 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003924 /*
3925 * We should definitely be at the ending "S? '>'" part
3926 */
3927 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003928 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003929 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3930 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003931 if (ctxt->recovery) {
3932 /*
3933 * We're not at the ending > !!
3934 * Error, unless in recover mode where we search forwards
3935 * until we find a >
3936 */
3937 while (CUR != '\0' && CUR != '>') NEXT;
3938 NEXT;
3939 }
Owen Taylor3473f882001-02-23 17:55:21 +00003940 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003941 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003942
3943 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003944 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3945 * out now.
3946 */
3947 if ((ctxt->depth > 0) &&
3948 (xmlStrEqual(name, BAD_CAST "html") ||
3949 xmlStrEqual(name, BAD_CAST "body") ||
3950 xmlStrEqual(name, BAD_CAST "head"))) {
3951 ctxt->depth--;
3952 return (0);
3953 }
3954
3955 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003956 * If the name read is not one of the element in the parsing stack
3957 * then return, it's just an error.
3958 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003959 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3960 if (xmlStrEqual(name, ctxt->nameTab[i]))
3961 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003962 }
3963 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003964 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3965 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003966 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003967 }
3968
3969
3970 /*
3971 * Check for auto-closure of HTML elements.
3972 */
3973
3974 htmlAutoCloseOnClose(ctxt, name);
3975
3976 /*
3977 * Well formedness constraints, opening and closing must match.
3978 * With the exception that the autoclose may have popped stuff out
3979 * of the stack.
3980 */
3981 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003982 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003983 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3984 "Opening and ending tag mismatch: %s and %s\n",
3985 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003986 }
3987 }
3988
3989 /*
3990 * SAX: End of Tag
3991 */
3992 oldname = ctxt->name;
3993 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003994 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3995 ctxt->sax->endElement(ctxt->userData, name);
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08003996 htmlNodeInfoPop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003997 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003998 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003999 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004000 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004001 }
4002
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004003 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00004004}
4005
4006
4007/**
4008 * htmlParseReference:
4009 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004010 *
Owen Taylor3473f882001-02-23 17:55:21 +00004011 * parse and handle entity references in content,
4012 * this will end-up in a call to character() since this is either a
4013 * CharRef, or a predefined entity.
4014 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004015static void
Owen Taylor3473f882001-02-23 17:55:21 +00004016htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00004017 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00004018 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004019 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004020 if (CUR != '&') return;
4021
4022 if (NXT(1) == '#') {
4023 unsigned int c;
4024 int bits, i = 0;
4025
4026 c = htmlParseCharRef(ctxt);
4027 if (c == 0)
4028 return;
4029
4030 if (c < 0x80) { out[i++]= c; bits= -6; }
4031 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4032 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4033 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02004034
Owen Taylor3473f882001-02-23 17:55:21 +00004035 for ( ; bits >= 0; bits-= 6) {
4036 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4037 }
4038 out[i] = 0;
4039
4040 htmlCheckParagraph(ctxt);
4041 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4042 ctxt->sax->characters(ctxt->userData, out, i);
4043 } else {
4044 ent = htmlParseEntityRef(ctxt, &name);
4045 if (name == NULL) {
4046 htmlCheckParagraph(ctxt);
4047 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4048 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4049 return;
4050 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00004051 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004052 htmlCheckParagraph(ctxt);
4053 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4054 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4055 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4056 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4057 }
4058 } else {
4059 unsigned int c;
4060 int bits, i = 0;
4061
4062 c = ent->value;
4063 if (c < 0x80)
4064 { out[i++]= c; bits= -6; }
4065 else if (c < 0x800)
4066 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4067 else if (c < 0x10000)
4068 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02004069 else
Owen Taylor3473f882001-02-23 17:55:21 +00004070 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02004071
Owen Taylor3473f882001-02-23 17:55:21 +00004072 for ( ; bits >= 0; bits-= 6) {
4073 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4074 }
4075 out[i] = 0;
4076
4077 htmlCheckParagraph(ctxt);
4078 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4079 ctxt->sax->characters(ctxt->userData, out, i);
4080 }
Owen Taylor3473f882001-02-23 17:55:21 +00004081 }
4082}
4083
4084/**
4085 * htmlParseContent:
4086 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00004087 *
4088 * Parse a content: comment, sub-element, reference or text.
Eugene Pimenov615904f2010-03-15 15:16:02 +01004089 * Kept for compatibility with old code
Owen Taylor3473f882001-02-23 17:55:21 +00004090 */
4091
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004092static void
Owen Taylor3473f882001-02-23 17:55:21 +00004093htmlParseContent(htmlParserCtxtPtr ctxt) {
4094 xmlChar *currentNode;
4095 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004096 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004097
4098 currentNode = xmlStrdup(ctxt->name);
4099 depth = ctxt->nameNr;
4100 while (1) {
4101 long cons = ctxt->nbChars;
4102
4103 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02004104
4105 if (ctxt->instate == XML_PARSER_EOF)
4106 break;
4107
Owen Taylor3473f882001-02-23 17:55:21 +00004108 /*
4109 * Our tag or one of it's parent or children is ending.
4110 */
4111 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00004112 if (htmlParseEndTag(ctxt) &&
4113 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4114 if (currentNode != NULL)
4115 xmlFree(currentNode);
4116 return;
4117 }
4118 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00004119 }
4120
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004121 else if ((CUR == '<') &&
4122 ((IS_ASCII_LETTER(NXT(1))) ||
4123 (NXT(1) == '_') || (NXT(1) == ':'))) {
4124 name = htmlParseHTMLName_nonInvasive(ctxt);
4125 if (name == NULL) {
4126 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4127 "htmlParseStartTag: invalid element name\n",
4128 NULL, NULL);
4129 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02004130 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004131 NEXT;
4132
4133 if (currentNode != NULL)
4134 xmlFree(currentNode);
4135 return;
4136 }
4137
4138 if (ctxt->name != NULL) {
4139 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4140 htmlAutoClose(ctxt, name);
4141 continue;
4142 }
Daniel Veillarde77db162009-08-22 11:32:38 +02004143 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004144 }
4145
Owen Taylor3473f882001-02-23 17:55:21 +00004146 /*
4147 * Has this node been popped out during parsing of
4148 * the next element
4149 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00004150 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4151 (!xmlStrEqual(currentNode, ctxt->name)))
4152 {
Owen Taylor3473f882001-02-23 17:55:21 +00004153 if (currentNode != NULL) xmlFree(currentNode);
4154 return;
4155 }
4156
Daniel Veillardf9533d12001-03-03 10:04:57 +00004157 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4158 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004159 /*
4160 * Handle SCRIPT/STYLE separately
4161 */
4162 htmlParseScript(ctxt);
4163 } else {
4164 /*
4165 * Sometimes DOCTYPE arrives in the middle of the document
4166 */
4167 if ((CUR == '<') && (NXT(1) == '!') &&
4168 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4169 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4170 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4171 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004172 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4173 "Misplaced DOCTYPE declaration\n",
4174 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004175 htmlParseDocTypeDecl(ctxt);
4176 }
4177
4178 /*
4179 * First case : a comment
4180 */
4181 if ((CUR == '<') && (NXT(1) == '!') &&
4182 (NXT(2) == '-') && (NXT(3) == '-')) {
4183 htmlParseComment(ctxt);
4184 }
4185
4186 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004187 * Second case : a Processing Instruction.
4188 */
4189 else if ((CUR == '<') && (NXT(1) == '?')) {
4190 htmlParsePI(ctxt);
4191 }
4192
4193 /*
4194 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004195 */
4196 else if (CUR == '<') {
4197 htmlParseElement(ctxt);
4198 }
4199
4200 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004201 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004202 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004203 */
4204 else if (CUR == '&') {
4205 htmlParseReference(ctxt);
4206 }
4207
4208 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004209 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004210 */
4211 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004212 htmlAutoCloseOnEnd(ctxt);
4213 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004214 }
4215
4216 /*
4217 * Last case, text. Note that References are handled directly.
4218 */
4219 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004220 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004221 }
4222
4223 if (cons == ctxt->nbChars) {
4224 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004225 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4226 "detected an error in element content\n",
4227 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004228 }
4229 break;
4230 }
4231 }
4232 GROW;
4233 }
4234 if (currentNode != NULL) xmlFree(currentNode);
4235}
4236
4237/**
4238 * htmlParseElement:
4239 * @ctxt: an HTML parser context
4240 *
4241 * parse an HTML element, this is highly recursive
Eugene Pimenov615904f2010-03-15 15:16:02 +01004242 * this is kept for compatibility with previous code versions
Owen Taylor3473f882001-02-23 17:55:21 +00004243 *
4244 * [39] element ::= EmptyElemTag | STag content ETag
4245 *
4246 * [41] Attribute ::= Name Eq AttValue
4247 */
4248
4249void
4250htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004251 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004252 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004253 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004254 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004255 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004256 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004257 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004258
Daniel Veillarda03e3652004-11-02 18:45:30 +00004259 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4260 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004261 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004262 return;
4263 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004264
4265 if (ctxt->instate == XML_PARSER_EOF)
4266 return;
4267
Owen Taylor3473f882001-02-23 17:55:21 +00004268 /* Capture start position */
4269 if (ctxt->record_info) {
4270 node_info.begin_pos = ctxt->input->consumed +
4271 (CUR_PTR - ctxt->input->base);
4272 node_info.begin_line = ctxt->input->line;
4273 }
4274
Daniel Veillard597f1c12005-07-03 23:00:18 +00004275 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004276 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004277 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004278 if (CUR == '>')
4279 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004280 return;
4281 }
Owen Taylor3473f882001-02-23 17:55:21 +00004282
4283 /*
4284 * Lookup the info for that element.
4285 */
4286 info = htmlTagLookup(name);
4287 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004288 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4289 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004290 }
4291
4292 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004293 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004294 */
4295 if ((CUR == '/') && (NXT(1) == '>')) {
4296 SKIP(2);
4297 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4298 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004299 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004300 return;
4301 }
4302
4303 if (CUR == '>') {
4304 NEXT;
4305 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004306 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4307 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004308
4309 /*
4310 * end of parsing of this node.
4311 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004312 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004313 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004314 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004315 }
Owen Taylor3473f882001-02-23 17:55:21 +00004316
4317 /*
4318 * Capture end position and add node
4319 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004320 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004321 node_info.end_pos = ctxt->input->consumed +
4322 (CUR_PTR - ctxt->input->base);
4323 node_info.end_line = ctxt->input->line;
4324 node_info.node = ctxt->node;
4325 xmlParserAddNodeInfo(ctxt, &node_info);
4326 }
4327 return;
4328 }
4329
4330 /*
4331 * Check for an Empty Element from DTD definition
4332 */
4333 if ((info != NULL) && (info->empty)) {
4334 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4335 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004336 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004337 return;
4338 }
4339
4340 /*
4341 * Parse the content of the element:
4342 */
4343 currentNode = xmlStrdup(ctxt->name);
4344 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004345 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004346 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004347 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004348 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004349 if (ctxt->nameNr < depth) break;
4350 }
Owen Taylor3473f882001-02-23 17:55:21 +00004351
Owen Taylor3473f882001-02-23 17:55:21 +00004352 /*
4353 * Capture end position and add node
4354 */
4355 if ( currentNode != NULL && ctxt->record_info ) {
4356 node_info.end_pos = ctxt->input->consumed +
4357 (CUR_PTR - ctxt->input->base);
4358 node_info.end_line = ctxt->input->line;
4359 node_info.node = ctxt->node;
4360 xmlParserAddNodeInfo(ctxt, &node_info);
4361 }
William M. Brack76e95df2003-10-18 16:20:14 +00004362 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004363 htmlAutoCloseOnEnd(ctxt);
4364 }
4365
Owen Taylor3473f882001-02-23 17:55:21 +00004366 if (currentNode != NULL)
4367 xmlFree(currentNode);
4368}
4369
Eugene Pimenov615904f2010-03-15 15:16:02 +01004370static void
4371htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4372 /*
4373 * Capture end position and add node
4374 */
4375 if ( ctxt->node != NULL && ctxt->record_info ) {
4376 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4377 (CUR_PTR - ctxt->input->base);
4378 ctxt->nodeInfo->end_line = ctxt->input->line;
4379 ctxt->nodeInfo->node = ctxt->node;
4380 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4381 htmlNodeInfoPop(ctxt);
4382 }
4383 if (!IS_CHAR_CH(CUR)) {
4384 htmlAutoCloseOnEnd(ctxt);
4385 }
4386}
4387
4388/**
4389 * htmlParseElementInternal:
4390 * @ctxt: an HTML parser context
4391 *
4392 * parse an HTML element, new version, non recursive
4393 *
4394 * [39] element ::= EmptyElemTag | STag content ETag
4395 *
4396 * [41] Attribute ::= Name Eq AttValue
4397 */
4398
4399static void
4400htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4401 const xmlChar *name;
4402 const htmlElemDesc * info;
Philip Withnall579ebbc2014-06-20 21:03:42 +01004403 htmlParserNodeInfo node_info = { 0, };
Eugene Pimenov615904f2010-03-15 15:16:02 +01004404 int failed;
Eugene Pimenov615904f2010-03-15 15:16:02 +01004405
4406 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4407 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4408 "htmlParseElementInternal: context error\n", NULL, NULL);
4409 return;
4410 }
4411
4412 if (ctxt->instate == XML_PARSER_EOF)
4413 return;
4414
4415 /* Capture start position */
4416 if (ctxt->record_info) {
4417 node_info.begin_pos = ctxt->input->consumed +
4418 (CUR_PTR - ctxt->input->base);
4419 node_info.begin_line = ctxt->input->line;
4420 }
4421
4422 failed = htmlParseStartTag(ctxt);
4423 name = ctxt->name;
4424 if ((failed == -1) || (name == NULL)) {
4425 if (CUR == '>')
4426 NEXT;
4427 return;
4428 }
4429
4430 /*
4431 * Lookup the info for that element.
4432 */
4433 info = htmlTagLookup(name);
4434 if (info == NULL) {
4435 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4436 "Tag %s invalid\n", name, NULL);
4437 }
4438
4439 /*
4440 * Check for an Empty Element labeled the XML/SGML way
4441 */
4442 if ((CUR == '/') && (NXT(1) == '>')) {
4443 SKIP(2);
4444 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4445 ctxt->sax->endElement(ctxt->userData, name);
4446 htmlnamePop(ctxt);
4447 return;
4448 }
4449
4450 if (CUR == '>') {
4451 NEXT;
4452 } else {
4453 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4454 "Couldn't find end of Start Tag %s\n", name, NULL);
4455
4456 /*
4457 * end of parsing of this node.
4458 */
4459 if (xmlStrEqual(name, ctxt->name)) {
4460 nodePop(ctxt);
4461 htmlnamePop(ctxt);
4462 }
4463
4464 if (ctxt->record_info)
4465 htmlNodeInfoPush(ctxt, &node_info);
4466 htmlParserFinishElementParsing(ctxt);
4467 return;
4468 }
4469
4470 /*
4471 * Check for an Empty Element from DTD definition
4472 */
4473 if ((info != NULL) && (info->empty)) {
4474 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4475 ctxt->sax->endElement(ctxt->userData, name);
4476 htmlnamePop(ctxt);
4477 return;
4478 }
4479
4480 if (ctxt->record_info)
4481 htmlNodeInfoPush(ctxt, &node_info);
4482}
4483
4484/**
4485 * htmlParseContentInternal:
4486 * @ctxt: an HTML parser context
4487 *
4488 * Parse a content: comment, sub-element, reference or text.
4489 * New version for non recursive htmlParseElementInternal
4490 */
4491
4492static void
4493htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4494 xmlChar *currentNode;
4495 int depth;
4496 const xmlChar *name;
4497
4498 currentNode = xmlStrdup(ctxt->name);
4499 depth = ctxt->nameNr;
4500 while (1) {
4501 long cons = ctxt->nbChars;
4502
4503 GROW;
4504
4505 if (ctxt->instate == XML_PARSER_EOF)
4506 break;
4507
4508 /*
4509 * Our tag or one of it's parent or children is ending.
4510 */
4511 if ((CUR == '<') && (NXT(1) == '/')) {
4512 if (htmlParseEndTag(ctxt) &&
4513 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4514 if (currentNode != NULL)
4515 xmlFree(currentNode);
4516
4517 currentNode = xmlStrdup(ctxt->name);
4518 depth = ctxt->nameNr;
4519 }
4520 continue; /* while */
4521 }
4522
4523 else if ((CUR == '<') &&
4524 ((IS_ASCII_LETTER(NXT(1))) ||
4525 (NXT(1) == '_') || (NXT(1) == ':'))) {
4526 name = htmlParseHTMLName_nonInvasive(ctxt);
4527 if (name == NULL) {
4528 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4529 "htmlParseStartTag: invalid element name\n",
4530 NULL, NULL);
4531 /* Dump the bogus tag like browsers do */
4532 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4533 NEXT;
4534
4535 htmlParserFinishElementParsing(ctxt);
4536 if (currentNode != NULL)
4537 xmlFree(currentNode);
4538
4539 currentNode = xmlStrdup(ctxt->name);
4540 depth = ctxt->nameNr;
4541 continue;
4542 }
4543
4544 if (ctxt->name != NULL) {
4545 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4546 htmlAutoClose(ctxt, name);
4547 continue;
4548 }
4549 }
4550 }
4551
4552 /*
4553 * Has this node been popped out during parsing of
4554 * the next element
4555 */
4556 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4557 (!xmlStrEqual(currentNode, ctxt->name)))
4558 {
4559 htmlParserFinishElementParsing(ctxt);
4560 if (currentNode != NULL) xmlFree(currentNode);
4561
4562 currentNode = xmlStrdup(ctxt->name);
4563 depth = ctxt->nameNr;
4564 continue;
4565 }
4566
4567 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4568 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4569 /*
4570 * Handle SCRIPT/STYLE separately
4571 */
4572 htmlParseScript(ctxt);
4573 } else {
4574 /*
4575 * Sometimes DOCTYPE arrives in the middle of the document
4576 */
4577 if ((CUR == '<') && (NXT(1) == '!') &&
4578 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4579 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4580 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4581 (UPP(8) == 'E')) {
4582 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4583 "Misplaced DOCTYPE declaration\n",
4584 BAD_CAST "DOCTYPE" , NULL);
4585 htmlParseDocTypeDecl(ctxt);
4586 }
4587
4588 /*
4589 * First case : a comment
4590 */
4591 if ((CUR == '<') && (NXT(1) == '!') &&
4592 (NXT(2) == '-') && (NXT(3) == '-')) {
4593 htmlParseComment(ctxt);
4594 }
4595
4596 /*
4597 * Second case : a Processing Instruction.
4598 */
4599 else if ((CUR == '<') && (NXT(1) == '?')) {
4600 htmlParsePI(ctxt);
4601 }
4602
4603 /*
4604 * Third case : a sub-element.
4605 */
4606 else if (CUR == '<') {
4607 htmlParseElementInternal(ctxt);
4608 if (currentNode != NULL) xmlFree(currentNode);
4609
4610 currentNode = xmlStrdup(ctxt->name);
4611 depth = ctxt->nameNr;
4612 }
4613
4614 /*
4615 * Fourth case : a reference. If if has not been resolved,
4616 * parsing returns it's Name, create the node
4617 */
4618 else if (CUR == '&') {
4619 htmlParseReference(ctxt);
4620 }
4621
4622 /*
4623 * Fifth case : end of the resource
4624 */
4625 else if (CUR == 0) {
4626 htmlAutoCloseOnEnd(ctxt);
4627 break;
4628 }
4629
4630 /*
4631 * Last case, text. Note that References are handled directly.
4632 */
4633 else {
4634 htmlParseCharData(ctxt);
4635 }
4636
4637 if (cons == ctxt->nbChars) {
4638 if (ctxt->node != NULL) {
4639 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4640 "detected an error in element content\n",
4641 NULL, NULL);
4642 }
4643 break;
4644 }
4645 }
4646 GROW;
4647 }
4648 if (currentNode != NULL) xmlFree(currentNode);
4649}
4650
4651/**
4652 * htmlParseContent:
4653 * @ctxt: an HTML parser context
4654 *
4655 * Parse a content: comment, sub-element, reference or text.
4656 * This is the entry point when called from parser.c
4657 */
4658
4659void
4660__htmlParseContent(void *ctxt) {
4661 if (ctxt != NULL)
4662 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4663}
4664
Owen Taylor3473f882001-02-23 17:55:21 +00004665/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004666 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004667 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004668 *
Owen Taylor3473f882001-02-23 17:55:21 +00004669 * parse an HTML document (and build a tree if using the standard SAX
4670 * interface).
4671 *
4672 * Returns 0, -1 in case of error. the parser context is augmented
4673 * as a result of the parsing.
4674 */
4675
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004676int
Owen Taylor3473f882001-02-23 17:55:21 +00004677htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004678 xmlChar start[4];
4679 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004680 xmlDtdPtr dtd;
4681
Daniel Veillardd0463562001-10-13 09:15:48 +00004682 xmlInitParser();
4683
Owen Taylor3473f882001-02-23 17:55:21 +00004684 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004685
Daniel Veillarda03e3652004-11-02 18:45:30 +00004686 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4687 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4688 "htmlParseDocument: context error\n", NULL, NULL);
4689 return(XML_ERR_INTERNAL_ERROR);
4690 }
4691 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004692 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004693 GROW;
4694 /*
4695 * SAX: beginning of the document processing.
4696 */
4697 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4698 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4699
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004700 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4701 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4702 /*
4703 * Get the 4 first bytes and decode the charset
4704 * if enc != XML_CHAR_ENCODING_NONE
4705 * plug some encoding conversion routines.
4706 */
4707 start[0] = RAW;
4708 start[1] = NXT(1);
4709 start[2] = NXT(2);
4710 start[3] = NXT(3);
4711 enc = xmlDetectCharEncoding(&start[0], 4);
4712 if (enc != XML_CHAR_ENCODING_NONE) {
4713 xmlSwitchEncoding(ctxt, enc);
4714 }
4715 }
4716
Owen Taylor3473f882001-02-23 17:55:21 +00004717 /*
4718 * Wipe out everything which is before the first '<'
4719 */
4720 SKIP_BLANKS;
4721 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004722 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004723 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004724 }
4725
4726 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4727 ctxt->sax->startDocument(ctxt->userData);
4728
4729
4730 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004731 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004732 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004733 while (((CUR == '<') && (NXT(1) == '!') &&
4734 (NXT(2) == '-') && (NXT(3) == '-')) ||
4735 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004736 htmlParseComment(ctxt);
4737 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004738 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004739 }
Owen Taylor3473f882001-02-23 17:55:21 +00004740
4741
4742 /*
4743 * Then possibly doc type declaration(s) and more Misc
4744 * (doctypedecl Misc*)?
4745 */
4746 if ((CUR == '<') && (NXT(1) == '!') &&
4747 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4748 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4749 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4750 (UPP(8) == 'E')) {
4751 htmlParseDocTypeDecl(ctxt);
4752 }
4753 SKIP_BLANKS;
4754
4755 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004756 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004757 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004758 while (((CUR == '<') && (NXT(1) == '!') &&
4759 (NXT(2) == '-') && (NXT(3) == '-')) ||
4760 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004761 htmlParseComment(ctxt);
4762 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004763 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004764 }
Owen Taylor3473f882001-02-23 17:55:21 +00004765
4766 /*
4767 * Time to start parsing the tree itself
4768 */
Eugene Pimenov615904f2010-03-15 15:16:02 +01004769 htmlParseContentInternal(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004770
4771 /*
4772 * autoclose
4773 */
4774 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004775 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004776
4777
4778 /*
4779 * SAX: end of the document processing.
4780 */
4781 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4782 ctxt->sax->endDocument(ctxt->userData);
4783
Daniel Veillardf1121c42010-07-26 14:02:42 +02004784 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004785 dtd = xmlGetIntSubset(ctxt->myDoc);
4786 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004787 ctxt->myDoc->intSubset =
4788 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004789 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4790 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4791 }
4792 if (! ctxt->wellFormed) return(-1);
4793 return(0);
4794}
4795
4796
4797/************************************************************************
4798 * *
4799 * Parser contexts handling *
4800 * *
4801 ************************************************************************/
4802
4803/**
William M. Brackedb65a72004-02-06 07:36:04 +00004804 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004805 * @ctxt: an HTML parser context
4806 *
4807 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004808 *
4809 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004810 */
4811
Daniel Veillardf403d292003-10-05 13:51:35 +00004812static int
Owen Taylor3473f882001-02-23 17:55:21 +00004813htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4814{
4815 htmlSAXHandler *sax;
4816
Daniel Veillardf403d292003-10-05 13:51:35 +00004817 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004818 memset(ctxt, 0, sizeof(htmlParserCtxt));
4819
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004820 ctxt->dict = xmlDictCreate();
4821 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004822 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4823 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004824 }
Owen Taylor3473f882001-02-23 17:55:21 +00004825 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4826 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004827 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4828 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004829 }
4830 else
4831 memset(sax, 0, sizeof(htmlSAXHandler));
4832
4833 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004834 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004835 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4836 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004837 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004838 ctxt->inputNr = 0;
4839 ctxt->inputMax = 0;
4840 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004841 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004842 }
4843 ctxt->inputNr = 0;
4844 ctxt->inputMax = 5;
4845 ctxt->input = NULL;
4846 ctxt->version = NULL;
4847 ctxt->encoding = NULL;
4848 ctxt->standalone = -1;
4849 ctxt->instate = XML_PARSER_START;
4850
4851 /* Allocate the Node stack */
4852 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4853 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004854 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004855 ctxt->nodeNr = 0;
4856 ctxt->nodeMax = 0;
4857 ctxt->node = NULL;
4858 ctxt->inputNr = 0;
4859 ctxt->inputMax = 0;
4860 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004861 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004862 }
4863 ctxt->nodeNr = 0;
4864 ctxt->nodeMax = 10;
4865 ctxt->node = NULL;
4866
4867 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004868 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004869 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004870 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004871 ctxt->nameNr = 0;
Eugene Pimenovef9c6362010-03-15 11:37:48 +01004872 ctxt->nameMax = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004873 ctxt->name = NULL;
4874 ctxt->nodeNr = 0;
4875 ctxt->nodeMax = 0;
4876 ctxt->node = NULL;
4877 ctxt->inputNr = 0;
4878 ctxt->inputMax = 0;
4879 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004880 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004881 }
4882 ctxt->nameNr = 0;
4883 ctxt->nameMax = 10;
4884 ctxt->name = NULL;
4885
Eugene Pimenov615904f2010-03-15 15:16:02 +01004886 ctxt->nodeInfoTab = NULL;
4887 ctxt->nodeInfoNr = 0;
4888 ctxt->nodeInfoMax = 0;
4889
Daniel Veillard092643b2003-09-25 14:29:29 +00004890 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004891 else {
4892 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004893 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004894 }
4895 ctxt->userData = ctxt;
4896 ctxt->myDoc = NULL;
4897 ctxt->wellFormed = 1;
4898 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004899 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004900 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004901 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004902 ctxt->vctxt.userData = ctxt;
4903 ctxt->vctxt.error = xmlParserValidityError;
4904 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004905 ctxt->record_info = 0;
4906 ctxt->validate = 0;
4907 ctxt->nbChars = 0;
4908 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004909 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004910 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004911 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004912}
4913
4914/**
4915 * htmlFreeParserCtxt:
4916 * @ctxt: an HTML parser context
4917 *
4918 * Free all the memory used by a parser context. However the parsed
4919 * document in ctxt->myDoc is not freed.
4920 */
4921
4922void
4923htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4924{
4925 xmlFreeParserCtxt(ctxt);
4926}
4927
4928/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004929 * htmlNewParserCtxt:
4930 *
4931 * Allocate and initialize a new parser context.
4932 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004933 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004934 */
4935
Daniel Veillard34c647c2006-09-21 06:53:59 +00004936htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004937htmlNewParserCtxt(void)
4938{
4939 xmlParserCtxtPtr ctxt;
4940
4941 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4942 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004943 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004944 return(NULL);
4945 }
4946 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004947 if (htmlInitParserCtxt(ctxt) < 0) {
4948 htmlFreeParserCtxt(ctxt);
4949 return(NULL);
4950 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004951 return(ctxt);
4952}
4953
4954/**
4955 * htmlCreateMemoryParserCtxt:
4956 * @buffer: a pointer to a char array
4957 * @size: the size of the array
4958 *
4959 * Create a parser context for an HTML in-memory document.
4960 *
4961 * Returns the new parser context or NULL
4962 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004963htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004964htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4965 xmlParserCtxtPtr ctxt;
4966 xmlParserInputPtr input;
4967 xmlParserInputBufferPtr buf;
4968
4969 if (buffer == NULL)
4970 return(NULL);
4971 if (size <= 0)
4972 return(NULL);
4973
4974 ctxt = htmlNewParserCtxt();
4975 if (ctxt == NULL)
4976 return(NULL);
4977
4978 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4979 if (buf == NULL) return(NULL);
4980
4981 input = xmlNewInputStream(ctxt);
4982 if (input == NULL) {
4983 xmlFreeParserCtxt(ctxt);
4984 return(NULL);
4985 }
4986
4987 input->filename = NULL;
4988 input->buf = buf;
Daniel Veillard61551a12012-07-16 16:28:47 +08004989 xmlBufResetInput(buf->buffer, input);
Daniel Veillard1d995272002-07-22 16:43:32 +00004990
4991 inputPush(ctxt, input);
4992 return(ctxt);
4993}
4994
4995/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004996 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004997 * @cur: a pointer to an array of xmlChar
4998 * @encoding: a free form C string describing the HTML document encoding, or NULL
4999 *
5000 * Create a parser context for an HTML document.
5001 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005002 * TODO: check the need to add encoding handling there
5003 *
Owen Taylor3473f882001-02-23 17:55:21 +00005004 * Returns the new parser context or NULL
5005 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005006static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00005007htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00005008 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005009 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00005010
Daniel Veillard1d995272002-07-22 16:43:32 +00005011 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00005012 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00005013 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005014 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00005015 if (ctxt == NULL)
5016 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005017
5018 if (encoding != NULL) {
5019 xmlCharEncoding enc;
5020 xmlCharEncodingHandlerPtr handler;
5021
5022 if (ctxt->input->encoding != NULL)
5023 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00005024 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005025
5026 enc = xmlParseCharEncoding(encoding);
5027 /*
5028 * registered set of known encodings
5029 */
5030 if (enc != XML_CHAR_ENCODING_ERROR) {
5031 xmlSwitchEncoding(ctxt, enc);
5032 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005033 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02005034 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00005035 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005036 }
5037 } else {
5038 /*
5039 * fallback for unknown encodings
5040 */
5041 handler = xmlFindCharEncodingHandler((const char *) encoding);
5042 if (handler != NULL) {
5043 xmlSwitchToEncoding(ctxt, handler);
5044 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005045 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5046 "Unsupported encoding %s\n",
5047 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005048 }
5049 }
5050 }
5051 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005052}
5053
Daniel Veillard73b013f2003-09-30 12:36:01 +00005054#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00005055/************************************************************************
5056 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02005057 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00005058 * *
5059 ************************************************************************/
5060
5061/**
5062 * htmlParseLookupSequence:
5063 * @ctxt: an HTML parser context
5064 * @first: the first char to lookup
5065 * @next: the next char to lookup or zero
5066 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00005067 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00005068 *
5069 * Try to find if a sequence (first, next, third) or just (first next) or
5070 * (first) is available in the input stream.
5071 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5072 * to avoid rescanning sequences of bytes, it DOES change the state of the
5073 * parser, do not use liberally.
5074 * This is basically similar to xmlParseLookupSequence()
5075 *
5076 * Returns the index to the current parsing point if the full sequence
5077 * is available, -1 otherwise.
5078 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005079static int
Owen Taylor3473f882001-02-23 17:55:21 +00005080htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02005081 xmlChar next, xmlChar third, int iscomment,
Daniel Veillardeeb99322009-08-25 14:42:16 +02005082 int ignoreattrval)
5083{
Owen Taylor3473f882001-02-23 17:55:21 +00005084 int base, len;
5085 htmlParserInputPtr in;
5086 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00005087 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02005088 int invalue = 0;
5089 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00005090
5091 in = ctxt->input;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005092 if (in == NULL)
5093 return (-1);
5094
Owen Taylor3473f882001-02-23 17:55:21 +00005095 base = in->cur - in->base;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005096 if (base < 0)
5097 return (-1);
5098
Owen Taylor3473f882001-02-23 17:55:21 +00005099 if (ctxt->checkIndex > base)
5100 base = ctxt->checkIndex;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005101
Owen Taylor3473f882001-02-23 17:55:21 +00005102 if (in->buf == NULL) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02005103 buf = in->base;
5104 len = in->length;
Owen Taylor3473f882001-02-23 17:55:21 +00005105 } else {
Daniel Veillarda78d8032012-07-16 14:56:50 +08005106 buf = xmlBufContent(in->buf->buffer);
5107 len = xmlBufUse(in->buf->buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00005108 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02005109
Owen Taylor3473f882001-02-23 17:55:21 +00005110 /* take into account the sequence length */
Daniel Veillardeeb99322009-08-25 14:42:16 +02005111 if (third)
5112 len -= 2;
5113 else if (next)
5114 len--;
5115 for (; base < len; base++) {
5116 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5117 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5118 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5119 incomment = 1;
5120 /* do not increment past <! - some people use <!--> */
5121 base += 2;
5122 }
5123 }
5124 if (ignoreattrval) {
5125 if (buf[base] == '"' || buf[base] == '\'') {
5126 if (invalue) {
5127 if (buf[base] == valdellim) {
5128 invalue = 0;
5129 continue;
5130 }
5131 } else {
5132 valdellim = buf[base];
5133 invalue = 1;
5134 continue;
5135 }
5136 } else if (invalue) {
5137 continue;
5138 }
5139 }
5140 if (incomment) {
5141 if (base + 3 > len)
5142 return (-1);
5143 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5144 (buf[base + 2] == '>')) {
5145 incomment = 0;
5146 base += 2;
5147 }
5148 continue;
5149 }
Owen Taylor3473f882001-02-23 17:55:21 +00005150 if (buf[base] == first) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02005151 if (third != 0) {
5152 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5153 continue;
5154 } else if (next != 0) {
5155 if (buf[base + 1] != next)
5156 continue;
5157 }
5158 ctxt->checkIndex = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00005159#ifdef DEBUG_PUSH
Daniel Veillardeeb99322009-08-25 14:42:16 +02005160 if (next == 0)
5161 xmlGenericError(xmlGenericErrorContext,
5162 "HPP: lookup '%c' found at %d\n",
5163 first, base);
5164 else if (third == 0)
5165 xmlGenericError(xmlGenericErrorContext,
5166 "HPP: lookup '%c%c' found at %d\n",
5167 first, next, base);
5168 else
5169 xmlGenericError(xmlGenericErrorContext,
5170 "HPP: lookup '%c%c%c' found at %d\n",
5171 first, next, third, base);
Owen Taylor3473f882001-02-23 17:55:21 +00005172#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02005173 return (base - (in->cur - in->base));
5174 }
Owen Taylor3473f882001-02-23 17:55:21 +00005175 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02005176 if ((!incomment) && (!invalue))
5177 ctxt->checkIndex = base;
Owen Taylor3473f882001-02-23 17:55:21 +00005178#ifdef DEBUG_PUSH
5179 if (next == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02005180 xmlGenericError(xmlGenericErrorContext,
5181 "HPP: lookup '%c' failed\n", first);
Owen Taylor3473f882001-02-23 17:55:21 +00005182 else if (third == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02005183 xmlGenericError(xmlGenericErrorContext,
5184 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02005185 else
Daniel Veillardeeb99322009-08-25 14:42:16 +02005186 xmlGenericError(xmlGenericErrorContext,
5187 "HPP: lookup '%c%c%c' failed\n", first, next,
5188 third);
Owen Taylor3473f882001-02-23 17:55:21 +00005189#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02005190 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00005191}
5192
5193/**
Markus Kull56a03032009-08-24 19:00:23 +02005194 * htmlParseLookupChars:
5195 * @ctxt: an HTML parser context
5196 * @stop: Array of chars, which stop the lookup.
5197 * @stopLen: Length of stop-Array
5198 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005199 * Try to find if any char of the stop-Array is available in the input
Markus Kull56a03032009-08-24 19:00:23 +02005200 * stream.
5201 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5202 * to avoid rescanning sequences of bytes, it DOES change the state of the
5203 * parser, do not use liberally.
5204 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005205 * Returns the index to the current parsing point if a stopChar
Markus Kull56a03032009-08-24 19:00:23 +02005206 * is available, -1 otherwise.
5207 */
5208static int
5209htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5210 int stopLen)
5211{
5212 int base, len;
5213 htmlParserInputPtr in;
5214 const xmlChar *buf;
5215 int incomment = 0;
5216 int i;
5217
5218 in = ctxt->input;
5219 if (in == NULL)
5220 return (-1);
5221
5222 base = in->cur - in->base;
5223 if (base < 0)
5224 return (-1);
5225
5226 if (ctxt->checkIndex > base)
5227 base = ctxt->checkIndex;
5228
5229 if (in->buf == NULL) {
5230 buf = in->base;
5231 len = in->length;
5232 } else {
Daniel Veillarda78d8032012-07-16 14:56:50 +08005233 buf = xmlBufContent(in->buf->buffer);
5234 len = xmlBufUse(in->buf->buffer);
Markus Kull56a03032009-08-24 19:00:23 +02005235 }
5236
5237 for (; base < len; base++) {
5238 if (!incomment && (base + 4 < len)) {
5239 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5240 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5241 incomment = 1;
5242 /* do not increment past <! - some people use <!--> */
5243 base += 2;
5244 }
5245 }
5246 if (incomment) {
5247 if (base + 3 > len)
5248 return (-1);
5249 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5250 (buf[base + 2] == '>')) {
5251 incomment = 0;
5252 base += 2;
5253 }
5254 continue;
5255 }
5256 for (i = 0; i < stopLen; ++i) {
5257 if (buf[base] == stop[i]) {
5258 ctxt->checkIndex = 0;
5259 return (base - (in->cur - in->base));
5260 }
5261 }
5262 }
5263 ctxt->checkIndex = base;
5264 return (-1);
5265}
5266
5267/**
Owen Taylor3473f882001-02-23 17:55:21 +00005268 * htmlParseTryOrFinish:
5269 * @ctxt: an HTML parser context
5270 * @terminate: last chunk indicator
5271 *
5272 * Try to progress on parsing
5273 *
5274 * Returns zero if no parsing was possible
5275 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005276static int
Owen Taylor3473f882001-02-23 17:55:21 +00005277htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5278 int ret = 0;
5279 htmlParserInputPtr in;
5280 int avail = 0;
5281 xmlChar cur, next;
5282
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005283 htmlParserNodeInfo node_info;
5284
Owen Taylor3473f882001-02-23 17:55:21 +00005285#ifdef DEBUG_PUSH
5286 switch (ctxt->instate) {
5287 case XML_PARSER_EOF:
5288 xmlGenericError(xmlGenericErrorContext,
5289 "HPP: try EOF\n"); break;
5290 case XML_PARSER_START:
5291 xmlGenericError(xmlGenericErrorContext,
5292 "HPP: try START\n"); break;
5293 case XML_PARSER_MISC:
5294 xmlGenericError(xmlGenericErrorContext,
5295 "HPP: try MISC\n");break;
5296 case XML_PARSER_COMMENT:
5297 xmlGenericError(xmlGenericErrorContext,
5298 "HPP: try COMMENT\n");break;
5299 case XML_PARSER_PROLOG:
5300 xmlGenericError(xmlGenericErrorContext,
5301 "HPP: try PROLOG\n");break;
5302 case XML_PARSER_START_TAG:
5303 xmlGenericError(xmlGenericErrorContext,
5304 "HPP: try START_TAG\n");break;
5305 case XML_PARSER_CONTENT:
5306 xmlGenericError(xmlGenericErrorContext,
5307 "HPP: try CONTENT\n");break;
5308 case XML_PARSER_CDATA_SECTION:
5309 xmlGenericError(xmlGenericErrorContext,
5310 "HPP: try CDATA_SECTION\n");break;
5311 case XML_PARSER_END_TAG:
5312 xmlGenericError(xmlGenericErrorContext,
5313 "HPP: try END_TAG\n");break;
5314 case XML_PARSER_ENTITY_DECL:
5315 xmlGenericError(xmlGenericErrorContext,
5316 "HPP: try ENTITY_DECL\n");break;
5317 case XML_PARSER_ENTITY_VALUE:
5318 xmlGenericError(xmlGenericErrorContext,
5319 "HPP: try ENTITY_VALUE\n");break;
5320 case XML_PARSER_ATTRIBUTE_VALUE:
5321 xmlGenericError(xmlGenericErrorContext,
5322 "HPP: try ATTRIBUTE_VALUE\n");break;
5323 case XML_PARSER_DTD:
5324 xmlGenericError(xmlGenericErrorContext,
5325 "HPP: try DTD\n");break;
5326 case XML_PARSER_EPILOG:
5327 xmlGenericError(xmlGenericErrorContext,
5328 "HPP: try EPILOG\n");break;
5329 case XML_PARSER_PI:
5330 xmlGenericError(xmlGenericErrorContext,
5331 "HPP: try PI\n");break;
5332 case XML_PARSER_SYSTEM_LITERAL:
5333 xmlGenericError(xmlGenericErrorContext,
5334 "HPP: try SYSTEM_LITERAL\n");break;
5335 }
5336#endif
5337
5338 while (1) {
5339
5340 in = ctxt->input;
5341 if (in == NULL) break;
5342 if (in->buf == NULL)
5343 avail = in->length - (in->cur - in->base);
5344 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005345 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005346 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005347 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005348 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005349 /*
5350 * SAX: end of the document processing.
5351 */
5352 ctxt->instate = XML_PARSER_EOF;
5353 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5354 ctxt->sax->endDocument(ctxt->userData);
5355 }
5356 }
5357 if (avail < 1)
5358 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00005359 cur = in->cur[0];
5360 if (cur == 0) {
5361 SKIP(1);
5362 continue;
5363 }
5364
Owen Taylor3473f882001-02-23 17:55:21 +00005365 switch (ctxt->instate) {
5366 case XML_PARSER_EOF:
5367 /*
5368 * Document parsing is done !
5369 */
5370 goto done;
5371 case XML_PARSER_START:
5372 /*
5373 * Very first chars read from the document flow.
5374 */
5375 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005376 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005377 SKIP_BLANKS;
5378 if (in->buf == NULL)
5379 avail = in->length - (in->cur - in->base);
5380 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005381 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005382 }
5383 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5384 ctxt->sax->setDocumentLocator(ctxt->userData,
5385 &xmlDefaultSAXLocator);
5386 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5387 (!ctxt->disableSAX))
5388 ctxt->sax->startDocument(ctxt->userData);
5389
5390 cur = in->cur[0];
5391 next = in->cur[1];
5392 if ((cur == '<') && (next == '!') &&
5393 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5394 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5395 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5396 (UPP(8) == 'E')) {
5397 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005398 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005399 goto done;
5400#ifdef DEBUG_PUSH
5401 xmlGenericError(xmlGenericErrorContext,
5402 "HPP: Parsing internal subset\n");
5403#endif
5404 htmlParseDocTypeDecl(ctxt);
5405 ctxt->instate = XML_PARSER_PROLOG;
5406#ifdef DEBUG_PUSH
5407 xmlGenericError(xmlGenericErrorContext,
5408 "HPP: entering PROLOG\n");
5409#endif
5410 } else {
5411 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005412#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00005413 xmlGenericError(xmlGenericErrorContext,
5414 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005415#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00005416 }
Owen Taylor3473f882001-02-23 17:55:21 +00005417 break;
5418 case XML_PARSER_MISC:
5419 SKIP_BLANKS;
5420 if (in->buf == NULL)
5421 avail = in->length - (in->cur - in->base);
5422 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005423 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Denis Paukfdf990c2012-05-10 20:40:49 +08005424 /*
5425 * no chars in buffer
5426 */
5427 if (avail < 1)
Owen Taylor3473f882001-02-23 17:55:21 +00005428 goto done;
Denis Paukfdf990c2012-05-10 20:40:49 +08005429 /*
5430 * not enouth chars in buffer
5431 */
5432 if (avail < 2) {
5433 if (!terminate)
5434 goto done;
5435 else
5436 next = ' ';
5437 } else {
5438 next = in->cur[1];
5439 }
Owen Taylor3473f882001-02-23 17:55:21 +00005440 cur = in->cur[0];
Owen Taylor3473f882001-02-23 17:55:21 +00005441 if ((cur == '<') && (next == '!') &&
5442 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5443 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005444 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005445 goto done;
5446#ifdef DEBUG_PUSH
5447 xmlGenericError(xmlGenericErrorContext,
5448 "HPP: Parsing Comment\n");
5449#endif
5450 htmlParseComment(ctxt);
5451 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005452 } else if ((cur == '<') && (next == '?')) {
5453 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005454 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005455 goto done;
5456#ifdef DEBUG_PUSH
5457 xmlGenericError(xmlGenericErrorContext,
5458 "HPP: Parsing PI\n");
5459#endif
5460 htmlParsePI(ctxt);
5461 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005462 } else if ((cur == '<') && (next == '!') &&
5463 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5464 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5465 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5466 (UPP(8) == 'E')) {
5467 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005468 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005469 goto done;
5470#ifdef DEBUG_PUSH
5471 xmlGenericError(xmlGenericErrorContext,
5472 "HPP: Parsing internal subset\n");
5473#endif
5474 htmlParseDocTypeDecl(ctxt);
5475 ctxt->instate = XML_PARSER_PROLOG;
5476#ifdef DEBUG_PUSH
5477 xmlGenericError(xmlGenericErrorContext,
5478 "HPP: entering PROLOG\n");
5479#endif
5480 } else if ((cur == '<') && (next == '!') &&
5481 (avail < 9)) {
5482 goto done;
5483 } else {
5484 ctxt->instate = XML_PARSER_START_TAG;
5485#ifdef DEBUG_PUSH
5486 xmlGenericError(xmlGenericErrorContext,
5487 "HPP: entering START_TAG\n");
5488#endif
5489 }
5490 break;
5491 case XML_PARSER_PROLOG:
5492 SKIP_BLANKS;
5493 if (in->buf == NULL)
5494 avail = in->length - (in->cur - in->base);
5495 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005496 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02005497 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00005498 goto done;
5499 cur = in->cur[0];
5500 next = in->cur[1];
5501 if ((cur == '<') && (next == '!') &&
5502 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5503 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005504 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005505 goto done;
5506#ifdef DEBUG_PUSH
5507 xmlGenericError(xmlGenericErrorContext,
5508 "HPP: Parsing Comment\n");
5509#endif
5510 htmlParseComment(ctxt);
5511 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005512 } else if ((cur == '<') && (next == '?')) {
5513 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005514 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005515 goto done;
5516#ifdef DEBUG_PUSH
5517 xmlGenericError(xmlGenericErrorContext,
5518 "HPP: Parsing PI\n");
5519#endif
5520 htmlParsePI(ctxt);
5521 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005522 } else if ((cur == '<') && (next == '!') &&
5523 (avail < 4)) {
5524 goto done;
5525 } else {
5526 ctxt->instate = XML_PARSER_START_TAG;
5527#ifdef DEBUG_PUSH
5528 xmlGenericError(xmlGenericErrorContext,
5529 "HPP: entering START_TAG\n");
5530#endif
5531 }
5532 break;
5533 case XML_PARSER_EPILOG:
5534 if (in->buf == NULL)
5535 avail = in->length - (in->cur - in->base);
5536 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005537 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005538 if (avail < 1)
5539 goto done;
5540 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005541 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005542 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005543 goto done;
5544 }
5545 if (avail < 2)
5546 goto done;
5547 next = in->cur[1];
5548 if ((cur == '<') && (next == '!') &&
5549 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5550 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005551 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005552 goto done;
5553#ifdef DEBUG_PUSH
5554 xmlGenericError(xmlGenericErrorContext,
5555 "HPP: Parsing Comment\n");
5556#endif
5557 htmlParseComment(ctxt);
5558 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005559 } else if ((cur == '<') && (next == '?')) {
5560 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005561 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005562 goto done;
5563#ifdef DEBUG_PUSH
5564 xmlGenericError(xmlGenericErrorContext,
5565 "HPP: Parsing PI\n");
5566#endif
5567 htmlParsePI(ctxt);
5568 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005569 } else if ((cur == '<') && (next == '!') &&
5570 (avail < 4)) {
5571 goto done;
5572 } else {
5573 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005574 ctxt->wellFormed = 0;
5575 ctxt->instate = XML_PARSER_EOF;
5576#ifdef DEBUG_PUSH
5577 xmlGenericError(xmlGenericErrorContext,
5578 "HPP: entering EOF\n");
5579#endif
5580 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5581 ctxt->sax->endDocument(ctxt->userData);
5582 goto done;
5583 }
5584 break;
5585 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005586 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005587 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005588 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005589
Denis Paukfdf990c2012-05-10 20:40:49 +08005590 /*
5591 * no chars in buffer
5592 */
5593 if (avail < 1)
Owen Taylor3473f882001-02-23 17:55:21 +00005594 goto done;
Denis Paukfdf990c2012-05-10 20:40:49 +08005595 /*
5596 * not enouth chars in buffer
5597 */
5598 if (avail < 2) {
5599 if (!terminate)
5600 goto done;
5601 else
5602 next = ' ';
5603 } else {
5604 next = in->cur[1];
5605 }
Owen Taylor3473f882001-02-23 17:55:21 +00005606 cur = in->cur[0];
5607 if (cur != '<') {
5608 ctxt->instate = XML_PARSER_CONTENT;
5609#ifdef DEBUG_PUSH
5610 xmlGenericError(xmlGenericErrorContext,
5611 "HPP: entering CONTENT\n");
5612#endif
5613 break;
5614 }
Denis Paukfdf990c2012-05-10 20:40:49 +08005615 if (next == '/') {
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005616 ctxt->instate = XML_PARSER_END_TAG;
5617 ctxt->checkIndex = 0;
5618#ifdef DEBUG_PUSH
5619 xmlGenericError(xmlGenericErrorContext,
5620 "HPP: entering END_TAG\n");
5621#endif
5622 break;
5623 }
Owen Taylor3473f882001-02-23 17:55:21 +00005624 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005625 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005626 goto done;
5627
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005628 /* Capture start position */
5629 if (ctxt->record_info) {
5630 node_info.begin_pos = ctxt->input->consumed +
5631 (CUR_PTR - ctxt->input->base);
5632 node_info.begin_line = ctxt->input->line;
5633 }
5634
5635
Daniel Veillard597f1c12005-07-03 23:00:18 +00005636 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005637 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005638 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005639 (name == NULL)) {
5640 if (CUR == '>')
5641 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005642 break;
5643 }
Owen Taylor3473f882001-02-23 17:55:21 +00005644
5645 /*
5646 * Lookup the info for that element.
5647 */
5648 info = htmlTagLookup(name);
5649 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005650 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5651 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005652 }
5653
5654 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005655 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005656 */
5657 if ((CUR == '/') && (NXT(1) == '>')) {
5658 SKIP(2);
5659 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5660 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005661 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005662 ctxt->instate = XML_PARSER_CONTENT;
5663#ifdef DEBUG_PUSH
5664 xmlGenericError(xmlGenericErrorContext,
5665 "HPP: entering CONTENT\n");
5666#endif
5667 break;
5668 }
5669
5670 if (CUR == '>') {
5671 NEXT;
5672 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005673 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5674 "Couldn't find end of Start Tag %s\n",
5675 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005676
5677 /*
5678 * end of parsing of this node.
5679 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005680 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005681 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005682 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005683 }
Owen Taylor3473f882001-02-23 17:55:21 +00005684
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005685 if (ctxt->record_info)
5686 htmlNodeInfoPush(ctxt, &node_info);
5687
Owen Taylor3473f882001-02-23 17:55:21 +00005688 ctxt->instate = XML_PARSER_CONTENT;
5689#ifdef DEBUG_PUSH
5690 xmlGenericError(xmlGenericErrorContext,
5691 "HPP: entering CONTENT\n");
5692#endif
5693 break;
5694 }
5695
5696 /*
5697 * Check for an Empty Element from DTD definition
5698 */
5699 if ((info != NULL) && (info->empty)) {
5700 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5701 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005702 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005703 }
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005704
5705 if (ctxt->record_info)
5706 htmlNodeInfoPush(ctxt, &node_info);
5707
Owen Taylor3473f882001-02-23 17:55:21 +00005708 ctxt->instate = XML_PARSER_CONTENT;
5709#ifdef DEBUG_PUSH
5710 xmlGenericError(xmlGenericErrorContext,
5711 "HPP: entering CONTENT\n");
5712#endif
5713 break;
5714 }
5715 case XML_PARSER_CONTENT: {
5716 long cons;
5717 /*
5718 * Handle preparsed entities and charRef
5719 */
5720 if (ctxt->token != 0) {
5721 xmlChar chr[2] = { 0 , 0 } ;
5722
5723 chr[0] = (xmlChar) ctxt->token;
5724 htmlCheckParagraph(ctxt);
5725 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5726 ctxt->sax->characters(ctxt->userData, chr, 1);
5727 ctxt->token = 0;
5728 ctxt->checkIndex = 0;
5729 }
5730 if ((avail == 1) && (terminate)) {
5731 cur = in->cur[0];
5732 if ((cur != '<') && (cur != '&')) {
5733 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005734 if (IS_BLANK_CH(cur)) {
Daniel Veillardf933c892012-09-07 19:32:12 +08005735 if (ctxt->keepBlanks) {
5736 if (ctxt->sax->characters != NULL)
5737 ctxt->sax->characters(
Hugh Davenport8fb4a772015-11-20 17:16:06 +08005738 ctxt->userData, &in->cur[0], 1);
Daniel Veillardf933c892012-09-07 19:32:12 +08005739 } else {
5740 if (ctxt->sax->ignorableWhitespace != NULL)
5741 ctxt->sax->ignorableWhitespace(
Hugh Davenport8fb4a772015-11-20 17:16:06 +08005742 ctxt->userData, &in->cur[0], 1);
Daniel Veillardf933c892012-09-07 19:32:12 +08005743 }
Owen Taylor3473f882001-02-23 17:55:21 +00005744 } else {
5745 htmlCheckParagraph(ctxt);
5746 if (ctxt->sax->characters != NULL)
5747 ctxt->sax->characters(
Hugh Davenport8fb4a772015-11-20 17:16:06 +08005748 ctxt->userData, &in->cur[0], 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005749 }
5750 }
5751 ctxt->token = 0;
5752 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005753 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005754 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005755 }
Owen Taylor3473f882001-02-23 17:55:21 +00005756 }
5757 if (avail < 2)
5758 goto done;
5759 cur = in->cur[0];
5760 next = in->cur[1];
5761 cons = ctxt->nbChars;
5762 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5763 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5764 /*
5765 * Handle SCRIPT/STYLE separately
5766 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005767 if (!terminate) {
5768 int idx;
5769 xmlChar val;
5770
Denis Pauk91d239c2010-11-04 12:39:18 +01005771 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
Daniel Veillard68716a72006-10-16 09:32:17 +00005772 if (idx < 0)
5773 goto done;
5774 val = in->cur[idx + 2];
5775 if (val == 0) /* bad cut of input */
5776 goto done;
5777 }
Owen Taylor3473f882001-02-23 17:55:21 +00005778 htmlParseScript(ctxt);
5779 if ((cur == '<') && (next == '/')) {
5780 ctxt->instate = XML_PARSER_END_TAG;
5781 ctxt->checkIndex = 0;
5782#ifdef DEBUG_PUSH
5783 xmlGenericError(xmlGenericErrorContext,
5784 "HPP: entering END_TAG\n");
5785#endif
5786 break;
5787 }
5788 } else {
5789 /*
5790 * Sometimes DOCTYPE arrives in the middle of the document
5791 */
5792 if ((cur == '<') && (next == '!') &&
5793 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5794 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5795 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5796 (UPP(8) == 'E')) {
5797 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005798 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005799 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005800 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5801 "Misplaced DOCTYPE declaration\n",
5802 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005803 htmlParseDocTypeDecl(ctxt);
5804 } else if ((cur == '<') && (next == '!') &&
5805 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5806 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005807 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005808 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005809 goto done;
5810#ifdef DEBUG_PUSH
5811 xmlGenericError(xmlGenericErrorContext,
5812 "HPP: Parsing Comment\n");
5813#endif
5814 htmlParseComment(ctxt);
5815 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005816 } else if ((cur == '<') && (next == '?')) {
5817 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005818 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005819 goto done;
5820#ifdef DEBUG_PUSH
5821 xmlGenericError(xmlGenericErrorContext,
5822 "HPP: Parsing PI\n");
5823#endif
5824 htmlParsePI(ctxt);
5825 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005826 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5827 goto done;
5828 } else if ((cur == '<') && (next == '/')) {
5829 ctxt->instate = XML_PARSER_END_TAG;
5830 ctxt->checkIndex = 0;
5831#ifdef DEBUG_PUSH
5832 xmlGenericError(xmlGenericErrorContext,
5833 "HPP: entering END_TAG\n");
5834#endif
5835 break;
5836 } else if (cur == '<') {
5837 ctxt->instate = XML_PARSER_START_TAG;
5838 ctxt->checkIndex = 0;
5839#ifdef DEBUG_PUSH
5840 xmlGenericError(xmlGenericErrorContext,
5841 "HPP: entering START_TAG\n");
5842#endif
5843 break;
5844 } else if (cur == '&') {
5845 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005846 (htmlParseLookupChars(ctxt,
5847 BAD_CAST "; >/", 4) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005848 goto done;
5849#ifdef DEBUG_PUSH
5850 xmlGenericError(xmlGenericErrorContext,
5851 "HPP: Parsing Reference\n");
5852#endif
5853 /* TODO: check generation of subtrees if noent !!! */
5854 htmlParseReference(ctxt);
5855 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005856 /*
5857 * check that the text sequence is complete
5858 * before handing out the data to the parser
5859 * to avoid problems with erroneous end of
5860 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005861 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005862 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005863 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005864 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005865 ctxt->checkIndex = 0;
5866#ifdef DEBUG_PUSH
5867 xmlGenericError(xmlGenericErrorContext,
5868 "HPP: Parsing char data\n");
5869#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005870 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005871 }
5872 }
5873 if (cons == ctxt->nbChars) {
5874 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005875 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5876 "detected an error in element content\n",
5877 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005878 }
5879 NEXT;
5880 break;
5881 }
5882
5883 break;
5884 }
5885 case XML_PARSER_END_TAG:
5886 if (avail < 2)
5887 goto done;
5888 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005889 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005890 goto done;
5891 htmlParseEndTag(ctxt);
5892 if (ctxt->nameNr == 0) {
5893 ctxt->instate = XML_PARSER_EPILOG;
5894 } else {
5895 ctxt->instate = XML_PARSER_CONTENT;
5896 }
5897 ctxt->checkIndex = 0;
5898#ifdef DEBUG_PUSH
5899 xmlGenericError(xmlGenericErrorContext,
5900 "HPP: entering CONTENT\n");
5901#endif
5902 break;
5903 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005904 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5905 "HPP: internal error, state == CDATA\n",
5906 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005907 ctxt->instate = XML_PARSER_CONTENT;
5908 ctxt->checkIndex = 0;
5909#ifdef DEBUG_PUSH
5910 xmlGenericError(xmlGenericErrorContext,
5911 "HPP: entering CONTENT\n");
5912#endif
5913 break;
5914 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005915 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5916 "HPP: internal error, state == DTD\n",
5917 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005918 ctxt->instate = XML_PARSER_CONTENT;
5919 ctxt->checkIndex = 0;
5920#ifdef DEBUG_PUSH
5921 xmlGenericError(xmlGenericErrorContext,
5922 "HPP: entering CONTENT\n");
5923#endif
5924 break;
5925 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005926 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5927 "HPP: internal error, state == COMMENT\n",
5928 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005929 ctxt->instate = XML_PARSER_CONTENT;
5930 ctxt->checkIndex = 0;
5931#ifdef DEBUG_PUSH
5932 xmlGenericError(xmlGenericErrorContext,
5933 "HPP: entering CONTENT\n");
5934#endif
5935 break;
5936 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005937 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5938 "HPP: internal error, state == PI\n",
5939 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005940 ctxt->instate = XML_PARSER_CONTENT;
5941 ctxt->checkIndex = 0;
5942#ifdef DEBUG_PUSH
5943 xmlGenericError(xmlGenericErrorContext,
5944 "HPP: entering CONTENT\n");
5945#endif
5946 break;
5947 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005948 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5949 "HPP: internal error, state == ENTITY_DECL\n",
5950 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005951 ctxt->instate = XML_PARSER_CONTENT;
5952 ctxt->checkIndex = 0;
5953#ifdef DEBUG_PUSH
5954 xmlGenericError(xmlGenericErrorContext,
5955 "HPP: entering CONTENT\n");
5956#endif
5957 break;
5958 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005959 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5960 "HPP: internal error, state == ENTITY_VALUE\n",
5961 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005962 ctxt->instate = XML_PARSER_CONTENT;
5963 ctxt->checkIndex = 0;
5964#ifdef DEBUG_PUSH
5965 xmlGenericError(xmlGenericErrorContext,
5966 "HPP: entering DTD\n");
5967#endif
5968 break;
5969 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005970 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5971 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5972 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005973 ctxt->instate = XML_PARSER_START_TAG;
5974 ctxt->checkIndex = 0;
5975#ifdef DEBUG_PUSH
5976 xmlGenericError(xmlGenericErrorContext,
5977 "HPP: entering START_TAG\n");
5978#endif
5979 break;
5980 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005981 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5982 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5983 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005984 ctxt->instate = XML_PARSER_CONTENT;
5985 ctxt->checkIndex = 0;
5986#ifdef DEBUG_PUSH
5987 xmlGenericError(xmlGenericErrorContext,
5988 "HPP: entering CONTENT\n");
5989#endif
5990 break;
5991 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005992 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5993 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5994 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005995 ctxt->instate = XML_PARSER_CONTENT;
5996 ctxt->checkIndex = 0;
5997#ifdef DEBUG_PUSH
5998 xmlGenericError(xmlGenericErrorContext,
5999 "HPP: entering CONTENT\n");
6000#endif
6001 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00006002 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00006003 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6004 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6005 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00006006 ctxt->instate = XML_PARSER_CONTENT;
6007 ctxt->checkIndex = 0;
6008#ifdef DEBUG_PUSH
6009 xmlGenericError(xmlGenericErrorContext,
6010 "HPP: entering CONTENT\n");
6011#endif
6012 break;
6013
Owen Taylor3473f882001-02-23 17:55:21 +00006014 }
6015 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006016done:
Owen Taylor3473f882001-02-23 17:55:21 +00006017 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00006018 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006019 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00006020 /*
6021 * SAX: end of the document processing.
6022 */
6023 ctxt->instate = XML_PARSER_EOF;
6024 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6025 ctxt->sax->endDocument(ctxt->userData);
6026 }
6027 }
Arnold Hendriks826bc322013-11-29 14:12:12 +08006028 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00006029 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6030 (ctxt->instate == XML_PARSER_EPILOG))) {
6031 xmlDtdPtr dtd;
6032 dtd = xmlGetIntSubset(ctxt->myDoc);
6033 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02006034 ctxt->myDoc->intSubset =
6035 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00006036 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6037 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6038 }
6039#ifdef DEBUG_PUSH
6040 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6041#endif
6042 return(ret);
6043}
6044
6045/**
Owen Taylor3473f882001-02-23 17:55:21 +00006046 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00006047 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00006048 * @chunk: an char array
6049 * @size: the size in byte of the chunk
6050 * @terminate: last chunk indicator
6051 *
6052 * Parse a Chunk of memory
6053 *
6054 * Returns zero if no error, the xmlParserErrors otherwise.
6055 */
6056int
6057htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6058 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00006059 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6060 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6061 "htmlParseChunk: context error\n", NULL, NULL);
6062 return(XML_ERR_INTERNAL_ERROR);
6063 }
Owen Taylor3473f882001-02-23 17:55:21 +00006064 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6065 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006066 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6067 size_t cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00006068 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02006069
6070 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00006071 if (res < 0) {
6072 ctxt->errNo = XML_PARSER_EOF;
6073 ctxt->disableSAX = 1;
6074 return (XML_PARSER_EOF);
6075 }
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006076 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Owen Taylor3473f882001-02-23 17:55:21 +00006077#ifdef DEBUG_PUSH
6078 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6079#endif
6080
Daniel Veillard14f752c2003-08-09 11:44:50 +00006081#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00006082 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6083 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006084#endif
Owen Taylor3473f882001-02-23 17:55:21 +00006085 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00006086 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6087 xmlParserInputBufferPtr in = ctxt->input->buf;
6088 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6089 (in->raw != NULL)) {
6090 int nbchars;
Daniel Veillardde0cc202013-02-12 16:55:34 +08006091 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6092 size_t current = ctxt->input->cur - ctxt->input->base;
Daniel Veillarde77db162009-08-22 11:32:38 +02006093
Daniel Veillardbf058dc2013-02-13 18:19:42 +08006094 nbchars = xmlCharEncInput(in, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006095 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00006096 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6097 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006098 return(XML_ERR_INVALID_ENCODING);
6099 }
Daniel Veillardde0cc202013-02-12 16:55:34 +08006100 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006101 }
6102 }
Owen Taylor3473f882001-02-23 17:55:21 +00006103 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00006104 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00006105 if (terminate) {
6106 if ((ctxt->instate != XML_PARSER_EOF) &&
6107 (ctxt->instate != XML_PARSER_EPILOG) &&
6108 (ctxt->instate != XML_PARSER_MISC)) {
6109 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00006110 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02006111 }
Owen Taylor3473f882001-02-23 17:55:21 +00006112 if (ctxt->instate != XML_PARSER_EOF) {
6113 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6114 ctxt->sax->endDocument(ctxt->userData);
6115 }
6116 ctxt->instate = XML_PARSER_EOF;
6117 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006118 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00006119}
6120
6121/************************************************************************
6122 * *
6123 * User entry points *
6124 * *
6125 ************************************************************************/
6126
6127/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006128 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006129 * @sax: a SAX handler
6130 * @user_data: The user data returned on SAX callbacks
6131 * @chunk: a pointer to an array of chars
6132 * @size: number of chars in the array
6133 * @filename: an optional file name or URI
6134 * @enc: an optional encoding
6135 *
6136 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00006137 * The value of @filename is used for fetching external entities
6138 * and error/warning reports.
6139 *
6140 * Returns the new parser context or NULL
6141 */
6142htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006143htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00006144 const char *chunk, int size, const char *filename,
6145 xmlCharEncoding enc) {
6146 htmlParserCtxtPtr ctxt;
6147 htmlParserInputPtr inputStream;
6148 xmlParserInputBufferPtr buf;
6149
Daniel Veillardd0463562001-10-13 09:15:48 +00006150 xmlInitParser();
6151
Owen Taylor3473f882001-02-23 17:55:21 +00006152 buf = xmlAllocParserInputBuffer(enc);
6153 if (buf == NULL) return(NULL);
6154
Daniel Veillardf403d292003-10-05 13:51:35 +00006155 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006156 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00006157 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006158 return(NULL);
6159 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00006160 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6161 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00006162 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00006163 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00006164 xmlFree(ctxt->sax);
6165 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6166 if (ctxt->sax == NULL) {
6167 xmlFree(buf);
6168 xmlFree(ctxt);
6169 return(NULL);
6170 }
6171 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6172 if (user_data != NULL)
6173 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02006174 }
Owen Taylor3473f882001-02-23 17:55:21 +00006175 if (filename == NULL) {
6176 ctxt->directory = NULL;
6177 } else {
6178 ctxt->directory = xmlParserGetDirectory(filename);
6179 }
6180
6181 inputStream = htmlNewInputStream(ctxt);
6182 if (inputStream == NULL) {
6183 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00006184 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006185 return(NULL);
6186 }
6187
6188 if (filename == NULL)
6189 inputStream->filename = NULL;
6190 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00006191 inputStream->filename = (char *)
6192 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00006193 inputStream->buf = buf;
Daniel Veillard61551a12012-07-16 16:28:47 +08006194 xmlBufResetInput(buf->buffer, inputStream);
Owen Taylor3473f882001-02-23 17:55:21 +00006195
6196 inputPush(ctxt, inputStream);
6197
6198 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02006199 (ctxt->input->buf != NULL)) {
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006200 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6201 size_t cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillard5f704af2003-03-05 10:01:43 +00006202
Daniel Veillarde77db162009-08-22 11:32:38 +02006203 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00006204
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006205 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Owen Taylor3473f882001-02-23 17:55:21 +00006206#ifdef DEBUG_PUSH
6207 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6208#endif
6209 }
Daniel Veillard68716a72006-10-16 09:32:17 +00006210 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00006211
6212 return(ctxt);
6213}
William M. Brack21e4ef22005-01-02 09:53:13 +00006214#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00006215
6216/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006217 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006218 * @cur: a pointer to an array of xmlChar
6219 * @encoding: a free form C string describing the HTML document encoding, or NULL
6220 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006221 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006222 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006223 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6224 * to handle parse events. If sax is NULL, fallback to the default DOM
6225 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006226 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006227 * Returns the resulting document tree unless SAX is NULL or the document is
6228 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006229 */
6230
6231htmlDocPtr
6232htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6233 htmlDocPtr ret;
6234 htmlParserCtxtPtr ctxt;
6235
Daniel Veillardd0463562001-10-13 09:15:48 +00006236 xmlInitParser();
6237
Owen Taylor3473f882001-02-23 17:55:21 +00006238 if (cur == NULL) return(NULL);
6239
6240
6241 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6242 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02006243 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00006244 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00006245 ctxt->sax = sax;
6246 ctxt->userData = userData;
6247 }
6248
6249 htmlParseDocument(ctxt);
6250 ret = ctxt->myDoc;
6251 if (sax != NULL) {
6252 ctxt->sax = NULL;
6253 ctxt->userData = NULL;
6254 }
6255 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006256
Owen Taylor3473f882001-02-23 17:55:21 +00006257 return(ret);
6258}
6259
6260/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006261 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006262 * @cur: a pointer to an array of xmlChar
6263 * @encoding: a free form C string describing the HTML document encoding, or NULL
6264 *
6265 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006266 *
Owen Taylor3473f882001-02-23 17:55:21 +00006267 * Returns the resulting document tree
6268 */
6269
6270htmlDocPtr
6271htmlParseDoc(xmlChar *cur, const char *encoding) {
6272 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6273}
6274
6275
6276/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006277 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006278 * @filename: the filename
6279 * @encoding: a free form C string describing the HTML document encoding, or NULL
6280 *
Daniel Veillarde77db162009-08-22 11:32:38 +02006281 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00006282 * Automatic support for ZLIB/Compress compressed document is provided
6283 * by default if found at compile-time.
6284 *
6285 * Returns the new parser context or NULL
6286 */
6287htmlParserCtxtPtr
6288htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6289{
6290 htmlParserCtxtPtr ctxt;
6291 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006292 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00006293 /* htmlCharEncoding enc; */
6294 xmlChar *content, *content_line = (xmlChar *) "charset=";
6295
Daniel Veillarda03e3652004-11-02 18:45:30 +00006296 if (filename == NULL)
6297 return(NULL);
6298
Daniel Veillardf403d292003-10-05 13:51:35 +00006299 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006300 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00006301 return(NULL);
6302 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006303 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6304 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00006305#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006306 if (xmlDefaultSAXHandler.error != NULL) {
6307 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6308 }
Daniel Veillard87247e82004-01-13 20:42:02 +00006309#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00006310 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00006311 return(NULL);
6312 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006313
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006314 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6315 xmlFree(canonicFilename);
6316 if (inputStream == NULL) {
6317 xmlFreeParserCtxt(ctxt);
6318 return(NULL);
6319 }
Owen Taylor3473f882001-02-23 17:55:21 +00006320
6321 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006322
Owen Taylor3473f882001-02-23 17:55:21 +00006323 /* set encoding */
6324 if (encoding) {
Daniel Veillard292a9f22014-10-06 18:51:04 +08006325 size_t l = strlen(encoding);
6326
6327 if (l < 1000) {
6328 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6329 if (content) {
6330 strcpy ((char *)content, (char *)content_line);
6331 strcat ((char *)content, (char *)encoding);
6332 htmlCheckEncoding (ctxt, content);
6333 xmlFree (content);
6334 }
Owen Taylor3473f882001-02-23 17:55:21 +00006335 }
6336 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006337
Owen Taylor3473f882001-02-23 17:55:21 +00006338 return(ctxt);
6339}
6340
6341/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006342 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006343 * @filename: the filename
6344 * @encoding: a free form C string describing the HTML document encoding, or NULL
6345 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006346 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006347 *
6348 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6349 * compressed document is provided by default if found at compile-time.
6350 * It use the given SAX function block to handle the parsing callback.
6351 * If sax is NULL, fallback to the default DOM tree building routines.
6352 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006353 * Returns the resulting document tree unless SAX is NULL or the document is
6354 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006355 */
6356
6357htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006358htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00006359 void *userData) {
6360 htmlDocPtr ret;
6361 htmlParserCtxtPtr ctxt;
6362 htmlSAXHandlerPtr oldsax = NULL;
6363
Daniel Veillardd0463562001-10-13 09:15:48 +00006364 xmlInitParser();
6365
Owen Taylor3473f882001-02-23 17:55:21 +00006366 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6367 if (ctxt == NULL) return(NULL);
6368 if (sax != NULL) {
6369 oldsax = ctxt->sax;
6370 ctxt->sax = sax;
6371 ctxt->userData = userData;
6372 }
6373
6374 htmlParseDocument(ctxt);
6375
6376 ret = ctxt->myDoc;
6377 if (sax != NULL) {
6378 ctxt->sax = oldsax;
6379 ctxt->userData = NULL;
6380 }
6381 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006382
Owen Taylor3473f882001-02-23 17:55:21 +00006383 return(ret);
6384}
6385
6386/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006387 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006388 * @filename: the filename
6389 * @encoding: a free form C string describing the HTML document encoding, or NULL
6390 *
6391 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6392 * compressed document is provided by default if found at compile-time.
6393 *
6394 * Returns the resulting document tree
6395 */
6396
6397htmlDocPtr
6398htmlParseFile(const char *filename, const char *encoding) {
6399 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6400}
6401
6402/**
6403 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02006404 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00006405 *
6406 * Set and return the previous value for handling HTML omitted tags.
6407 *
6408 * Returns the last value for 0 for no handling, 1 for auto insertion.
6409 */
6410
6411int
6412htmlHandleOmittedElem(int val) {
6413 int old = htmlOmittedDefaultValue;
6414
6415 htmlOmittedDefaultValue = val;
6416 return(old);
6417}
6418
Daniel Veillard930dfb62003-02-05 10:17:38 +00006419/**
6420 * htmlElementAllowedHere:
6421 * @parent: HTML parent element
6422 * @elt: HTML element
6423 *
6424 * Checks whether an HTML element may be a direct child of a parent element.
6425 * Note - doesn't check for deprecated elements
6426 *
6427 * Returns 1 if allowed; 0 otherwise.
6428 */
6429int
6430htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6431 const char** p ;
6432
6433 if ( ! elt || ! parent || ! parent->subelts )
6434 return 0 ;
6435
6436 for ( p = parent->subelts; *p; ++p )
6437 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6438 return 1 ;
6439
6440 return 0 ;
6441}
6442/**
6443 * htmlElementStatusHere:
6444 * @parent: HTML parent element
6445 * @elt: HTML element
6446 *
6447 * Checks whether an HTML element may be a direct child of a parent element.
6448 * and if so whether it is valid or deprecated.
6449 *
6450 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6451 */
6452htmlStatus
6453htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6454 if ( ! parent || ! elt )
6455 return HTML_INVALID ;
6456 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6457 return HTML_INVALID ;
6458
6459 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6460}
6461/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006462 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00006463 * @elt: HTML element
6464 * @attr: HTML attribute
6465 * @legacy: whether to allow deprecated attributes
6466 *
6467 * Checks whether an attribute is valid for an element
6468 * Has full knowledge of Required and Deprecated attributes
6469 *
6470 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6471 */
6472htmlStatus
6473htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6474 const char** p ;
6475
6476 if ( !elt || ! attr )
6477 return HTML_INVALID ;
6478
6479 if ( elt->attrs_req )
6480 for ( p = elt->attrs_req; *p; ++p)
6481 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6482 return HTML_REQUIRED ;
6483
6484 if ( elt->attrs_opt )
6485 for ( p = elt->attrs_opt; *p; ++p)
6486 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6487 return HTML_VALID ;
6488
6489 if ( legacy && elt->attrs_depr )
6490 for ( p = elt->attrs_depr; *p; ++p)
6491 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6492 return HTML_DEPRECATED ;
6493
6494 return HTML_INVALID ;
6495}
6496/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006497 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00006498 * @node: an htmlNodePtr in a tree
6499 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00006500 * for Element nodes)
6501 *
6502 * Checks whether the tree node is valid. Experimental (the author
6503 * only uses the HTML enhancements in a SAX parser)
6504 *
6505 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6506 * legacy allowed) or htmlElementStatusHere (otherwise).
6507 * for Attribute nodes, a return from htmlAttrAllowed
6508 * for other nodes, HTML_NA (no checks performed)
6509 */
6510htmlStatus
6511htmlNodeStatus(const htmlNodePtr node, int legacy) {
6512 if ( ! node )
6513 return HTML_INVALID ;
6514
6515 switch ( node->type ) {
6516 case XML_ELEMENT_NODE:
6517 return legacy
6518 ? ( htmlElementAllowedHere (
6519 htmlTagLookup(node->parent->name) , node->name
6520 ) ? HTML_VALID : HTML_INVALID )
6521 : htmlElementStatusHere(
6522 htmlTagLookup(node->parent->name) ,
6523 htmlTagLookup(node->name) )
6524 ;
6525 case XML_ATTRIBUTE_NODE:
6526 return htmlAttrAllowed(
6527 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6528 default: return HTML_NA ;
6529 }
6530}
Daniel Veillard9475a352003-09-26 12:47:50 +00006531/************************************************************************
6532 * *
6533 * New set (2.6.0) of simpler and more flexible APIs *
6534 * *
6535 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. The macro expands to a single statement, so it can be
 * used safely as the body of an unbraced if/else; invoke it with a
 * trailing semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6547
6548/**
6549 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00006550 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00006551 *
6552 * Reset a parser context
6553 */
6554void
6555htmlCtxtReset(htmlParserCtxtPtr ctxt)
6556{
6557 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00006558 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02006559
Daniel Veillarda03e3652004-11-02 18:45:30 +00006560 if (ctxt == NULL)
6561 return;
6562
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006563 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00006564 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00006565
6566 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6567 xmlFreeInputStream(input);
6568 }
6569 ctxt->inputNr = 0;
6570 ctxt->input = NULL;
6571
6572 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00006573 if (ctxt->spaceTab != NULL) {
6574 ctxt->spaceTab[0] = -1;
6575 ctxt->space = &ctxt->spaceTab[0];
6576 } else {
6577 ctxt->space = NULL;
6578 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006579
6580
6581 ctxt->nodeNr = 0;
6582 ctxt->node = NULL;
6583
6584 ctxt->nameNr = 0;
6585 ctxt->name = NULL;
6586
6587 DICT_FREE(ctxt->version);
6588 ctxt->version = NULL;
6589 DICT_FREE(ctxt->encoding);
6590 ctxt->encoding = NULL;
6591 DICT_FREE(ctxt->directory);
6592 ctxt->directory = NULL;
6593 DICT_FREE(ctxt->extSubURI);
6594 ctxt->extSubURI = NULL;
6595 DICT_FREE(ctxt->extSubSystem);
6596 ctxt->extSubSystem = NULL;
6597 if (ctxt->myDoc != NULL)
6598 xmlFreeDoc(ctxt->myDoc);
6599 ctxt->myDoc = NULL;
6600
6601 ctxt->standalone = -1;
6602 ctxt->hasExternalSubset = 0;
6603 ctxt->hasPErefs = 0;
6604 ctxt->html = 1;
6605 ctxt->external = 0;
6606 ctxt->instate = XML_PARSER_START;
6607 ctxt->token = 0;
6608
6609 ctxt->wellFormed = 1;
6610 ctxt->nsWellFormed = 1;
Daniel Veillard8ad29302010-10-28 11:51:22 +02006611 ctxt->disableSAX = 0;
Daniel Veillard9475a352003-09-26 12:47:50 +00006612 ctxt->valid = 1;
6613 ctxt->vctxt.userData = ctxt;
6614 ctxt->vctxt.error = xmlParserValidityError;
6615 ctxt->vctxt.warning = xmlParserValidityWarning;
6616 ctxt->record_info = 0;
6617 ctxt->nbChars = 0;
6618 ctxt->checkIndex = 0;
6619 ctxt->inSubset = 0;
6620 ctxt->errNo = XML_ERR_OK;
6621 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006622 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006623 ctxt->catalogs = NULL;
6624 xmlInitNodeInfoSeq(&ctxt->node_seq);
6625
6626 if (ctxt->attsDefault != NULL) {
6627 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6628 ctxt->attsDefault = NULL;
6629 }
6630 if (ctxt->attsSpecial != NULL) {
6631 xmlHashFree(ctxt->attsSpecial, NULL);
6632 ctxt->attsSpecial = NULL;
6633 }
6634}
6635
6636/**
6637 * htmlCtxtUseOptions:
6638 * @ctxt: an HTML parser context
6639 * @options: a combination of htmlParserOption(s)
6640 *
6641 * Applies the options to the parser context
6642 *
6643 * Returns 0 in case of success, the set of unknown or unimplemented options
6644 * in case of error.
6645 */
6646int
6647htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6648{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006649 if (ctxt == NULL)
6650 return(-1);
6651
Daniel Veillard9475a352003-09-26 12:47:50 +00006652 if (options & HTML_PARSE_NOWARNING) {
6653 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006654 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006655 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006656 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006657 }
6658 if (options & HTML_PARSE_NOERROR) {
6659 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006660 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006661 ctxt->sax->fatalError = NULL;
6662 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006663 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006664 }
6665 if (options & HTML_PARSE_PEDANTIC) {
6666 ctxt->pedantic = 1;
6667 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006668 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006669 } else
6670 ctxt->pedantic = 0;
6671 if (options & XML_PARSE_NOBLANKS) {
6672 ctxt->keepBlanks = 0;
6673 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6674 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006675 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006676 } else
6677 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006678 if (options & HTML_PARSE_RECOVER) {
6679 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006680 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006681 } else
6682 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006683 if (options & HTML_PARSE_COMPACT) {
6684 ctxt->options |= HTML_PARSE_COMPACT;
6685 options -= HTML_PARSE_COMPACT;
6686 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006687 if (options & XML_PARSE_HUGE) {
6688 ctxt->options |= XML_PARSE_HUGE;
6689 options -= XML_PARSE_HUGE;
6690 }
Daniel Veillardf1121c42010-07-26 14:02:42 +02006691 if (options & HTML_PARSE_NODEFDTD) {
6692 ctxt->options |= HTML_PARSE_NODEFDTD;
6693 options -= HTML_PARSE_NODEFDTD;
6694 }
Daniel Veillardc62efc82011-05-16 16:03:50 +08006695 if (options & HTML_PARSE_IGNORE_ENC) {
6696 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6697 options -= HTML_PARSE_IGNORE_ENC;
6698 }
Martin Schröderb91111b2012-05-10 18:52:37 +08006699 if (options & HTML_PARSE_NOIMPLIED) {
6700 ctxt->options |= HTML_PARSE_NOIMPLIED;
6701 options -= HTML_PARSE_NOIMPLIED;
6702 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006703 ctxt->dictNames = 0;
6704 return (options);
6705}
6706
6707/**
6708 * htmlDoRead:
6709 * @ctxt: an HTML parser context
6710 * @URL: the base URL to use for the document
6711 * @encoding: the document encoding, or NULL
6712 * @options: a combination of htmlParserOption(s)
6713 * @reuse: keep the context for reuse
6714 *
6715 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006716 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006717 * Returns the resulting document tree or NULL
6718 */
6719static htmlDocPtr
6720htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6721 int options, int reuse)
6722{
6723 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006724
Daniel Veillard9475a352003-09-26 12:47:50 +00006725 htmlCtxtUseOptions(ctxt, options);
6726 ctxt->html = 1;
6727 if (encoding != NULL) {
6728 xmlCharEncodingHandlerPtr hdlr;
6729
6730 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006731 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006732 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006733 if (ctxt->input->encoding != NULL)
6734 xmlFree((xmlChar *) ctxt->input->encoding);
6735 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6736 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006737 }
6738 if ((URL != NULL) && (ctxt->input != NULL) &&
6739 (ctxt->input->filename == NULL))
6740 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6741 htmlParseDocument(ctxt);
6742 ret = ctxt->myDoc;
6743 ctxt->myDoc = NULL;
6744 if (!reuse) {
6745 if ((ctxt->dictNames) &&
6746 (ret != NULL) &&
6747 (ret->dict == ctxt->dict))
6748 ctxt->dict = NULL;
6749 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006750 }
6751 return (ret);
6752}
6753
6754/**
6755 * htmlReadDoc:
6756 * @cur: a pointer to a zero terminated string
6757 * @URL: the base URL to use for the document
6758 * @encoding: the document encoding, or NULL
6759 * @options: a combination of htmlParserOption(s)
6760 *
6761 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006762 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006763 * Returns the resulting document tree
6764 */
6765htmlDocPtr
6766htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6767{
6768 htmlParserCtxtPtr ctxt;
6769
6770 if (cur == NULL)
6771 return (NULL);
6772
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006773 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006774 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006775 if (ctxt == NULL)
6776 return (NULL);
6777 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6778}
6779
6780/**
6781 * htmlReadFile:
6782 * @filename: a file or URL
6783 * @encoding: the document encoding, or NULL
6784 * @options: a combination of htmlParserOption(s)
6785 *
6786 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006787 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006788 * Returns the resulting document tree
6789 */
6790htmlDocPtr
6791htmlReadFile(const char *filename, const char *encoding, int options)
6792{
6793 htmlParserCtxtPtr ctxt;
6794
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006795 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006796 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6797 if (ctxt == NULL)
6798 return (NULL);
6799 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6800}
6801
6802/**
6803 * htmlReadMemory:
6804 * @buffer: a pointer to a char array
6805 * @size: the size of the array
6806 * @URL: the base URL to use for the document
6807 * @encoding: the document encoding, or NULL
6808 * @options: a combination of htmlParserOption(s)
6809 *
6810 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006811 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006812 * Returns the resulting document tree
6813 */
6814htmlDocPtr
6815htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6816{
6817 htmlParserCtxtPtr ctxt;
6818
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006819 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006820 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6821 if (ctxt == NULL)
6822 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006823 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006824 if (ctxt->sax != NULL)
6825 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006826 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6827}
6828
6829/**
6830 * htmlReadFd:
6831 * @fd: an open file descriptor
6832 * @URL: the base URL to use for the document
6833 * @encoding: the document encoding, or NULL
6834 * @options: a combination of htmlParserOption(s)
6835 *
6836 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006837 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006838 * Returns the resulting document tree
6839 */
6840htmlDocPtr
6841htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6842{
6843 htmlParserCtxtPtr ctxt;
6844 xmlParserInputBufferPtr input;
6845 xmlParserInputPtr stream;
6846
6847 if (fd < 0)
6848 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08006849 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006850
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006851 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006852 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6853 if (input == NULL)
6854 return (NULL);
6855 ctxt = xmlNewParserCtxt();
6856 if (ctxt == NULL) {
6857 xmlFreeParserInputBuffer(input);
6858 return (NULL);
6859 }
6860 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6861 if (stream == NULL) {
6862 xmlFreeParserInputBuffer(input);
6863 xmlFreeParserCtxt(ctxt);
6864 return (NULL);
6865 }
6866 inputPush(ctxt, stream);
6867 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6868}
6869
6870/**
6871 * htmlReadIO:
6872 * @ioread: an I/O read function
6873 * @ioclose: an I/O close function
6874 * @ioctx: an I/O handler
6875 * @URL: the base URL to use for the document
6876 * @encoding: the document encoding, or NULL
6877 * @options: a combination of htmlParserOption(s)
6878 *
6879 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006880 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006881 * Returns the resulting document tree
6882 */
6883htmlDocPtr
6884htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6885 void *ioctx, const char *URL, const char *encoding, int options)
6886{
6887 htmlParserCtxtPtr ctxt;
6888 xmlParserInputBufferPtr input;
6889 xmlParserInputPtr stream;
6890
6891 if (ioread == NULL)
6892 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006893 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006894
6895 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6896 XML_CHAR_ENCODING_NONE);
Lin Yi-Li24464be2012-05-10 16:14:55 +08006897 if (input == NULL) {
6898 if (ioclose != NULL)
6899 ioclose(ioctx);
Daniel Veillard9475a352003-09-26 12:47:50 +00006900 return (NULL);
Lin Yi-Li24464be2012-05-10 16:14:55 +08006901 }
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006902 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006903 if (ctxt == NULL) {
6904 xmlFreeParserInputBuffer(input);
6905 return (NULL);
6906 }
6907 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6908 if (stream == NULL) {
6909 xmlFreeParserInputBuffer(input);
6910 xmlFreeParserCtxt(ctxt);
6911 return (NULL);
6912 }
6913 inputPush(ctxt, stream);
6914 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6915}
6916
6917/**
6918 * htmlCtxtReadDoc:
6919 * @ctxt: an HTML parser context
6920 * @cur: a pointer to a zero terminated string
6921 * @URL: the base URL to use for the document
6922 * @encoding: the document encoding, or NULL
6923 * @options: a combination of htmlParserOption(s)
6924 *
6925 * parse an XML in-memory document and build a tree.
6926 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006927 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006928 * Returns the resulting document tree
6929 */
6930htmlDocPtr
6931htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6932 const char *URL, const char *encoding, int options)
6933{
6934 xmlParserInputPtr stream;
6935
6936 if (cur == NULL)
6937 return (NULL);
6938 if (ctxt == NULL)
6939 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08006940 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006941
6942 htmlCtxtReset(ctxt);
6943
6944 stream = xmlNewStringInputStream(ctxt, cur);
6945 if (stream == NULL) {
6946 return (NULL);
6947 }
6948 inputPush(ctxt, stream);
6949 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6950}
6951
6952/**
6953 * htmlCtxtReadFile:
6954 * @ctxt: an HTML parser context
6955 * @filename: a file or URL
6956 * @encoding: the document encoding, or NULL
6957 * @options: a combination of htmlParserOption(s)
6958 *
6959 * parse an XML file from the filesystem or the network.
6960 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006961 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006962 * Returns the resulting document tree
6963 */
6964htmlDocPtr
6965htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6966 const char *encoding, int options)
6967{
6968 xmlParserInputPtr stream;
6969
6970 if (filename == NULL)
6971 return (NULL);
6972 if (ctxt == NULL)
6973 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08006974 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006975
6976 htmlCtxtReset(ctxt);
6977
Daniel Veillard29614c72004-11-26 10:47:26 +00006978 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006979 if (stream == NULL) {
6980 return (NULL);
6981 }
6982 inputPush(ctxt, stream);
6983 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6984}
6985
6986/**
6987 * htmlCtxtReadMemory:
6988 * @ctxt: an HTML parser context
6989 * @buffer: a pointer to a char array
6990 * @size: the size of the array
6991 * @URL: the base URL to use for the document
6992 * @encoding: the document encoding, or NULL
6993 * @options: a combination of htmlParserOption(s)
6994 *
6995 * parse an XML in-memory document and build a tree.
6996 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006997 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006998 * Returns the resulting document tree
6999 */
7000htmlDocPtr
7001htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7002 const char *URL, const char *encoding, int options)
7003{
7004 xmlParserInputBufferPtr input;
7005 xmlParserInputPtr stream;
7006
7007 if (ctxt == NULL)
7008 return (NULL);
7009 if (buffer == NULL)
7010 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08007011 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00007012
7013 htmlCtxtReset(ctxt);
7014
7015 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7016 if (input == NULL) {
7017 return(NULL);
7018 }
7019
7020 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7021 if (stream == NULL) {
7022 xmlFreeParserInputBuffer(input);
7023 return(NULL);
7024 }
7025
7026 inputPush(ctxt, stream);
7027 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7028}
7029
7030/**
7031 * htmlCtxtReadFd:
7032 * @ctxt: an HTML parser context
7033 * @fd: an open file descriptor
7034 * @URL: the base URL to use for the document
7035 * @encoding: the document encoding, or NULL
7036 * @options: a combination of htmlParserOption(s)
7037 *
7038 * parse an XML from a file descriptor and build a tree.
7039 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02007040 *
Daniel Veillard9475a352003-09-26 12:47:50 +00007041 * Returns the resulting document tree
7042 */
7043htmlDocPtr
7044htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7045 const char *URL, const char *encoding, int options)
7046{
7047 xmlParserInputBufferPtr input;
7048 xmlParserInputPtr stream;
7049
7050 if (fd < 0)
7051 return (NULL);
7052 if (ctxt == NULL)
7053 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08007054 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00007055
7056 htmlCtxtReset(ctxt);
7057
7058
7059 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7060 if (input == NULL)
7061 return (NULL);
7062 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7063 if (stream == NULL) {
7064 xmlFreeParserInputBuffer(input);
7065 return (NULL);
7066 }
7067 inputPush(ctxt, stream);
7068 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7069}
7070
7071/**
7072 * htmlCtxtReadIO:
7073 * @ctxt: an HTML parser context
7074 * @ioread: an I/O read function
7075 * @ioclose: an I/O close function
7076 * @ioctx: an I/O handler
7077 * @URL: the base URL to use for the document
7078 * @encoding: the document encoding, or NULL
7079 * @options: a combination of htmlParserOption(s)
7080 *
7081 * parse an HTML document from I/O functions and source and build a tree.
7082 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02007083 *
Daniel Veillard9475a352003-09-26 12:47:50 +00007084 * Returns the resulting document tree
7085 */
7086htmlDocPtr
7087htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7088 xmlInputCloseCallback ioclose, void *ioctx,
7089 const char *URL,
7090 const char *encoding, int options)
7091{
7092 xmlParserInputBufferPtr input;
7093 xmlParserInputPtr stream;
7094
7095 if (ioread == NULL)
7096 return (NULL);
7097 if (ctxt == NULL)
7098 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08007099 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00007100
7101 htmlCtxtReset(ctxt);
7102
7103 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7104 XML_CHAR_ENCODING_NONE);
Lin Yi-Li24464be2012-05-10 16:14:55 +08007105 if (input == NULL) {
7106 if (ioclose != NULL)
7107 ioclose(ioctx);
Daniel Veillard9475a352003-09-26 12:47:50 +00007108 return (NULL);
Lin Yi-Li24464be2012-05-10 16:14:55 +08007109 }
Daniel Veillard9475a352003-09-26 12:47:50 +00007110 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7111 if (stream == NULL) {
7112 xmlFreeParserInputBuffer(input);
7113 return (NULL);
7114 }
7115 inputPush(ctxt, stream);
7116 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7117}
7118
Daniel Veillard5d4644e2005-04-01 13:11:58 +00007119#define bottom_HTMLparser
7120#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00007121#endif /* LIBXML_HTML_ENABLED */