blob: de624f8d0bf283b380620dbc08a81eb1388be057 [file] [log] [blame]
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef LIBXML_ZLIB_ENABLED
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
39#include <libxml/HTMLtree.h>
40#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
44#include <libxml/globals.h>
45#include <libxml/uri.h>
46
47#include "buf.h"
48#include "enc.h"
49
50#define HTML_MAX_NAMELEN 1000
51#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52#define HTML_PARSER_BUFFER_SIZE 100
53
54/* #define DEBUG */
55/* #define DEBUG_PUSH */
56
57static int htmlOmittedDefaultValue = 1;
58
59xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
61static void htmlParseComment(htmlParserCtxtPtr ctxt);
62
63/************************************************************************
64 * *
65 * Some factorized error routines *
66 * *
67 ************************************************************************/
68
69/**
70 * htmlErrMemory:
71 * @ctxt: an HTML parser context
Haibo Huangcfd91dc2020-07-30 23:01:33 -070072 * @extra: extra information
Elliott Hughes7fbecab2019-01-10 16:42:03 -080073 *
74 * Handle a redefinition of attribute error
75 */
76static void
77htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78{
79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
82 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96}
97
98/**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
108static void LIBXML_ATTR_FORMAT(3,0)
109htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111{
112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
115 if (ctxt != NULL)
116 ctxt->errNo = error;
117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
124}
125
126/**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
135static void LIBXML_ATTR_FORMAT(3,0)
136htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138{
139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
142 if (ctxt != NULL)
143 ctxt->errNo = error;
144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
149}
150
151/************************************************************************
152 * *
153 * Parser stacks related functions and macros *
154 * *
155 ************************************************************************/
156
157/**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
165 */
166static int
167htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168{
169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
175 ctxt->nameTab = (const xmlChar * *)
176 xmlRealloc((xmlChar * *)ctxt->nameTab,
177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
180 htmlErrMemory(ctxt, NULL);
181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187}
188/**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
196static const xmlChar *
197htmlnamePop(htmlParserCtxtPtr ctxt)
198{
199 const xmlChar *ret;
200
201 if (ctxt->nameNr <= 0)
202 return (NULL);
203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
205 return (NULL);
206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
211 ctxt->nameTab[ctxt->nameNr] = NULL;
212 return (ret);
213}
214
215/**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224static int
225htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226{
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243}
244
245/**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253static htmlParserNodeInfo *
254htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255{
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266}
267
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */
296
/*
 * All macros below expect a variable named `ctxt` (the parser context) to
 * be in scope at the point of use.
 */

/* Current byte, upper-cased; only meaningful for ASCII comparisons. */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance the input by val bytes and the column counter by the same
 * amount. Parenthesized so the comma expression stays one unit wherever
 * the macro is expanded.
 */
#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

/* Byte located val positions after the current one. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), upper-cased. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * NOTE: SHRINK and GROW expand to a bare `if` statement. Use them only as
 * stand-alone statements, never as the unbraced body of an if/else.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* Current byte, or -1 while a pending token is stored in the context. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Step over the current character, which is l bytes long: update the
 * line/column counters, clear any pending token and advance the input.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l);				\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append character v, which is l bytes long once encoded, to buffer b at
 * index i and advance i. Wrapped in do/while(0) so the inner if/else
 * cannot capture an `else` belonging to the caller.
 */
#define COPY_BUF(l,b,i,v)						\
    do {								\
	if ((l) == 1) (b)[(i)++] = (xmlChar) (v);			\
	else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
    } while (0)
348
349/**
350 * htmlFindEncoding:
351 * @the HTML parser context
352 *
353 * Ty to find and encoding in the current data available in the input
354 * buffer this is needed to try to switch to the proper encoding when
355 * one face a character error.
356 * That's an heuristic, since it's operating outside of parsing it could
357 * try to use a meta which had been commented out, that's the reason it
358 * should only be used in case of error, not as a default.
359 *
360 * Returns an encoding string or NULL if not found, the string need to
361 * be freed
362 */
363static xmlChar *
364htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365 const xmlChar *start, *cur, *end;
366
367 if ((ctxt == NULL) || (ctxt->input == NULL) ||
368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369 (ctxt->input->buf->encoder != NULL))
370 return(NULL);
371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372 return(NULL);
373
374 start = ctxt->input->cur;
375 end = ctxt->input->end;
376 /* we also expect the input buffer to be zero terminated */
377 if (*end != 0)
378 return(NULL);
379
380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381 if (cur == NULL)
382 return(NULL);
383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384 if (cur == NULL)
385 return(NULL);
386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387 if (cur == NULL)
388 return(NULL);
389 cur += 8;
390 start = cur;
391 while (((*cur >= 'A') && (*cur <= 'Z')) ||
392 ((*cur >= 'a') && (*cur <= 'z')) ||
393 ((*cur >= '0') && (*cur <= '9')) ||
394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395 cur++;
396 if (cur == start)
397 return(NULL);
398 return(xmlStrndup(start, cur - start));
399}
400
401/**
402 * htmlCurrentChar:
403 * @ctxt: the HTML parser context
404 * @len: pointer to the length of the char read
405 *
406 * The current char value, if using UTF-8 this may actually span multiple
407 * bytes in the input buffer. Implement the end of line normalization:
408 * 2.11 End-of-Line Handling
409 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410 * char, then the encoding converter is plugged in automatically.
411 *
412 * Returns the current char value and its length
413 */
414
415static int
416htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700417 const unsigned char *cur;
418 unsigned char c;
419 unsigned int val;
420
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800421 if (ctxt->instate == XML_PARSER_EOF)
422 return(0);
423
424 if (ctxt->token != 0) {
425 *len = 0;
426 return(ctxt->token);
427 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700428 if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800429 xmlChar * guess;
430 xmlCharEncodingHandlerPtr handler;
431
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700432 /*
433 * Assume it's a fixed length encoding (1) with
434 * a compatible encoding for the ASCII set, since
435 * HTML constructs only use < 128 chars
436 */
437 if ((int) *ctxt->input->cur < 0x80) {
438 *len = 1;
439 if ((*ctxt->input->cur == 0) &&
440 (ctxt->input->cur < ctxt->input->end)) {
441 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442 "Char 0x%X out of allowed range\n", 0);
443 return(' ');
444 }
445 return((int) *ctxt->input->cur);
446 }
447
448 /*
449 * Humm this is bad, do an automatic flow conversion
450 */
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800451 guess = htmlFindEncoding(ctxt);
452 if (guess == NULL) {
453 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
454 } else {
455 if (ctxt->input->encoding != NULL)
456 xmlFree((xmlChar *) ctxt->input->encoding);
457 ctxt->input->encoding = guess;
458 handler = xmlFindCharEncodingHandler((const char *) guess);
459 if (handler != NULL) {
460 xmlSwitchToEncoding(ctxt, handler);
461 } else {
462 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
463 "Unsupported encoding %s", guess, NULL);
464 }
465 }
466 ctxt->charset = XML_CHAR_ENCODING_UTF8;
467 }
468
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700469 /*
470 * We are supposed to handle UTF8, check it's valid
471 * From rfc2044: encoding of the Unicode values on UTF-8:
472 *
473 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
474 * 0000 0000-0000 007F 0xxxxxxx
475 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
476 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
477 *
478 * Check for the 0x110000 limit too
479 */
480 cur = ctxt->input->cur;
481 c = *cur;
482 if (c & 0x80) {
483 if ((c & 0x40) == 0)
484 goto encoding_error;
485 if (cur[1] == 0) {
486 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
487 cur = ctxt->input->cur;
488 }
489 if ((cur[1] & 0xc0) != 0x80)
490 goto encoding_error;
491 if ((c & 0xe0) == 0xe0) {
492
493 if (cur[2] == 0) {
494 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
495 cur = ctxt->input->cur;
496 }
497 if ((cur[2] & 0xc0) != 0x80)
498 goto encoding_error;
499 if ((c & 0xf0) == 0xf0) {
500 if (cur[3] == 0) {
501 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
502 cur = ctxt->input->cur;
503 }
504 if (((c & 0xf8) != 0xf0) ||
505 ((cur[3] & 0xc0) != 0x80))
506 goto encoding_error;
507 /* 4-byte code */
508 *len = 4;
509 val = (cur[0] & 0x7) << 18;
510 val |= (cur[1] & 0x3f) << 12;
511 val |= (cur[2] & 0x3f) << 6;
512 val |= cur[3] & 0x3f;
513 if (val < 0x10000)
514 goto encoding_error;
515 } else {
516 /* 3-byte code */
517 *len = 3;
518 val = (cur[0] & 0xf) << 12;
519 val |= (cur[1] & 0x3f) << 6;
520 val |= cur[2] & 0x3f;
521 if (val < 0x800)
522 goto encoding_error;
523 }
524 } else {
525 /* 2-byte code */
526 *len = 2;
527 val = (cur[0] & 0x1f) << 6;
528 val |= cur[1] & 0x3f;
529 if (val < 0x80)
530 goto encoding_error;
531 }
532 if (!IS_CHAR(val)) {
533 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
534 "Char 0x%X out of allowed range\n", val);
535 }
536 return(val);
537 } else {
538 if ((*ctxt->input->cur == 0) &&
539 (ctxt->input->cur < ctxt->input->end)) {
540 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
541 "Char 0x%X out of allowed range\n", 0);
542 *len = 1;
543 return(' ');
544 }
545 /* 1-byte code */
546 *len = 1;
547 return((int) *ctxt->input->cur);
548 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800549
550encoding_error:
551 /*
552 * If we detect an UTF8 error that probably mean that the
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700553 * input encoding didn't get properly advertised in the
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800554 * declaration header. Report the error and switch the encoding
555 * to ISO-Latin-1 (if you don't like this policy, just declare the
556 * encoding !)
557 */
558 {
559 char buffer[150];
560
561 if (ctxt->input->end - ctxt->input->cur >= 4) {
562 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
563 ctxt->input->cur[0], ctxt->input->cur[1],
564 ctxt->input->cur[2], ctxt->input->cur[3]);
565 } else {
566 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
567 }
568 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
569 "Input is not proper UTF-8, indicate encoding !\n",
570 BAD_CAST buffer, NULL);
571 }
572
573 ctxt->charset = XML_CHAR_ENCODING_8859_1;
574 *len = 1;
575 return((int) *ctxt->input->cur);
576}
577
578/**
579 * htmlSkipBlankChars:
580 * @ctxt: the HTML parser context
581 *
582 * skip all blanks character found at that point in the input streams.
583 *
584 * Returns the number of space chars skipped
585 */
586
587static int
588htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
589 int res = 0;
590
591 while (IS_BLANK_CH(*(ctxt->input->cur))) {
592 if ((*ctxt->input->cur == 0) &&
593 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
594 xmlPopInput(ctxt);
595 } else {
596 if (*(ctxt->input->cur) == '\n') {
597 ctxt->input->line++; ctxt->input->col = 1;
598 } else ctxt->input->col++;
599 ctxt->input->cur++;
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800600 if (*ctxt->input->cur == 0)
601 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
602 }
603 res++;
604 }
605 return(res);
606}
607
608
609
610/************************************************************************
611 * *
612 * The list of HTML elements and their properties *
613 * *
614 ************************************************************************/
615
616/*
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700617 * Start Tag: 1 means the start tag can be omitted
618 * End Tag: 1 means the end tag can be omitted
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800619 * 2 means it's forbidden (empty elements)
620 * 3 means the tag is stylistic and should be closed easily
621 * Depr: this element is deprecated
622 * DTD: 1 means that this element is valid only in the Loose DTD
623 * 2 means that this element is valid only in the Frameset DTD
624 *
625 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
626 , subElements , impliedsubelt , Attributes, userdata
627 */
628
629/* Definitions and a couple of vars for HTML Elements */
630
/*
 * Element name groups and their sizes. Every NB_x macro is the number of
 * names in the matching list x. Composite counts are parenthesized so
 * they can be used safely inside larger expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL
654
655
656static const char* const html_flow[] = { FLOW, NULL } ;
657static const char* const html_inline[] = { INLINE, NULL } ;
658
659/* placeholders: elts with content but no subelements */
660static const char* const html_pcdata[] = { NULL } ;
661#define html_cdata html_pcdata
662
663
664/* ... and for HTML Attributes */
665
/*
 * Attribute name groups and their sizes. Every NB_x macro is the number
 * of names in the matching list x.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* Was spelled NB_NB_COREATTRS, an identifier that does not exist. */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
678
679static const char* const html_attrs[] = { ATTRS, NULL } ;
680static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
681static const char* const core_attrs[] = { COREATTRS, NULL } ;
682static const char* const i18n_attrs[] = { I18N, NULL } ;
683
684
685/* Other declarations that should go inline ... */
686static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
687 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
688 "tabindex", "onfocus", "onblur", NULL } ;
689static const char* const target_attr[] = { "target", NULL } ;
690static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
691static const char* const alt_attr[] = { "alt", NULL } ;
692static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
693static const char* const href_attrs[] = { "href", NULL } ;
694static const char* const clear_attrs[] = { "clear", NULL } ;
695static const char* const inline_p[] = { INLINE, "p", NULL } ;
696
697static const char* const flow_param[] = { FLOW, "param", NULL } ;
698static const char* const applet_attrs[] = { COREATTRS , "codebase",
699 "archive", "alt", "name", "height", "width", "align",
700 "hspace", "vspace", NULL } ;
701static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
702 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
703static const char* const basefont_attrs[] =
704 { "id", "size", "color", "face", NULL } ;
705static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
706static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
707static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
708static const char* const body_depr[] = { "background", "bgcolor", "text",
709 "link", "vlink", "alink", NULL } ;
710static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
711 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
712
713
714static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
715static const char* const col_elt[] = { "col", NULL } ;
716static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
717static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
718static const char* const dl_contents[] = { "dt", "dd", NULL } ;
719static const char* const compact_attr[] = { "compact", NULL } ;
720static const char* const label_attr[] = { "label", NULL } ;
721static const char* const fieldset_contents[] = { FLOW, "legend" } ;
722static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
723static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
724static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
725static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
726static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
727static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
728static const char* const head_attrs[] = { I18N, "profile", NULL } ;
729static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
730static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
731static const char* const version_attr[] = { "version", NULL } ;
732static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
733static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
734static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
735static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
736static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
737static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
738static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
739static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
740static const char* const align_attr[] = { "align", NULL } ;
741static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
742static const char* const map_contents[] = { BLOCK, "area", NULL } ;
743static const char* const name_attr[] = { "name", NULL } ;
744static const char* const action_attr[] = { "action", NULL } ;
745static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
746static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
747static const char* const content_attr[] = { "content", NULL } ;
748static const char* const type_attr[] = { "type", NULL } ;
749static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
750static const char* const object_contents[] = { FLOW, "param", NULL } ;
751static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
752static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
753static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
754static const char* const option_elt[] = { "option", NULL } ;
755static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
756static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
757static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
758static const char* const width_attr[] = { "width", NULL } ;
759static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
760static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
761static const char* const language_attr[] = { "language", NULL } ;
762static const char* const select_content[] = { "optgroup", "option", NULL } ;
763static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
764static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
765static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
766static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
767static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
768static const char* const tr_elt[] = { "tr", NULL } ;
769static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
770static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
771static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
772static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
773static const char* const tr_contents[] = { "th", "td", NULL } ;
774static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
775static const char* const li_elt[] = { "li", NULL } ;
776static const char* const ul_depr[] = { "type", "compact", NULL} ;
777static const char* const dir_attr[] = { "dir", NULL} ;
778
779#define DECL (const char**)
780
781static const htmlElemDesc
782html40ElementTable[] = {
783{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
784 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
785},
786{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
787 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
788},
789{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
790 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
791},
792{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
793 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
794},
795{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
796 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
797},
798{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
799 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
800},
801{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
802 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
803},
804{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
805 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
806},
807{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
808 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
809},
810{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
811 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
812},
813{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
814 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
815},
816{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
817 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
818},
819{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
820 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
821},
822{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
823 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
824},
825{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
826 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
827},
828{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
829 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
830},
831{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
832 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
833},
834{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
835 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
836},
837{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
838 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
839},
840{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
841 EMPTY , NULL , DECL col_attrs , NULL, NULL
842},
843{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
844 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
845},
846{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
847 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
848},
849{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
850 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
851},
852{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
853 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
854},
855{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
856 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
857},
858{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
859 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
860},
861{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
862 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
863},
864{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
865 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
866},
867{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
868 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
869},
870{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
871 EMPTY, NULL, DECL embed_attrs, NULL, NULL
872},
873{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
874 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
875},
876{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
877 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
878},
879{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
880 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
881},
882{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
883 EMPTY, NULL, NULL, DECL frame_attrs, NULL
884},
885{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
886 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
887},
888{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
889 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
890},
891{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
892 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
893},
894{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
895 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
896},
897{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
898 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
899},
900{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
901 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
902},
903{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
904 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
905},
906{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
907 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
908},
909{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
910 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
911},
912{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
913 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
914},
915{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
916 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
917},
918{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
919 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
920},
921{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
922 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
923},
924{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
925 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
926},
927{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
928 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
929},
930{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
931 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
932},
933{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
934 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
935},
936{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
937 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
938},
939{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
940 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
941},
942{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
943 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
944},
945{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
946 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
947},
948{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
949 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
950},
951{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
952 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
953},
954{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
955 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
956},
957{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
958 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
959},
960{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
961 DECL html_flow, "div", DECL html_attrs, NULL, NULL
962},
963{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
964 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
965},
966{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
967 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
968},
969{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
970 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
971},
972{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
973 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
974},
975{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
976 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
977},
978{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
979 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
980},
981{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
982 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
983},
984{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
985 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
986},
987{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
988 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
989},
990{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
991 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
992},
993{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
994 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
995},
996{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
997 DECL select_content, NULL, DECL select_attrs, NULL, NULL
998},
999{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
1000 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1001},
1002{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1003 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1004},
1005{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
1006 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1007},
1008{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1009 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1010},
1011{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1012 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1013},
1014{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1015 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1016},
1017{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1018 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1019},
1020{ "table", 0, 0, 0, 0, 0, 0, 0, "",
1021 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1022},
1023{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1024 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1025},
1026{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1027 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1028},
1029{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1030 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1031},
1032{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1033 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1034},
1035{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1036 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1037},
1038{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1039 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1040},
1041{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1042 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1043},
1044{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1045 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1046},
1047{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1048 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1049},
1050{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1051 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1052},
1053{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1054 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1055},
1056{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1057 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1058}
1059};
1060
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups. The first string
 * of each group is the name of a start tag; the remaining strings of the
 * group are the names of open elements which that start tag implicitly
 * closes (see htmlCheckAutoClose()). One extra NULL after the last group
 * marks the end of the table.
 *
 * htmlInitAutoClose() records the start of each group in
 * htmlStartCloseIndex, which only has room for 99 groups plus a NULL
 * terminator, so keep the number of groups below that limit.
 */
static const char * const htmlStartClose[] = {
"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
 "dl", "ul", "ol", "menu", "dir", "address", "pre",
 "listing", "xmp", "head", NULL,
"head", "p", NULL,
"title", "p", NULL,
"body", "head", "style", "link", "title", "p", NULL,
"frameset", "head", "style", "link", "title", "p", NULL,
"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
 "pre", "listing", "xmp", "head", "li", NULL,
"hr", "p", "head", NULL,
"h1", "p", "head", NULL,
"h2", "p", "head", NULL,
"h3", "p", "head", NULL,
"h4", "p", "head", NULL,
"h5", "p", "head", NULL,
"h6", "p", "head", NULL,
"dir", "p", "head", NULL,
"address", "p", "head", "ul", NULL,
"pre", "p", "head", "ul", NULL,
"listing", "p", "head", NULL,
"xmp", "p", "head", NULL,
"blockquote", "p", "head", NULL,
"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
 "xmp", "head", NULL,
"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
 "head", "dd", NULL,
"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
 "head", "dt", NULL,
"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
 "listing", "xmp", NULL,
"ol", "p", "head", "ul", NULL,
"menu", "p", "head", "ul", NULL,
"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div", "p", "head", NULL,
"noscript", "script", NULL,
"center", "font", "b", "i", "p", "head", NULL,
"a", "a", "head", NULL,
"caption", "p", NULL,
"colgroup", "caption", "colgroup", "col", "p", NULL,
"col", "caption", "col", "p", NULL,
"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
 "listing", "xmp", "a", NULL,
"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead", "caption", "col", "colgroup", NULL,
"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
 "tbody", "p", NULL,
"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
 "tfoot", "tbody", "p", NULL,
"optgroup", "option", NULL,
"option", "option", NULL,
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
 "pre", "listing", "xmp", "a", NULL,
/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
"tt", "head", NULL,
"i", "head", NULL,
"b", "head", NULL,
"u", "head", NULL,
"s", "head", NULL,
"strike", "head", NULL,
"big", "head", NULL,
"small", "head", NULL,

"em", "head", NULL,
"strong", "head", NULL,
"dfn", "head", NULL,
"code", "head", NULL,
"samp", "head", NULL,
"kbd", "head", NULL,
"var", "head", NULL,
"cite", "head", NULL,
"abbr", "head", NULL,
"acronym", "head", NULL,

/* "a" */
"img", "head", NULL,
/* "applet" */
/* "embed" */
/* "object" */
"font", "head", NULL,
/* "basefont" */
"br", "head", NULL,
/* "script" */
"map", "head", NULL,
"q", "head", NULL,
"sub", "head", NULL,
"sup", "head", NULL,
"span", "head", NULL,
"bdo", "head", NULL,
"iframe", "head", NULL,
NULL
};
1158
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 *
 * The list is NULL-terminated; htmlCheckParagraph() compares the name of
 * the current element against each entry.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
1171
/*
 * The list of HTML attributes which are of content %Script;
 *
 * The array is not NULL-terminated: htmlIsScriptAttribute() derives the
 * number of entries from sizeof.
 *
 * NOTE: when adding new entries, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
1197
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags.
 * End tags are only allowed to close elements with lower or equal
 * priority.
 */

/* Associates an element name with its end tag priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The list ends with an entry whose name is NULL; htmlGetEndPriority()
 * stops there, so the priority of that entry is the one returned for
 * every element name which is not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    {"div", 150},
    {"td", 160},
    {"th", 160},
    {"tr", 170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head", 200},
    {"body", 200},
    {"html", 220},
    {NULL, 100} /* Default priority */
};
1225
/*
 * Index into htmlStartClose: each non-NULL entry points to the first string
 * (the start tag name) of one group of that table. It is filled by
 * htmlInitAutoClose(), which leaves the unused trailing entries NULL.
 */
static const char** htmlStartCloseIndex[100];
/* Set to 1 once htmlInitAutoClose() has filled htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized = 0;
1228
1229/************************************************************************
1230 * *
1231 * functions to handle HTML specific data *
1232 * *
1233 ************************************************************************/
1234
1235/**
1236 * htmlInitAutoClose:
1237 *
1238 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1239 * This is not reentrant. Call xmlInitParser() once before processing in
1240 * case of use in multithreaded programs.
1241 */
1242void
1243htmlInitAutoClose(void) {
1244 int indx, i = 0;
1245
1246 if (htmlStartCloseIndexinitialized) return;
1247
1248 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1249 indx = 0;
1250 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1251 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1252 while (htmlStartClose[i] != NULL) i++;
1253 i++;
1254 }
1255 htmlStartCloseIndexinitialized = 1;
1256}
1257
1258/**
1259 * htmlTagLookup:
1260 * @tag: The tag name in lowercase
1261 *
1262 * Lookup the HTML tag in the ElementTable
1263 *
1264 * Returns the related htmlElemDescPtr or NULL if not found.
1265 */
1266const htmlElemDesc *
1267htmlTagLookup(const xmlChar *tag) {
1268 unsigned int i;
1269
1270 for (i = 0; i < (sizeof(html40ElementTable) /
1271 sizeof(html40ElementTable[0]));i++) {
1272 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1273 return((htmlElemDescPtr) &html40ElementTable[i]);
1274 }
1275 return(NULL);
1276}
1277
1278/**
1279 * htmlGetEndPriority:
1280 * @name: The name of the element to look up the priority for.
1281 *
1282 * Return value: The "endtag" priority.
1283 **/
1284static int
1285htmlGetEndPriority (const xmlChar *name) {
1286 int i = 0;
1287
1288 while ((htmlEndPriority[i].name != NULL) &&
1289 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1290 i++;
1291
1292 return(htmlEndPriority[i].priority);
1293}
1294
1295
1296/**
1297 * htmlCheckAutoClose:
1298 * @newtag: The new tag name
1299 * @oldtag: The old tag name
1300 *
1301 * Checks whether the new tag is one of the registered valid tags for
1302 * closing old.
1303 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1304 *
1305 * Returns 0 if no, 1 if yes.
1306 */
1307static int
1308htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1309{
1310 int i, indx;
1311 const char **closed = NULL;
1312
1313 if (htmlStartCloseIndexinitialized == 0)
1314 htmlInitAutoClose();
1315
1316 /* inefficient, but not a big deal */
1317 for (indx = 0; indx < 100; indx++) {
1318 closed = htmlStartCloseIndex[indx];
1319 if (closed == NULL)
1320 return (0);
1321 if (xmlStrEqual(BAD_CAST * closed, newtag))
1322 break;
1323 }
1324
1325 i = closed - htmlStartClose;
1326 i++;
1327 while (htmlStartClose[i] != NULL) {
1328 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1329 return (1);
1330 }
1331 i++;
1332 }
1333 return (0);
1334}
1335
1336/**
1337 * htmlAutoCloseOnClose:
1338 * @ctxt: an HTML parser context
1339 * @newtag: The new tag name
1340 * @force: force the tag closure
1341 *
1342 * The HTML DTD allows an ending tag to implicitly close other tags.
1343 */
1344static void
1345htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1346{
1347 const htmlElemDesc *info;
1348 int i, priority;
1349
1350 priority = htmlGetEndPriority(newtag);
1351
1352 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1353
1354 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1355 break;
1356 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07001357 * A misplaced endtag can only close elements with lower
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001358 * or equal priority, so if we find an element with higher
1359 * priority before we find an element with
1360 * matching name, we just ignore this endtag
1361 */
1362 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1363 return;
1364 }
1365 if (i < 0)
1366 return;
1367
1368 while (!xmlStrEqual(newtag, ctxt->name)) {
1369 info = htmlTagLookup(ctxt->name);
1370 if ((info != NULL) && (info->endTag == 3)) {
1371 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1372 "Opening and ending tag mismatch: %s and %s\n",
1373 newtag, ctxt->name);
1374 }
1375 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1376 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1377 htmlnamePop(ctxt);
1378 }
1379}
1380
1381/**
1382 * htmlAutoCloseOnEnd:
1383 * @ctxt: an HTML parser context
1384 *
1385 * Close all remaining tags at the end of the stream
1386 */
1387static void
1388htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1389{
1390 int i;
1391
1392 if (ctxt->nameNr == 0)
1393 return;
1394 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1395 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1396 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1397 htmlnamePop(ctxt);
1398 }
1399}
1400
1401/**
1402 * htmlAutoClose:
1403 * @ctxt: an HTML parser context
1404 * @newtag: The new tag name or NULL
1405 *
1406 * The HTML DTD allows a tag to implicitly close other tags.
1407 * The list is kept in htmlStartClose array. This function is
1408 * called when a new tag has been detected and generates the
1409 * appropriates closes if possible/needed.
1410 * If newtag is NULL this mean we are at the end of the resource
1411 * and we should check
1412 */
1413static void
1414htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1415{
1416 while ((newtag != NULL) && (ctxt->name != NULL) &&
1417 (htmlCheckAutoClose(newtag, ctxt->name))) {
1418 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1420 htmlnamePop(ctxt);
1421 }
1422 if (newtag == NULL) {
1423 htmlAutoCloseOnEnd(ctxt);
1424 return;
1425 }
1426 while ((newtag == NULL) && (ctxt->name != NULL) &&
1427 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1428 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1429 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1430 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1431 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1432 htmlnamePop(ctxt);
1433 }
1434}
1435
1436/**
1437 * htmlAutoCloseTag:
1438 * @doc: the HTML document
1439 * @name: The tag name
1440 * @elem: the HTML element
1441 *
1442 * The HTML DTD allows a tag to implicitly close other tags.
1443 * The list is kept in htmlStartClose array. This function checks
1444 * if the element or one of it's children would autoclose the
1445 * given tag.
1446 *
1447 * Returns 1 if autoclose, 0 otherwise
1448 */
1449int
1450htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1451 htmlNodePtr child;
1452
1453 if (elem == NULL) return(1);
1454 if (xmlStrEqual(name, elem->name)) return(0);
1455 if (htmlCheckAutoClose(elem->name, name)) return(1);
1456 child = elem->children;
1457 while (child != NULL) {
1458 if (htmlAutoCloseTag(doc, name, child)) return(1);
1459 child = child->next;
1460 }
1461 return(0);
1462}
1463
1464/**
1465 * htmlIsAutoClosed:
1466 * @doc: the HTML document
1467 * @elem: the HTML element
1468 *
1469 * The HTML DTD allows a tag to implicitly close other tags.
1470 * The list is kept in htmlStartClose array. This function checks
1471 * if a tag is autoclosed by one of it's child
1472 *
1473 * Returns 1 if autoclosed, 0 otherwise
1474 */
1475int
1476htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1477 htmlNodePtr child;
1478
1479 if (elem == NULL) return(1);
1480 child = elem->children;
1481 while (child != NULL) {
1482 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1483 child = child->next;
1484 }
1485 return(0);
1486}
1487
1488/**
1489 * htmlCheckImplied:
1490 * @ctxt: an HTML parser context
1491 * @newtag: The new tag name
1492 *
1493 * The HTML DTD allows a tag to exists only implicitly
1494 * called when a new tag has been detected and generates the
1495 * appropriates implicit tags if missing
1496 */
1497static void
1498htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1499 int i;
1500
1501 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1502 return;
1503 if (!htmlOmittedDefaultValue)
1504 return;
1505 if (xmlStrEqual(newtag, BAD_CAST"html"))
1506 return;
1507 if (ctxt->nameNr <= 0) {
1508 htmlnamePush(ctxt, BAD_CAST"html");
1509 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1510 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1511 }
1512 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1513 return;
1514 if ((ctxt->nameNr <= 1) &&
1515 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1516 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1517 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1518 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1519 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1520 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1521 if (ctxt->html >= 3) {
1522 /* we already saw or generated an <head> before */
1523 return;
1524 }
1525 /*
1526 * dropped OBJECT ... i you put it first BODY will be
1527 * assumed !
1528 */
1529 htmlnamePush(ctxt, BAD_CAST"head");
1530 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1531 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1532 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1533 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1534 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1535 if (ctxt->html >= 10) {
1536 /* we already saw or generated a <body> before */
1537 return;
1538 }
1539 for (i = 0;i < ctxt->nameNr;i++) {
1540 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1541 return;
1542 }
1543 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1544 return;
1545 }
1546 }
1547
1548 htmlnamePush(ctxt, BAD_CAST"body");
1549 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1550 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1551 }
1552}
1553
1554/**
1555 * htmlCheckParagraph
1556 * @ctxt: an HTML parser context
1557 *
1558 * Check whether a p element need to be implied before inserting
1559 * characters in the current element.
1560 *
1561 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1562 * in case of error.
1563 */
1564
1565static int
1566htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1567 const xmlChar *tag;
1568 int i;
1569
1570 if (ctxt == NULL)
1571 return(-1);
1572 tag = ctxt->name;
1573 if (tag == NULL) {
1574 htmlAutoClose(ctxt, BAD_CAST"p");
1575 htmlCheckImplied(ctxt, BAD_CAST"p");
1576 htmlnamePush(ctxt, BAD_CAST"p");
1577 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1578 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1579 return(1);
1580 }
1581 if (!htmlOmittedDefaultValue)
1582 return(0);
1583 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1584 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1585 htmlAutoClose(ctxt, BAD_CAST"p");
1586 htmlCheckImplied(ctxt, BAD_CAST"p");
1587 htmlnamePush(ctxt, BAD_CAST"p");
1588 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1589 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1590 return(1);
1591 }
1592 }
1593 return(0);
1594}
1595
1596/**
1597 * htmlIsScriptAttribute:
1598 * @name: an attribute name
1599 *
1600 * Check if an attribute is of content type Script
1601 *
1602 * Returns 1 is the attribute is a script 0 otherwise
1603 */
1604int
1605htmlIsScriptAttribute(const xmlChar *name) {
1606 unsigned int i;
1607
1608 if (name == NULL)
1609 return(0);
1610 /*
1611 * all script attributes start with 'on'
1612 */
1613 if ((name[0] != 'o') || (name[1] != 'n'))
1614 return(0);
1615 for (i = 0;
1616 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1617 i++) {
1618 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1619 return(1);
1620 }
1621 return(0);
1622}
1623
1624/************************************************************************
1625 * *
1626 * The list of HTML predefined entities *
1627 * *
1628 ************************************************************************/
1629
1630
1631static const htmlEntityDesc html40EntitiesTable[] = {
1632/*
1633 * the 4 absolute ones, plus apostrophe.
1634 */
1635{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1636{ 38, "amp", "ampersand, U+0026 ISOnum" },
1637{ 39, "apos", "single quote" },
1638{ 60, "lt", "less-than sign, U+003C ISOnum" },
1639{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1640
1641/*
1642 * A bunch still in the 128-255 range
1643 * Replacing them depend really on the charset used.
1644 */
1645{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1646{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1647{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1648{ 163, "pound","pound sign, U+00A3 ISOnum" },
1649{ 164, "curren","currency sign, U+00A4 ISOnum" },
1650{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1651{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1652{ 167, "sect", "section sign, U+00A7 ISOnum" },
1653{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1654{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1655{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1656{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1657{ 172, "not", "not sign, U+00AC ISOnum" },
1658{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1659{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1660{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1661{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1662{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1663{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1664{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1665{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1666{ 181, "micro","micro sign, U+00B5 ISOnum" },
1667{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1668{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1669{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1670{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1671{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1672{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1673{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1674{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1675{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1676{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1677{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1678{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1679{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1680{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1681{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1682{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1683{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1684{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1685{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1686{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1687{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1688{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1689{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1690{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1691{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1692{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1693{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1694{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1695{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1696{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1697{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1698{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1699{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1700{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1701{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1702{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1703{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1704{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1705{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1706{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1707{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1708{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1709{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1710{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1711{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1712{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1713{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1714{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1715{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1716{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1717{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1718{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1719{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1720{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1721{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1722{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1723{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1724{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1725{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1726{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1727{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1728{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1729{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1730{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1731{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1732{ 247, "divide","division sign, U+00F7 ISOnum" },
1733{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1734{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1735{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1736{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1737{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1738{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1739{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1740{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1741
1742{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1743{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1744{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1745{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1746{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1747
1748/*
1749 * Anything below should really be kept as entities references
1750 */
1751{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1752
1753{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1754{ 732, "tilde","small tilde, U+02DC ISOdia" },
1755
1756{ 913, "Alpha","greek capital letter alpha, U+0391" },
1757{ 914, "Beta", "greek capital letter beta, U+0392" },
1758{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1759{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1760{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1761{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1762{ 919, "Eta", "greek capital letter eta, U+0397" },
1763{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1764{ 921, "Iota", "greek capital letter iota, U+0399" },
1765{ 922, "Kappa","greek capital letter kappa, U+039A" },
1766{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1767{ 924, "Mu", "greek capital letter mu, U+039C" },
1768{ 925, "Nu", "greek capital letter nu, U+039D" },
1769{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1770{ 927, "Omicron","greek capital letter omicron, U+039F" },
1771{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1772{ 929, "Rho", "greek capital letter rho, U+03A1" },
1773{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1774{ 932, "Tau", "greek capital letter tau, U+03A4" },
1775{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1776{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1777{ 935, "Chi", "greek capital letter chi, U+03A7" },
1778{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1779{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1780
1781{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1782{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1783{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1784{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1785{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1786{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1787{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1788{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1789{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1790{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1791{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1792{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1793{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1794{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1795{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1796{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1797{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1798{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1799{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1800{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1801{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1802{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1803{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1804{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1805{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1806{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1807{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1808{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1809
1810{ 8194, "ensp", "en space, U+2002 ISOpub" },
1811{ 8195, "emsp", "em space, U+2003 ISOpub" },
1812{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1813{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1814{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1815{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1816{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1817{ 8211, "ndash","en dash, U+2013 ISOpub" },
1818{ 8212, "mdash","em dash, U+2014 ISOpub" },
1819{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1820{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1821{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1822{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1823{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1824{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1825{ 8224, "dagger","dagger, U+2020 ISOpub" },
1826{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1827
1828{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1829{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1830
1831{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1832
1833{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1834{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1835
1836{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1837{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1838
1839{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1840{ 8260, "frasl","fraction slash, U+2044 NEW" },
1841
1842{ 8364, "euro", "euro sign, U+20AC NEW" },
1843
1844{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1845{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1846{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1847{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1848{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1849{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1850{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1851{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1852{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1853{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1854{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1855{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1856{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1857{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1858{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1859{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1860
1861{ 8704, "forall","for all, U+2200 ISOtech" },
1862{ 8706, "part", "partial differential, U+2202 ISOtech" },
1863{ 8707, "exist","there exists, U+2203 ISOtech" },
1864{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1865{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1866{ 8712, "isin", "element of, U+2208 ISOtech" },
1867{ 8713, "notin","not an element of, U+2209 ISOtech" },
1868{ 8715, "ni", "contains as member, U+220B ISOtech" },
1869{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1870{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1871{ 8722, "minus","minus sign, U+2212 ISOtech" },
1872{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1873{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1874{ 8733, "prop", "proportional to, U+221D ISOtech" },
1875{ 8734, "infin","infinity, U+221E ISOtech" },
1876{ 8736, "ang", "angle, U+2220 ISOamso" },
1877{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1878{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1879{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1880{ 8746, "cup", "union = cup, U+222A ISOtech" },
1881{ 8747, "int", "integral, U+222B ISOtech" },
1882{ 8756, "there4","therefore, U+2234 ISOtech" },
1883{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1884{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1885{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1886{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1887{ 8801, "equiv","identical to, U+2261 ISOtech" },
1888{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1889{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1890{ 8834, "sub", "subset of, U+2282 ISOtech" },
1891{ 8835, "sup", "superset of, U+2283 ISOtech" },
1892{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1893{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1894{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1895{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1896{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1897{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1898{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1899{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1900{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1901{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1902{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1903{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1904{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1905{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1906
1907{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1908{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1909{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1910{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1911
1912};
1913
1914/************************************************************************
1915 * *
1916 * Commodity functions to handle entities *
1917 * *
1918 ************************************************************************/
1919
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable holding the buffer to enlarge
 *
 * Doubles the companion size variable (@buffer followed by "_size") and
 * reallocates @buffer to that many xmlChar.
 *
 * The expansion also uses a variable named ctxt from the calling scope.
 * If the reallocation fails the error is reported through htmlErrMemory(),
 * the old buffer is released and the *enclosing function* returns NULL,
 * so this can only be used in functions returning a pointer.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *grown;                                                     \
    buffer##_size *= 2;                                                 \
    grown = (xmlChar *) xmlRealloc(buffer,                              \
                                   buffer##_size * sizeof(xmlChar));    \
    if (grown != NULL) {                                                \
        buffer = grown;                                                 \
    } else {                                                            \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
}
1934
1935/**
1936 * htmlEntityLookup:
1937 * @name: the entity name
1938 *
1939 * Lookup the given entity in EntitiesTable
1940 *
1941 * TODO: the linear scan is really ugly, an hash table is really needed.
1942 *
1943 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1944 */
1945const htmlEntityDesc *
1946htmlEntityLookup(const xmlChar *name) {
1947 unsigned int i;
1948
1949 for (i = 0;i < (sizeof(html40EntitiesTable)/
1950 sizeof(html40EntitiesTable[0]));i++) {
1951 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1952 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1953 }
1954 }
1955 return(NULL);
1956}
1957
1958/**
1959 * htmlEntityValueLookup:
1960 * @value: the entity's unicode value
1961 *
1962 * Lookup the given entity in EntitiesTable
1963 *
1964 * TODO: the linear scan is really ugly, an hash table is really needed.
1965 *
1966 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1967 */
1968const htmlEntityDesc *
1969htmlEntityValueLookup(unsigned int value) {
1970 unsigned int i;
1971
1972 for (i = 0;i < (sizeof(html40EntitiesTable)/
1973 sizeof(html40EntitiesTable[0]));i++) {
1974 if (html40EntitiesTable[i].value >= value) {
1975 if (html40EntitiesTable[i].value > value)
1976 break;
1977 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1978 }
1979 }
1980 return(NULL);
1981}
1982
1983/**
1984 * UTF8ToHtml:
1985 * @out: a pointer to an array of bytes to store the result
1986 * @outlen: the length of @out
1987 * @in: a pointer to an array of UTF-8 chars
1988 * @inlen: the length of @in
1989 *
1990 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1991 * plus HTML entities block of chars out.
1992 *
1993 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1994 * The value of @inlen after return is the number of octets consumed
1995 * as the return value is positive, else unpredictable.
1996 * The value of @outlen after return is the number of octets consumed.
1997 */
1998int
1999UTF8ToHtml(unsigned char* out, int *outlen,
2000 const unsigned char* in, int *inlen) {
2001 const unsigned char* processed = in;
2002 const unsigned char* outend;
2003 const unsigned char* outstart = out;
2004 const unsigned char* instart = in;
2005 const unsigned char* inend;
2006 unsigned int c, d;
2007 int trailing;
2008
2009 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2010 if (in == NULL) {
2011 /*
2012 * initialization nothing to do
2013 */
2014 *outlen = 0;
2015 *inlen = 0;
2016 return(0);
2017 }
2018 inend = in + (*inlen);
2019 outend = out + (*outlen);
2020 while (in < inend) {
2021 d = *in++;
2022 if (d < 0x80) { c= d; trailing= 0; }
2023 else if (d < 0xC0) {
2024 /* trailing byte in leading position */
2025 *outlen = out - outstart;
2026 *inlen = processed - instart;
2027 return(-2);
2028 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2029 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2030 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2031 else {
2032 /* no chance for this in Ascii */
2033 *outlen = out - outstart;
2034 *inlen = processed - instart;
2035 return(-2);
2036 }
2037
2038 if (inend - in < trailing) {
2039 break;
2040 }
2041
2042 for ( ; trailing; trailing--) {
2043 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2044 break;
2045 c <<= 6;
2046 c |= d & 0x3F;
2047 }
2048
2049 /* assertion: c is a single UTF-4 value */
2050 if (c < 0x80) {
2051 if (out + 1 >= outend)
2052 break;
2053 *out++ = c;
2054 } else {
2055 int len;
2056 const htmlEntityDesc * ent;
2057 const char *cp;
2058 char nbuf[16];
2059
2060 /*
2061 * Try to lookup a predefined HTML entity for it
2062 */
2063
2064 ent = htmlEntityValueLookup(c);
2065 if (ent == NULL) {
2066 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2067 cp = nbuf;
2068 }
2069 else
2070 cp = ent->name;
2071 len = strlen(cp);
2072 if (out + 2 + len >= outend)
2073 break;
2074 *out++ = '&';
2075 memcpy(out, cp, len);
2076 out += len;
2077 *out++ = ';';
2078 }
2079 processed = in;
2080 }
2081 *outlen = out - outstart;
2082 *inlen = processed - instart;
2083 return(0);
2084}
2085
2086/**
2087 * htmlEncodeEntities:
2088 * @out: a pointer to an array of bytes to store the result
2089 * @outlen: the length of @out
2090 * @in: a pointer to an array of UTF-8 chars
2091 * @inlen: the length of @in
2092 * @quoteChar: the quote character to escape (' or ") or zero.
2093 *
2094 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2095 * plus HTML entities block of chars out.
2096 *
2097 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2098 * The value of @inlen after return is the number of octets consumed
2099 * as the return value is positive, else unpredictable.
2100 * The value of @outlen after return is the number of octets consumed.
2101 */
2102int
2103htmlEncodeEntities(unsigned char* out, int *outlen,
2104 const unsigned char* in, int *inlen, int quoteChar) {
2105 const unsigned char* processed = in;
2106 const unsigned char* outend;
2107 const unsigned char* outstart = out;
2108 const unsigned char* instart = in;
2109 const unsigned char* inend;
2110 unsigned int c, d;
2111 int trailing;
2112
2113 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2114 return(-1);
2115 outend = out + (*outlen);
2116 inend = in + (*inlen);
2117 while (in < inend) {
2118 d = *in++;
2119 if (d < 0x80) { c= d; trailing= 0; }
2120 else if (d < 0xC0) {
2121 /* trailing byte in leading position */
2122 *outlen = out - outstart;
2123 *inlen = processed - instart;
2124 return(-2);
2125 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2126 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2127 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2128 else {
2129 /* no chance for this in Ascii */
2130 *outlen = out - outstart;
2131 *inlen = processed - instart;
2132 return(-2);
2133 }
2134
2135 if (inend - in < trailing)
2136 break;
2137
2138 while (trailing--) {
2139 if (((d= *in++) & 0xC0) != 0x80) {
2140 *outlen = out - outstart;
2141 *inlen = processed - instart;
2142 return(-2);
2143 }
2144 c <<= 6;
2145 c |= d & 0x3F;
2146 }
2147
2148 /* assertion: c is a single UTF-4 value */
2149 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2150 (c != '&') && (c != '<') && (c != '>')) {
2151 if (out >= outend)
2152 break;
2153 *out++ = c;
2154 } else {
2155 const htmlEntityDesc * ent;
2156 const char *cp;
2157 char nbuf[16];
2158 int len;
2159
2160 /*
2161 * Try to lookup a predefined HTML entity for it
2162 */
2163 ent = htmlEntityValueLookup(c);
2164 if (ent == NULL) {
2165 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2166 cp = nbuf;
2167 }
2168 else
2169 cp = ent->name;
2170 len = strlen(cp);
2171 if (out + 2 + len > outend)
2172 break;
2173 *out++ = '&';
2174 memcpy(out, cp, len);
2175 out += len;
2176 *out++ = ';';
2177 }
2178 processed = in;
2179 }
2180 *outlen = out - outstart;
2181 *inlen = processed - instart;
2182 return(0);
2183}
2184
2185/************************************************************************
2186 * *
2187 * Commodity functions to handle streams *
2188 * *
2189 ************************************************************************/
2190
#ifdef LIBXML_PUSH_ENABLED
/**
 * htmlNewInputStream:
 * @ctxt:  an HTML parser context
 *
 * Allocate a new input stream structure. Every field is cleared, except
 * the position which starts at line 1, column 1. An allocation failure
 * is reported through htmlErrMemory().
 *
 * Returns the new input stream or NULL
 */
static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
    htmlParserInputPtr stream;

    stream = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
    if (stream == NULL) {
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
        return(NULL);
    }
    memset(stream, 0, sizeof(htmlParserInput));

    /* no buffer and no source information yet */
    stream->buf = NULL;
    stream->base = NULL;
    stream->cur = NULL;
    stream->filename = NULL;
    stream->directory = NULL;
    stream->version = NULL;
    stream->free = NULL;

    /* positions are 1-based, nothing has been read so far */
    stream->line = 1;
    stream->col = 1;
    stream->consumed = 0;
    stream->length = 0;

    return(stream);
}
#endif
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002224
2225
2226/************************************************************************
2227 * *
2228 * Commodity functions, cleanup needed ? *
2229 * *
2230 ************************************************************************/
/*
 * Names of all the elements allowing PCDATA, taken from the HTML 4.01
 * loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2245
2246/**
2247 * areBlanks:
2248 * @ctxt: an HTML parser context
2249 * @str: a xmlChar *
2250 * @len: the size of @str
2251 *
2252 * Is this a sequence of blank chars that one can ignore ?
2253 *
2254 * Returns 1 if ignorable 0 otherwise.
2255 */
2256
2257static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2258 unsigned int i;
2259 int j;
2260 xmlNodePtr lastChild;
2261 xmlDtdPtr dtd;
2262
2263 for (j = 0;j < len;j++)
2264 if (!(IS_BLANK_CH(str[j]))) return(0);
2265
2266 if (CUR == 0) return(1);
2267 if (CUR != '<') return(0);
2268 if (ctxt->name == NULL)
2269 return(1);
2270 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2271 return(1);
2272 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2273 return(1);
2274
2275 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2276 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2277 dtd = xmlGetIntSubset(ctxt->myDoc);
2278 if (dtd != NULL && dtd->ExternalID != NULL) {
2279 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2280 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2281 return(1);
2282 }
2283 }
2284
2285 if (ctxt->node == NULL) return(0);
2286 lastChild = xmlGetLastChild(ctxt->node);
2287 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2288 lastChild = lastChild->prev;
2289 if (lastChild == NULL) {
2290 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2291 (ctxt->node->content != NULL)) return(0);
2292 /* keep ws in constructs like ...<b> </b>...
2293 for all tags "b" allowing PCDATA */
2294 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2295 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2296 return(0);
2297 }
2298 }
2299 } else if (xmlNodeIsText(lastChild)) {
2300 return(0);
2301 } else {
2302 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2303 for all tags "p" allowing PCDATA */
2304 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2305 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2306 return(0);
2307 }
2308 }
2309 }
2310 return(1);
2311}
2312
2313/**
2314 * htmlNewDocNoDtD:
2315 * @URI: URI for the dtd, or NULL
2316 * @ExternalID: the external ID of the DTD, or NULL
2317 *
2318 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2319 * are NULL
2320 *
2321 * Returns a new document, do not initialize the DTD if not provided
2322 */
2323htmlDocPtr
2324htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2325 xmlDocPtr cur;
2326
2327 /*
2328 * Allocate a new document and fill the fields.
2329 */
2330 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2331 if (cur == NULL) {
2332 htmlErrMemory(NULL, "HTML document creation failed\n");
2333 return(NULL);
2334 }
2335 memset(cur, 0, sizeof(xmlDoc));
2336
2337 cur->type = XML_HTML_DOCUMENT_NODE;
2338 cur->version = NULL;
2339 cur->intSubset = NULL;
2340 cur->doc = cur;
2341 cur->name = NULL;
2342 cur->children = NULL;
2343 cur->extSubset = NULL;
2344 cur->oldNs = NULL;
2345 cur->encoding = NULL;
2346 cur->standalone = 1;
2347 cur->compression = 0;
2348 cur->ids = NULL;
2349 cur->refs = NULL;
2350 cur->_private = NULL;
2351 cur->charset = XML_CHAR_ENCODING_UTF8;
2352 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2353 if ((ExternalID != NULL) ||
2354 (URI != NULL))
2355 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2356 return(cur);
2357}
2358
2359/**
2360 * htmlNewDoc:
2361 * @URI: URI for the dtd, or NULL
2362 * @ExternalID: the external ID of the DTD, or NULL
2363 *
2364 * Creates a new HTML document
2365 *
2366 * Returns a new document
2367 */
2368htmlDocPtr
2369htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2370 if ((URI == NULL) && (ExternalID == NULL))
2371 return(htmlNewDocNoDtD(
2372 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2373 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2374
2375 return(htmlNewDocNoDtD(URI, ExternalID));
2376}
2377
2378
2379/************************************************************************
2380 * *
2381 * The parser itself *
2382 * Relates to http://www.w3.org/TR/html40 *
2383 * *
2384 ************************************************************************/
2385
2386/************************************************************************
2387 * *
2388 * The parser itself *
2389 * *
2390 ************************************************************************/
2391
2392static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2393
2394/**
2395 * htmlParseHTMLName:
2396 * @ctxt: an HTML parser context
2397 *
2398 * parse an HTML tag or attribute name, note that we convert it to lowercase
2399 * since HTML names are not case-sensitive.
2400 *
2401 * Returns the Tag Name parsed or NULL
2402 */
2403
2404static const xmlChar *
2405htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2406 int i = 0;
2407 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2408
2409 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2410 (CUR != ':') && (CUR != '.')) return(NULL);
2411
2412 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2413 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2414 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2415 (CUR == '.'))) {
2416 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2417 else loc[i] = CUR;
2418 i++;
2419
2420 NEXT;
2421 }
2422
2423 return(xmlDictLookup(ctxt->dict, loc, i));
2424}
2425
2426
2427/**
2428 * htmlParseHTMLName_nonInvasive:
2429 * @ctxt: an HTML parser context
2430 *
2431 * parse an HTML tag or attribute name, note that we convert it to lowercase
2432 * since HTML names are not case-sensitive, this doesn't consume the data
2433 * from the stream, it's a look-ahead
2434 *
2435 * Returns the Tag Name parsed or NULL
2436 */
2437
2438static const xmlChar *
2439htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2440 int i = 0;
2441 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2442
2443 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2444 (NXT(1) != ':')) return(NULL);
2445
2446 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2447 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2448 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2449 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2450 else loc[i] = NXT(1+i);
2451 i++;
2452 }
2453
2454 return(xmlDictLookup(ctxt->dict, loc, i));
2455}
2456
2457
2458/**
2459 * htmlParseName:
2460 * @ctxt: an HTML parser context
2461 *
2462 * parse an HTML name, this routine is case sensitive.
2463 *
2464 * Returns the Name parsed or NULL
2465 */
2466
2467static const xmlChar *
2468htmlParseName(htmlParserCtxtPtr ctxt) {
2469 const xmlChar *in;
2470 const xmlChar *ret;
2471 int count = 0;
2472
2473 GROW;
2474
2475 /*
2476 * Accelerator for simple ASCII names
2477 */
2478 in = ctxt->input->cur;
2479 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2480 ((*in >= 0x41) && (*in <= 0x5A)) ||
2481 (*in == '_') || (*in == ':')) {
2482 in++;
2483 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2484 ((*in >= 0x41) && (*in <= 0x5A)) ||
2485 ((*in >= 0x30) && (*in <= 0x39)) ||
2486 (*in == '_') || (*in == '-') ||
2487 (*in == ':') || (*in == '.'))
2488 in++;
2489
2490 if (in == ctxt->input->end)
2491 return(NULL);
2492
2493 if ((*in > 0) && (*in < 0x80)) {
2494 count = in - ctxt->input->cur;
2495 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2496 ctxt->input->cur = in;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002497 ctxt->input->col += count;
2498 return(ret);
2499 }
2500 }
2501 return(htmlParseNameComplex(ctxt));
2502}
2503
2504static const xmlChar *
2505htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2506 int len = 0, l;
2507 int c;
2508 int count = 0;
2509 const xmlChar *base = ctxt->input->base;
2510
2511 /*
2512 * Handler for more complex cases
2513 */
2514 GROW;
2515 c = CUR_CHAR(l);
2516 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2517 (!IS_LETTER(c) && (c != '_') &&
2518 (c != ':'))) {
2519 return(NULL);
2520 }
2521
2522 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2523 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2524 (c == '.') || (c == '-') ||
2525 (c == '_') || (c == ':') ||
2526 (IS_COMBINING(c)) ||
2527 (IS_EXTENDER(c)))) {
2528 if (count++ > 100) {
2529 count = 0;
2530 GROW;
2531 }
2532 len += l;
2533 NEXTL(l);
2534 c = CUR_CHAR(l);
2535 if (ctxt->input->base != base) {
2536 /*
2537 * We changed encoding from an unknown encoding
2538 * Input buffer changed location, so we better start again
2539 */
2540 return(htmlParseNameComplex(ctxt));
2541 }
2542 }
2543
2544 if (ctxt->input->cur - ctxt->input->base < len) {
2545 /* Sanity check */
2546 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2547 "unexpected change of input buffer", NULL, NULL);
2548 return (NULL);
2549 }
2550
2551 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2552}
2553
2554
2555/**
2556 * htmlParseHTMLAttribute:
2557 * @ctxt: an HTML parser context
2558 * @stop: a char stop value
2559 *
2560 * parse an HTML attribute value till the stop (quote), if
2561 * stop is 0 then it stops at the first space
2562 *
2563 * Returns the attribute parsed or NULL
2564 */
2565
2566static xmlChar *
2567htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2568 xmlChar *buffer = NULL;
2569 int buffer_size = 0;
2570 xmlChar *out = NULL;
2571 const xmlChar *name = NULL;
2572 const xmlChar *cur = NULL;
2573 const htmlEntityDesc * ent;
2574
2575 /*
2576 * allocate a translation buffer.
2577 */
2578 buffer_size = HTML_PARSER_BUFFER_SIZE;
2579 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2580 if (buffer == NULL) {
2581 htmlErrMemory(ctxt, "buffer allocation failed\n");
2582 return(NULL);
2583 }
2584 out = buffer;
2585
2586 /*
2587 * Ok loop until we reach one of the ending chars
2588 */
2589 while ((CUR != 0) && (CUR != stop)) {
2590 if ((stop == 0) && (CUR == '>')) break;
2591 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2592 if (CUR == '&') {
2593 if (NXT(1) == '#') {
2594 unsigned int c;
2595 int bits;
2596
2597 c = htmlParseCharRef(ctxt);
2598 if (c < 0x80)
2599 { *out++ = c; bits= -6; }
2600 else if (c < 0x800)
2601 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2602 else if (c < 0x10000)
2603 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2604 else
2605 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2606
2607 for ( ; bits >= 0; bits-= 6) {
2608 *out++ = ((c >> bits) & 0x3F) | 0x80;
2609 }
2610
2611 if (out - buffer > buffer_size - 100) {
2612 int indx = out - buffer;
2613
2614 growBuffer(buffer);
2615 out = &buffer[indx];
2616 }
2617 } else {
2618 ent = htmlParseEntityRef(ctxt, &name);
2619 if (name == NULL) {
2620 *out++ = '&';
2621 if (out - buffer > buffer_size - 100) {
2622 int indx = out - buffer;
2623
2624 growBuffer(buffer);
2625 out = &buffer[indx];
2626 }
2627 } else if (ent == NULL) {
2628 *out++ = '&';
2629 cur = name;
2630 while (*cur != 0) {
2631 if (out - buffer > buffer_size - 100) {
2632 int indx = out - buffer;
2633
2634 growBuffer(buffer);
2635 out = &buffer[indx];
2636 }
2637 *out++ = *cur++;
2638 }
2639 } else {
2640 unsigned int c;
2641 int bits;
2642
2643 if (out - buffer > buffer_size - 100) {
2644 int indx = out - buffer;
2645
2646 growBuffer(buffer);
2647 out = &buffer[indx];
2648 }
2649 c = ent->value;
2650 if (c < 0x80)
2651 { *out++ = c; bits= -6; }
2652 else if (c < 0x800)
2653 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2654 else if (c < 0x10000)
2655 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2656 else
2657 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2658
2659 for ( ; bits >= 0; bits-= 6) {
2660 *out++ = ((c >> bits) & 0x3F) | 0x80;
2661 }
2662 }
2663 }
2664 } else {
2665 unsigned int c;
2666 int bits, l;
2667
2668 if (out - buffer > buffer_size - 100) {
2669 int indx = out - buffer;
2670
2671 growBuffer(buffer);
2672 out = &buffer[indx];
2673 }
2674 c = CUR_CHAR(l);
2675 if (c < 0x80)
2676 { *out++ = c; bits= -6; }
2677 else if (c < 0x800)
2678 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2679 else if (c < 0x10000)
2680 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2681 else
2682 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2683
2684 for ( ; bits >= 0; bits-= 6) {
2685 *out++ = ((c >> bits) & 0x3F) | 0x80;
2686 }
2687 NEXT;
2688 }
2689 }
2690 *out = 0;
2691 return(buffer);
2692}
2693
2694/**
2695 * htmlParseEntityRef:
2696 * @ctxt: an HTML parser context
2697 * @str: location to store the entity name
2698 *
2699 * parse an HTML ENTITY references
2700 *
2701 * [68] EntityRef ::= '&' Name ';'
2702 *
2703 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2704 * if non-NULL *str will have to be freed by the caller.
2705 */
2706const htmlEntityDesc *
2707htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2708 const xmlChar *name;
2709 const htmlEntityDesc * ent = NULL;
2710
2711 if (str != NULL) *str = NULL;
2712 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2713
2714 if (CUR == '&') {
2715 NEXT;
2716 name = htmlParseName(ctxt);
2717 if (name == NULL) {
2718 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2719 "htmlParseEntityRef: no name\n", NULL, NULL);
2720 } else {
2721 GROW;
2722 if (CUR == ';') {
2723 if (str != NULL)
2724 *str = name;
2725
2726 /*
2727 * Lookup the entity in the table.
2728 */
2729 ent = htmlEntityLookup(name);
2730 if (ent != NULL) /* OK that's ugly !!! */
2731 NEXT;
2732 } else {
2733 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2734 "htmlParseEntityRef: expecting ';'\n",
2735 NULL, NULL);
2736 if (str != NULL)
2737 *str = name;
2738 }
2739 }
2740 }
2741 return(ent);
2742}
2743
2744/**
2745 * htmlParseAttValue:
2746 * @ctxt: an HTML parser context
2747 *
2748 * parse a value for an attribute
2749 * Note: the parser won't do substitution of entities here, this
2750 * will be handled later in xmlStringGetNodeList, unless it was
2751 * asked for ctxt->replaceEntities != 0
2752 *
2753 * Returns the AttValue parsed or NULL.
2754 */
2755
2756static xmlChar *
2757htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2758 xmlChar *ret = NULL;
2759
2760 if (CUR == '"') {
2761 NEXT;
2762 ret = htmlParseHTMLAttribute(ctxt, '"');
2763 if (CUR != '"') {
2764 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2765 "AttValue: \" expected\n", NULL, NULL);
2766 } else
2767 NEXT;
2768 } else if (CUR == '\'') {
2769 NEXT;
2770 ret = htmlParseHTMLAttribute(ctxt, '\'');
2771 if (CUR != '\'') {
2772 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2773 "AttValue: ' expected\n", NULL, NULL);
2774 } else
2775 NEXT;
2776 } else {
2777 /*
2778 * That's an HTMLism, the attribute value may not be quoted
2779 */
2780 ret = htmlParseHTMLAttribute(ctxt, 0);
2781 if (ret == NULL) {
2782 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2783 "AttValue: no value found\n", NULL, NULL);
2784 }
2785 }
2786 return(ret);
2787}
2788
2789/**
2790 * htmlParseSystemLiteral:
2791 * @ctxt: an HTML parser context
2792 *
2793 * parse an HTML Literal
2794 *
2795 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2796 *
2797 * Returns the SystemLiteral parsed or NULL
2798 */
2799
2800static xmlChar *
2801htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2802 size_t len = 0, startPosition = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002803 int err = 0;
2804 int quote;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002805 xmlChar *ret = NULL;
2806
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002807 if ((CUR != '"') && (CUR != '\'')) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002808 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002809 "SystemLiteral \" or ' expected\n", NULL, NULL);
2810 return(NULL);
2811 }
2812 quote = CUR;
2813 NEXT;
2814
2815 if (CUR_PTR < BASE_PTR)
2816 return(ret);
2817 startPosition = CUR_PTR - BASE_PTR;
2818
2819 while ((CUR != 0) && (CUR != quote)) {
2820 /* TODO: Handle UTF-8 */
2821 if (!IS_CHAR_CH(CUR)) {
2822 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2823 "Invalid char in SystemLiteral 0x%X\n", CUR);
2824 err = 1;
2825 }
2826 NEXT;
2827 len++;
2828 }
2829 if (CUR != quote) {
2830 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2831 "Unfinished SystemLiteral\n", NULL, NULL);
2832 } else {
2833 NEXT;
2834 if (err == 0)
2835 ret = xmlStrndup((BASE_PTR+startPosition), len);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002836 }
2837
2838 return(ret);
2839}
2840
2841/**
2842 * htmlParsePubidLiteral:
2843 * @ctxt: an HTML parser context
2844 *
2845 * parse an HTML public literal
2846 *
2847 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2848 *
2849 * Returns the PubidLiteral parsed or NULL.
2850 */
2851
2852static xmlChar *
2853htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2854 size_t len = 0, startPosition = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002855 int err = 0;
2856 int quote;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002857 xmlChar *ret = NULL;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002858
2859 if ((CUR != '"') && (CUR != '\'')) {
2860 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2861 "PubidLiteral \" or ' expected\n", NULL, NULL);
2862 return(NULL);
2863 }
2864 quote = CUR;
2865 NEXT;
2866
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002867 /*
2868 * Name ::= (Letter | '_') (NameChar)*
2869 */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002870 if (CUR_PTR < BASE_PTR)
2871 return(ret);
2872 startPosition = CUR_PTR - BASE_PTR;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002873
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002874 while ((CUR != 0) && (CUR != quote)) {
2875 if (!IS_PUBIDCHAR_CH(CUR)) {
2876 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2877 "Invalid char in PubidLiteral 0x%X\n", CUR);
2878 err = 1;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002879 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002880 len++;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002881 NEXT;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002882 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002883
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002884 if (CUR != '"') {
2885 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2886 "Unfinished PubidLiteral\n", NULL, NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002887 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002888 NEXT;
2889 if (err == 0)
2890 ret = xmlStrndup((BASE_PTR + startPosition), len);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002891 }
2892
2893 return(ret);
2894}
2895
2896/**
2897 * htmlParseScript:
2898 * @ctxt: an HTML parser context
2899 *
2900 * parse the content of an HTML SCRIPT or STYLE element
2901 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2902 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2903 * http://www.w3.org/TR/html4/types.html#type-script
2904 * http://www.w3.org/TR/html4/types.html#h-6.15
2905 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2906 *
2907 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2908 * element and the value of intrinsic event attributes. User agents must
2909 * not evaluate script data as HTML markup but instead must pass it on as
2910 * data to a script engine.
2911 * NOTES:
2912 * - The content is passed like CDATA
2913 * - the attributes for style and scripting "onXXX" are also described
2914 * as CDATA but SGML allows entities references in attributes so their
2915 * processing is identical as other attributes
2916 */
2917static void
2918htmlParseScript(htmlParserCtxtPtr ctxt) {
2919 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2920 int nbchar = 0;
2921 int cur,l;
2922
2923 SHRINK;
2924 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002925 while (cur != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002926 if ((cur == '<') && (NXT(1) == '/')) {
2927 /*
2928 * One should break here, the specification is clear:
2929 * Authors should therefore escape "</" within the content.
2930 * Escape mechanisms are specific to each scripting or
2931 * style sheet language.
2932 *
2933 * In recovery mode, only break if end tag match the
2934 * current tag, effectively ignoring all tags inside the
2935 * script/style block and treating the entire block as
2936 * CDATA.
2937 */
2938 if (ctxt->recovery) {
2939 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2940 xmlStrlen(ctxt->name)) == 0)
2941 {
2942 break; /* while */
2943 } else {
2944 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2945 "Element %s embeds close tag\n",
2946 ctxt->name, NULL);
2947 }
2948 } else {
2949 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2950 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2951 {
2952 break; /* while */
2953 }
2954 }
2955 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002956 if (IS_CHAR(cur)) {
2957 COPY_BUF(l,buf,nbchar,cur);
2958 } else {
2959 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2960 "Invalid char in CDATA 0x%X\n", cur);
2961 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002962 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002963 buf[nbchar] = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002964 if (ctxt->sax->cdataBlock!= NULL) {
2965 /*
2966 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2967 */
2968 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2969 } else if (ctxt->sax->characters != NULL) {
2970 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2971 }
2972 nbchar = 0;
2973 }
2974 GROW;
2975 NEXTL(l);
2976 cur = CUR_CHAR(l);
2977 }
2978
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002979 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002980 buf[nbchar] = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002981 if (ctxt->sax->cdataBlock!= NULL) {
2982 /*
2983 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2984 */
2985 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2986 } else if (ctxt->sax->characters != NULL) {
2987 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2988 }
2989 }
2990}
2991
2992
2993/**
2994 * htmlParseCharDataInternal:
2995 * @ctxt: an HTML parser context
2996 * @readahead: optional read ahead character in ascii range
2997 *
2998 * parse a CharData section.
2999 * if we are within a CDATA section ']]>' marks an end of section.
3000 *
3001 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3002 */
3003
3004static void
3005htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3006 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3007 int nbchar = 0;
3008 int cur, l;
3009 int chunk = 0;
3010
3011 if (readahead)
3012 buf[nbchar++] = readahead;
3013
3014 SHRINK;
3015 cur = CUR_CHAR(l);
3016 while (((cur != '<') || (ctxt->token == '<')) &&
3017 ((cur != '&') || (ctxt->token == '&')) &&
3018 (cur != 0)) {
3019 if (!(IS_CHAR(cur))) {
3020 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3021 "Invalid char in CDATA 0x%X\n", cur);
3022 } else {
3023 COPY_BUF(l,buf,nbchar,cur);
3024 }
3025 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003026 buf[nbchar] = 0;
3027
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003028 /*
3029 * Ok the segment is to be consumed as chars.
3030 */
3031 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3032 if (areBlanks(ctxt, buf, nbchar)) {
3033 if (ctxt->keepBlanks) {
3034 if (ctxt->sax->characters != NULL)
3035 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3036 } else {
3037 if (ctxt->sax->ignorableWhitespace != NULL)
3038 ctxt->sax->ignorableWhitespace(ctxt->userData,
3039 buf, nbchar);
3040 }
3041 } else {
3042 htmlCheckParagraph(ctxt);
3043 if (ctxt->sax->characters != NULL)
3044 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3045 }
3046 }
3047 nbchar = 0;
3048 }
3049 NEXTL(l);
3050 chunk++;
3051 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3052 chunk = 0;
3053 SHRINK;
3054 GROW;
3055 }
3056 cur = CUR_CHAR(l);
3057 if (cur == 0) {
3058 SHRINK;
3059 GROW;
3060 cur = CUR_CHAR(l);
3061 }
3062 }
3063 if (nbchar != 0) {
3064 buf[nbchar] = 0;
3065
3066 /*
3067 * Ok the segment is to be consumed as chars.
3068 */
3069 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3070 if (areBlanks(ctxt, buf, nbchar)) {
3071 if (ctxt->keepBlanks) {
3072 if (ctxt->sax->characters != NULL)
3073 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3074 } else {
3075 if (ctxt->sax->ignorableWhitespace != NULL)
3076 ctxt->sax->ignorableWhitespace(ctxt->userData,
3077 buf, nbchar);
3078 }
3079 } else {
3080 htmlCheckParagraph(ctxt);
3081 if (ctxt->sax->characters != NULL)
3082 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3083 }
3084 }
3085 } else {
3086 /*
3087 * Loop detection
3088 */
3089 if (cur == 0)
3090 ctxt->instate = XML_PARSER_EOF;
3091 }
3092}
3093
3094/**
3095 * htmlParseCharData:
3096 * @ctxt: an HTML parser context
3097 *
3098 * parse a CharData section.
3099 * if we are within a CDATA section ']]>' marks an end of section.
3100 *
3101 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3102 */
3103
3104static void
3105htmlParseCharData(htmlParserCtxtPtr ctxt) {
3106 htmlParseCharDataInternal(ctxt, 0);
3107}
3108
3109/**
3110 * htmlParseExternalID:
3111 * @ctxt: an HTML parser context
3112 * @publicID: a xmlChar** receiving PubidLiteral
3113 *
3114 * Parse an External ID or a Public ID
3115 *
3116 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3117 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3118 *
3119 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3120 *
3121 * Returns the function returns SystemLiteral and in the second
3122 * case publicID receives PubidLiteral, is strict is off
3123 * it is possible to return NULL and have publicID set.
3124 */
3125
3126static xmlChar *
3127htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3128 xmlChar *URI = NULL;
3129
3130 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3131 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3132 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3133 SKIP(6);
3134 if (!IS_BLANK_CH(CUR)) {
3135 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3136 "Space required after 'SYSTEM'\n", NULL, NULL);
3137 }
3138 SKIP_BLANKS;
3139 URI = htmlParseSystemLiteral(ctxt);
3140 if (URI == NULL) {
3141 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3142 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3143 }
3144 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3145 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3146 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3147 SKIP(6);
3148 if (!IS_BLANK_CH(CUR)) {
3149 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3150 "Space required after 'PUBLIC'\n", NULL, NULL);
3151 }
3152 SKIP_BLANKS;
3153 *publicID = htmlParsePubidLiteral(ctxt);
3154 if (*publicID == NULL) {
3155 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3156 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3157 NULL, NULL);
3158 }
3159 SKIP_BLANKS;
3160 if ((CUR == '"') || (CUR == '\'')) {
3161 URI = htmlParseSystemLiteral(ctxt);
3162 }
3163 }
3164 return(URI);
3165}
3166
3167/**
3168 * xmlParsePI:
3169 * @ctxt: an XML parser context
3170 *
3171 * parse an XML Processing Instruction.
3172 *
3173 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3174 */
3175static void
3176htmlParsePI(htmlParserCtxtPtr ctxt) {
3177 xmlChar *buf = NULL;
3178 int len = 0;
3179 int size = HTML_PARSER_BUFFER_SIZE;
3180 int cur, l;
3181 const xmlChar *target;
3182 xmlParserInputState state;
3183 int count = 0;
3184
3185 if ((RAW == '<') && (NXT(1) == '?')) {
3186 state = ctxt->instate;
3187 ctxt->instate = XML_PARSER_PI;
3188 /*
3189 * this is a Processing Instruction.
3190 */
3191 SKIP(2);
3192 SHRINK;
3193
3194 /*
3195 * Parse the target name and check for special support like
3196 * namespace.
3197 */
3198 target = htmlParseName(ctxt);
3199 if (target != NULL) {
3200 if (RAW == '>') {
3201 SKIP(1);
3202
3203 /*
3204 * SAX: PI detected.
3205 */
3206 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3207 (ctxt->sax->processingInstruction != NULL))
3208 ctxt->sax->processingInstruction(ctxt->userData,
3209 target, NULL);
3210 ctxt->instate = state;
3211 return;
3212 }
3213 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3214 if (buf == NULL) {
3215 htmlErrMemory(ctxt, NULL);
3216 ctxt->instate = state;
3217 return;
3218 }
3219 cur = CUR;
3220 if (!IS_BLANK(cur)) {
3221 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3222 "ParsePI: PI %s space expected\n", target, NULL);
3223 }
3224 SKIP_BLANKS;
3225 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003226 while ((cur != 0) && (cur != '>')) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003227 if (len + 5 >= size) {
3228 xmlChar *tmp;
3229
3230 size *= 2;
3231 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3232 if (tmp == NULL) {
3233 htmlErrMemory(ctxt, NULL);
3234 xmlFree(buf);
3235 ctxt->instate = state;
3236 return;
3237 }
3238 buf = tmp;
3239 }
3240 count++;
3241 if (count > 50) {
3242 GROW;
3243 count = 0;
3244 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003245 if (IS_CHAR(cur)) {
3246 COPY_BUF(l,buf,len,cur);
3247 } else {
3248 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3249 "Invalid char in processing instruction "
3250 "0x%X\n", cur);
3251 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003252 NEXTL(l);
3253 cur = CUR_CHAR(l);
3254 if (cur == 0) {
3255 SHRINK;
3256 GROW;
3257 cur = CUR_CHAR(l);
3258 }
3259 }
3260 buf[len] = 0;
3261 if (cur != '>') {
3262 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3263 "ParsePI: PI %s never end ...\n", target, NULL);
3264 } else {
3265 SKIP(1);
3266
3267 /*
3268 * SAX: PI detected.
3269 */
3270 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3271 (ctxt->sax->processingInstruction != NULL))
3272 ctxt->sax->processingInstruction(ctxt->userData,
3273 target, buf);
3274 }
3275 xmlFree(buf);
3276 } else {
3277 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3278 "PI is not started correctly", NULL, NULL);
3279 }
3280 ctxt->instate = state;
3281 }
3282}
3283
3284/**
3285 * htmlParseComment:
3286 * @ctxt: an HTML parser context
3287 *
3288 * Parse an XML (SGML) comment <!-- .... -->
3289 *
3290 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3291 */
3292static void
3293htmlParseComment(htmlParserCtxtPtr ctxt) {
3294 xmlChar *buf = NULL;
3295 int len;
3296 int size = HTML_PARSER_BUFFER_SIZE;
3297 int q, ql;
3298 int r, rl;
3299 int cur, l;
3300 xmlParserInputState state;
3301
3302 /*
3303 * Check that there is a comment right here.
3304 */
3305 if ((RAW != '<') || (NXT(1) != '!') ||
3306 (NXT(2) != '-') || (NXT(3) != '-')) return;
3307
3308 state = ctxt->instate;
3309 ctxt->instate = XML_PARSER_COMMENT;
3310 SHRINK;
3311 SKIP(4);
3312 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3313 if (buf == NULL) {
3314 htmlErrMemory(ctxt, "buffer allocation failed\n");
3315 ctxt->instate = state;
3316 return;
3317 }
3318 len = 0;
3319 buf[len] = 0;
3320 q = CUR_CHAR(ql);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003321 if (q == 0)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003322 goto unfinished;
3323 NEXTL(ql);
3324 r = CUR_CHAR(rl);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003325 if (r == 0)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003326 goto unfinished;
3327 NEXTL(rl);
3328 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003329 while ((cur != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003330 ((cur != '>') ||
3331 (r != '-') || (q != '-'))) {
3332 if (len + 5 >= size) {
3333 xmlChar *tmp;
3334
3335 size *= 2;
3336 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3337 if (tmp == NULL) {
3338 xmlFree(buf);
3339 htmlErrMemory(ctxt, "growing buffer failed\n");
3340 ctxt->instate = state;
3341 return;
3342 }
3343 buf = tmp;
3344 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003345 if (IS_CHAR(q)) {
3346 COPY_BUF(ql,buf,len,q);
3347 } else {
3348 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3349 "Invalid char in comment 0x%X\n", q);
3350 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003351 q = r;
3352 ql = rl;
3353 r = cur;
3354 rl = l;
3355 NEXTL(l);
3356 cur = CUR_CHAR(l);
3357 if (cur == 0) {
3358 SHRINK;
3359 GROW;
3360 cur = CUR_CHAR(l);
3361 }
3362 }
3363 buf[len] = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003364 if (cur == '>') {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003365 NEXT;
3366 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3367 (!ctxt->disableSAX))
3368 ctxt->sax->comment(ctxt->userData, buf);
3369 xmlFree(buf);
3370 ctxt->instate = state;
3371 return;
3372 }
3373
3374unfinished:
3375 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3376 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3377 xmlFree(buf);
3378}
3379
3380/**
3381 * htmlParseCharRef:
3382 * @ctxt: an HTML parser context
3383 *
3384 * parse Reference declarations
3385 *
3386 * [66] CharRef ::= '&#' [0-9]+ ';' |
3387 * '&#x' [0-9a-fA-F]+ ';'
3388 *
3389 * Returns the value parsed (as an int)
3390 */
3391int
3392htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3393 int val = 0;
3394
3395 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3396 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3397 "htmlParseCharRef: context error\n",
3398 NULL, NULL);
3399 return(0);
3400 }
3401 if ((CUR == '&') && (NXT(1) == '#') &&
3402 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3403 SKIP(3);
3404 while (CUR != ';') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003405 if ((CUR >= '0') && (CUR <= '9')) {
3406 if (val < 0x110000)
3407 val = val * 16 + (CUR - '0');
3408 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3409 if (val < 0x110000)
3410 val = val * 16 + (CUR - 'a') + 10;
3411 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3412 if (val < 0x110000)
3413 val = val * 16 + (CUR - 'A') + 10;
3414 } else {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003415 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3416 "htmlParseCharRef: missing semicolon\n",
3417 NULL, NULL);
3418 break;
3419 }
3420 NEXT;
3421 }
3422 if (CUR == ';')
3423 NEXT;
3424 } else if ((CUR == '&') && (NXT(1) == '#')) {
3425 SKIP(2);
3426 while (CUR != ';') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003427 if ((CUR >= '0') && (CUR <= '9')) {
3428 if (val < 0x110000)
3429 val = val * 10 + (CUR - '0');
3430 } else {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003431 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3432 "htmlParseCharRef: missing semicolon\n",
3433 NULL, NULL);
3434 break;
3435 }
3436 NEXT;
3437 }
3438 if (CUR == ';')
3439 NEXT;
3440 } else {
3441 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3442 "htmlParseCharRef: invalid value\n", NULL, NULL);
3443 }
3444 /*
3445 * Check the value IS_CHAR ...
3446 */
3447 if (IS_CHAR(val)) {
3448 return(val);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003449 } else if (val >= 0x110000) {
3450 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3451 "htmlParseCharRef: value too large\n", NULL, NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003452 } else {
3453 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3454 "htmlParseCharRef: invalid xmlChar value %d\n",
3455 val);
3456 }
3457 return(0);
3458}
3459
3460
3461/**
3462 * htmlParseDocTypeDecl:
3463 * @ctxt: an HTML parser context
3464 *
3465 * parse a DOCTYPE declaration
3466 *
3467 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3468 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3469 */
3470
3471static void
3472htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3473 const xmlChar *name;
3474 xmlChar *ExternalID = NULL;
3475 xmlChar *URI = NULL;
3476
3477 /*
3478 * We know that '<!DOCTYPE' has been detected.
3479 */
3480 SKIP(9);
3481
3482 SKIP_BLANKS;
3483
3484 /*
3485 * Parse the DOCTYPE name.
3486 */
3487 name = htmlParseName(ctxt);
3488 if (name == NULL) {
3489 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3490 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3491 NULL, NULL);
3492 }
3493 /*
3494 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3495 */
3496
3497 SKIP_BLANKS;
3498
3499 /*
3500 * Check for SystemID and ExternalID
3501 */
3502 URI = htmlParseExternalID(ctxt, &ExternalID);
3503 SKIP_BLANKS;
3504
3505 /*
3506 * We should be at the end of the DOCTYPE declaration.
3507 */
3508 if (CUR != '>') {
3509 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3510 "DOCTYPE improperly terminated\n", NULL, NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003511 /* Ignore bogus content */
3512 while ((CUR != 0) && (CUR != '>'))
3513 NEXT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003514 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003515 if (CUR == '>')
3516 NEXT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003517
3518 /*
3519 * Create or update the document accordingly to the DOCTYPE
3520 */
3521 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3522 (!ctxt->disableSAX))
3523 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3524
3525 /*
3526 * Cleanup, since we don't use all those identifiers
3527 */
3528 if (URI != NULL) xmlFree(URI);
3529 if (ExternalID != NULL) xmlFree(ExternalID);
3530}
3531
3532/**
3533 * htmlParseAttribute:
3534 * @ctxt: an HTML parser context
3535 * @value: a xmlChar ** used to store the value of the attribute
3536 *
3537 * parse an attribute
3538 *
3539 * [41] Attribute ::= Name Eq AttValue
3540 *
3541 * [25] Eq ::= S? '=' S?
3542 *
3543 * With namespace:
3544 *
3545 * [NS 11] Attribute ::= QName Eq AttValue
3546 *
3547 * Also the case QName == xmlns:??? is handled independently as a namespace
3548 * definition.
3549 *
3550 * Returns the attribute name, and the value in *value.
3551 */
3552
3553static const xmlChar *
3554htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3555 const xmlChar *name;
3556 xmlChar *val = NULL;
3557
3558 *value = NULL;
3559 name = htmlParseHTMLName(ctxt);
3560 if (name == NULL) {
3561 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3562 "error parsing attribute name\n", NULL, NULL);
3563 return(NULL);
3564 }
3565
3566 /*
3567 * read the value
3568 */
3569 SKIP_BLANKS;
3570 if (CUR == '=') {
3571 NEXT;
3572 SKIP_BLANKS;
3573 val = htmlParseAttValue(ctxt);
3574 }
3575
3576 *value = val;
3577 return(name);
3578}
3579
3580/**
3581 * htmlCheckEncodingDirect:
3582 * @ctxt: an HTML parser context
3583 * @attvalue: the attribute value
3584 *
3585 * Checks an attribute value to detect
3586 * the encoding
3587 * If a new encoding is detected the parser is switched to decode
3588 * it and pass UTF8
3589 */
3590static void
3591htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3592
3593 if ((ctxt == NULL) || (encoding == NULL) ||
3594 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3595 return;
3596
3597 /* do not change encoding */
3598 if (ctxt->input->encoding != NULL)
3599 return;
3600
3601 if (encoding != NULL) {
3602 xmlCharEncoding enc;
3603 xmlCharEncodingHandlerPtr handler;
3604
3605 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3606
3607 if (ctxt->input->encoding != NULL)
3608 xmlFree((xmlChar *) ctxt->input->encoding);
3609 ctxt->input->encoding = xmlStrdup(encoding);
3610
3611 enc = xmlParseCharEncoding((const char *) encoding);
3612 /*
3613 * registered set of known encodings
3614 */
3615 if (enc != XML_CHAR_ENCODING_ERROR) {
3616 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3617 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3618 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3619 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3620 (ctxt->input->buf != NULL) &&
3621 (ctxt->input->buf->encoder == NULL)) {
3622 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3623 "htmlCheckEncoding: wrong encoding meta\n",
3624 NULL, NULL);
3625 } else {
3626 xmlSwitchEncoding(ctxt, enc);
3627 }
3628 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3629 } else {
3630 /*
3631 * fallback for unknown encodings
3632 */
3633 handler = xmlFindCharEncodingHandler((const char *) encoding);
3634 if (handler != NULL) {
3635 xmlSwitchToEncoding(ctxt, handler);
3636 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3637 } else {
3638 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3639 "htmlCheckEncoding: unknown encoding %s\n",
3640 encoding, NULL);
3641 }
3642 }
3643
3644 if ((ctxt->input->buf != NULL) &&
3645 (ctxt->input->buf->encoder != NULL) &&
3646 (ctxt->input->buf->raw != NULL) &&
3647 (ctxt->input->buf->buffer != NULL)) {
3648 int nbchars;
3649 int processed;
3650
3651 /*
3652 * convert as much as possible to the parser reading buffer.
3653 */
3654 processed = ctxt->input->cur - ctxt->input->base;
3655 xmlBufShrink(ctxt->input->buf->buffer, processed);
3656 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3657 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3658 if (nbchars < 0) {
3659 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3660 "htmlCheckEncoding: encoder error\n",
3661 NULL, NULL);
3662 }
3663 }
3664 }
3665}
3666
3667/**
3668 * htmlCheckEncoding:
3669 * @ctxt: an HTML parser context
3670 * @attvalue: the attribute value
3671 *
3672 * Checks an http-equiv attribute from a Meta tag to detect
3673 * the encoding
3674 * If a new encoding is detected the parser is switched to decode
3675 * it and pass UTF8
3676 */
3677static void
3678htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3679 const xmlChar *encoding;
3680
3681 if (!attvalue)
3682 return;
3683
3684 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3685 if (encoding != NULL) {
3686 encoding += 7;
3687 }
3688 /*
3689 * skip blank
3690 */
3691 if (encoding && IS_BLANK_CH(*encoding))
3692 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3693 if (encoding && *encoding == '=') {
3694 encoding ++;
3695 htmlCheckEncodingDirect(ctxt, encoding);
3696 }
3697}
3698
3699/**
3700 * htmlCheckMeta:
3701 * @ctxt: an HTML parser context
3702 * @atts: the attributes values
3703 *
3704 * Checks an attributes from a Meta tag
3705 */
3706static void
3707htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3708 int i;
3709 const xmlChar *att, *value;
3710 int http = 0;
3711 const xmlChar *content = NULL;
3712
3713 if ((ctxt == NULL) || (atts == NULL))
3714 return;
3715
3716 i = 0;
3717 att = atts[i++];
3718 while (att != NULL) {
3719 value = atts[i++];
3720 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3721 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3722 http = 1;
3723 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3724 htmlCheckEncodingDirect(ctxt, value);
3725 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3726 content = value;
3727 att = atts[i++];
3728 }
3729 if ((http) && (content != NULL))
3730 htmlCheckEncoding(ctxt, content);
3731
3732}
3733
3734/**
3735 * htmlParseStartTag:
3736 * @ctxt: an HTML parser context
3737 *
3738 * parse a start of tag either for rule element or
3739 * EmptyElement. In both case we don't parse the tag closing chars.
3740 *
3741 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3742 *
3743 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3744 *
3745 * With namespace:
3746 *
3747 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3748 *
3749 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3750 *
3751 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3752 */
3753
3754static int
3755htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3756 const xmlChar *name;
3757 const xmlChar *attname;
3758 xmlChar *attvalue;
3759 const xmlChar **atts;
3760 int nbatts = 0;
3761 int maxatts;
3762 int meta = 0;
3763 int i;
3764 int discardtag = 0;
3765
3766 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3767 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3768 "htmlParseStartTag: context error\n", NULL, NULL);
3769 return -1;
3770 }
3771 if (ctxt->instate == XML_PARSER_EOF)
3772 return(-1);
3773 if (CUR != '<') return -1;
3774 NEXT;
3775
3776 atts = ctxt->atts;
3777 maxatts = ctxt->maxatts;
3778
3779 GROW;
3780 name = htmlParseHTMLName(ctxt);
3781 if (name == NULL) {
3782 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3783 "htmlParseStartTag: invalid element name\n",
3784 NULL, NULL);
3785 /* if recover preserve text on classic misconstructs */
3786 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3787 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3788 htmlParseCharDataInternal(ctxt, '<');
3789 return(-1);
3790 }
3791
3792
3793 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003794 while ((CUR != 0) && (CUR != '>') &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003795 (ctxt->instate != XML_PARSER_EOF))
3796 NEXT;
3797 return -1;
3798 }
3799 if (xmlStrEqual(name, BAD_CAST"meta"))
3800 meta = 1;
3801
3802 /*
3803 * Check for auto-closure of HTML elements.
3804 */
3805 htmlAutoClose(ctxt, name);
3806
3807 /*
3808 * Check for implied HTML elements.
3809 */
3810 htmlCheckImplied(ctxt, name);
3811
3812 /*
3813 * Avoid html at any level > 0, head at any level != 1
3814 * or any attempt to recurse body
3815 */
3816 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3817 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3818 "htmlParseStartTag: misplaced <html> tag\n",
3819 name, NULL);
3820 discardtag = 1;
3821 ctxt->depth++;
3822 }
3823 if ((ctxt->nameNr != 1) &&
3824 (xmlStrEqual(name, BAD_CAST"head"))) {
3825 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3826 "htmlParseStartTag: misplaced <head> tag\n",
3827 name, NULL);
3828 discardtag = 1;
3829 ctxt->depth++;
3830 }
3831 if (xmlStrEqual(name, BAD_CAST"body")) {
3832 int indx;
3833 for (indx = 0;indx < ctxt->nameNr;indx++) {
3834 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3835 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3836 "htmlParseStartTag: misplaced <body> tag\n",
3837 name, NULL);
3838 discardtag = 1;
3839 ctxt->depth++;
3840 }
3841 }
3842 }
3843
3844 /*
3845 * Now parse the attributes, it ends up with the ending
3846 *
3847 * (S Attribute)* S?
3848 */
3849 SKIP_BLANKS;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003850 while ((CUR != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003851 (CUR != '>') &&
3852 ((CUR != '/') || (NXT(1) != '>'))) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003853 GROW;
3854 attname = htmlParseAttribute(ctxt, &attvalue);
3855 if (attname != NULL) {
3856
3857 /*
3858 * Well formedness requires at most one declaration of an attribute
3859 */
3860 for (i = 0; i < nbatts;i += 2) {
3861 if (xmlStrEqual(atts[i], attname)) {
3862 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3863 "Attribute %s redefined\n", attname, NULL);
3864 if (attvalue != NULL)
3865 xmlFree(attvalue);
3866 goto failed;
3867 }
3868 }
3869
3870 /*
3871 * Add the pair to atts
3872 */
3873 if (atts == NULL) {
3874 maxatts = 22; /* allow for 10 attrs by default */
3875 atts = (const xmlChar **)
3876 xmlMalloc(maxatts * sizeof(xmlChar *));
3877 if (atts == NULL) {
3878 htmlErrMemory(ctxt, NULL);
3879 if (attvalue != NULL)
3880 xmlFree(attvalue);
3881 goto failed;
3882 }
3883 ctxt->atts = atts;
3884 ctxt->maxatts = maxatts;
3885 } else if (nbatts + 4 > maxatts) {
3886 const xmlChar **n;
3887
3888 maxatts *= 2;
3889 n = (const xmlChar **) xmlRealloc((void *) atts,
3890 maxatts * sizeof(const xmlChar *));
3891 if (n == NULL) {
3892 htmlErrMemory(ctxt, NULL);
3893 if (attvalue != NULL)
3894 xmlFree(attvalue);
3895 goto failed;
3896 }
3897 atts = n;
3898 ctxt->atts = atts;
3899 ctxt->maxatts = maxatts;
3900 }
3901 atts[nbatts++] = attname;
3902 atts[nbatts++] = attvalue;
3903 atts[nbatts] = NULL;
3904 atts[nbatts + 1] = NULL;
3905 }
3906 else {
3907 if (attvalue != NULL)
3908 xmlFree(attvalue);
3909 /* Dump the bogus attribute string up to the next blank or
3910 * the end of the tag. */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003911 while ((CUR != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003912 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3913 ((CUR != '/') || (NXT(1) != '>')))
3914 NEXT;
3915 }
3916
3917failed:
3918 SKIP_BLANKS;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003919 }
3920
3921 /*
3922 * Handle specific association to the META tag
3923 */
3924 if (meta && (nbatts != 0))
3925 htmlCheckMeta(ctxt, atts);
3926
3927 /*
3928 * SAX: Start of Element !
3929 */
3930 if (!discardtag) {
3931 htmlnamePush(ctxt, name);
3932 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3933 if (nbatts != 0)
3934 ctxt->sax->startElement(ctxt->userData, name, atts);
3935 else
3936 ctxt->sax->startElement(ctxt->userData, name, NULL);
3937 }
3938 }
3939
3940 if (atts != NULL) {
3941 for (i = 1;i < nbatts;i += 2) {
3942 if (atts[i] != NULL)
3943 xmlFree((xmlChar *) atts[i]);
3944 }
3945 }
3946
3947 return(discardtag);
3948}
3949
3950/**
3951 * htmlParseEndTag:
3952 * @ctxt: an HTML parser context
3953 *
3954 * parse an end of tag
3955 *
3956 * [42] ETag ::= '</' Name S? '>'
3957 *
3958 * With namespace
3959 *
3960 * [NS 9] ETag ::= '</' QName S? '>'
3961 *
3962 * Returns 1 if the current level should be closed.
3963 */
3964
3965static int
3966htmlParseEndTag(htmlParserCtxtPtr ctxt)
3967{
3968 const xmlChar *name;
3969 const xmlChar *oldname;
3970 int i, ret;
3971
3972 if ((CUR != '<') || (NXT(1) != '/')) {
3973 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3974 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3975 return (0);
3976 }
3977 SKIP(2);
3978
3979 name = htmlParseHTMLName(ctxt);
3980 if (name == NULL)
3981 return (0);
3982 /*
3983 * We should definitely be at the ending "S? '>'" part
3984 */
3985 SKIP_BLANKS;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003986 if (CUR != '>') {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003987 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3988 "End tag : expected '>'\n", NULL, NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003989 /* Skip to next '>' */
3990 while ((CUR != 0) && (CUR != '>'))
3991 NEXT;
3992 }
3993 if (CUR == '>')
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003994 NEXT;
3995
3996 /*
3997 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3998 * out now.
3999 */
4000 if ((ctxt->depth > 0) &&
4001 (xmlStrEqual(name, BAD_CAST "html") ||
4002 xmlStrEqual(name, BAD_CAST "body") ||
4003 xmlStrEqual(name, BAD_CAST "head"))) {
4004 ctxt->depth--;
4005 return (0);
4006 }
4007
4008 /*
4009 * If the name read is not one of the element in the parsing stack
4010 * then return, it's just an error.
4011 */
4012 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4013 if (xmlStrEqual(name, ctxt->nameTab[i]))
4014 break;
4015 }
4016 if (i < 0) {
4017 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4018 "Unexpected end tag : %s\n", name, NULL);
4019 return (0);
4020 }
4021
4022
4023 /*
4024 * Check for auto-closure of HTML elements.
4025 */
4026
4027 htmlAutoCloseOnClose(ctxt, name);
4028
4029 /*
4030 * Well formedness constraints, opening and closing must match.
4031 * With the exception that the autoclose may have popped stuff out
4032 * of the stack.
4033 */
4034 if (!xmlStrEqual(name, ctxt->name)) {
4035 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4036 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4037 "Opening and ending tag mismatch: %s and %s\n",
4038 name, ctxt->name);
4039 }
4040 }
4041
4042 /*
4043 * SAX: End of Tag
4044 */
4045 oldname = ctxt->name;
4046 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4047 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4048 ctxt->sax->endElement(ctxt->userData, name);
4049 htmlNodeInfoPop(ctxt);
4050 htmlnamePop(ctxt);
4051 ret = 1;
4052 } else {
4053 ret = 0;
4054 }
4055
4056 return (ret);
4057}
4058
4059
4060/**
4061 * htmlParseReference:
4062 * @ctxt: an HTML parser context
4063 *
4064 * parse and handle entity references in content,
4065 * this will end-up in a call to character() since this is either a
4066 * CharRef, or a predefined entity.
4067 */
4068static void
4069htmlParseReference(htmlParserCtxtPtr ctxt) {
4070 const htmlEntityDesc * ent;
4071 xmlChar out[6];
4072 const xmlChar *name;
4073 if (CUR != '&') return;
4074
4075 if (NXT(1) == '#') {
4076 unsigned int c;
4077 int bits, i = 0;
4078
4079 c = htmlParseCharRef(ctxt);
4080 if (c == 0)
4081 return;
4082
4083 if (c < 0x80) { out[i++]= c; bits= -6; }
4084 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4085 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4086 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4087
4088 for ( ; bits >= 0; bits-= 6) {
4089 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4090 }
4091 out[i] = 0;
4092
4093 htmlCheckParagraph(ctxt);
4094 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4095 ctxt->sax->characters(ctxt->userData, out, i);
4096 } else {
4097 ent = htmlParseEntityRef(ctxt, &name);
4098 if (name == NULL) {
4099 htmlCheckParagraph(ctxt);
4100 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4101 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4102 return;
4103 }
4104 if ((ent == NULL) || !(ent->value > 0)) {
4105 htmlCheckParagraph(ctxt);
4106 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4107 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4108 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4109 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4110 }
4111 } else {
4112 unsigned int c;
4113 int bits, i = 0;
4114
4115 c = ent->value;
4116 if (c < 0x80)
4117 { out[i++]= c; bits= -6; }
4118 else if (c < 0x800)
4119 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4120 else if (c < 0x10000)
4121 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4122 else
4123 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4124
4125 for ( ; bits >= 0; bits-= 6) {
4126 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4127 }
4128 out[i] = 0;
4129
4130 htmlCheckParagraph(ctxt);
4131 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4132 ctxt->sax->characters(ctxt->userData, out, i);
4133 }
4134 }
4135}
4136
4137/**
4138 * htmlParseContent:
4139 * @ctxt: an HTML parser context
4140 *
4141 * Parse a content: comment, sub-element, reference or text.
4142 * Kept for compatibility with old code
4143 */
4144
4145static void
4146htmlParseContent(htmlParserCtxtPtr ctxt) {
4147 xmlChar *currentNode;
4148 int depth;
4149 const xmlChar *name;
4150
4151 currentNode = xmlStrdup(ctxt->name);
4152 depth = ctxt->nameNr;
4153 while (1) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004154 GROW;
4155
4156 if (ctxt->instate == XML_PARSER_EOF)
4157 break;
4158
4159 /*
4160 * Our tag or one of it's parent or children is ending.
4161 */
4162 if ((CUR == '<') && (NXT(1) == '/')) {
4163 if (htmlParseEndTag(ctxt) &&
4164 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4165 if (currentNode != NULL)
4166 xmlFree(currentNode);
4167 return;
4168 }
4169 continue; /* while */
4170 }
4171
4172 else if ((CUR == '<') &&
4173 ((IS_ASCII_LETTER(NXT(1))) ||
4174 (NXT(1) == '_') || (NXT(1) == ':'))) {
4175 name = htmlParseHTMLName_nonInvasive(ctxt);
4176 if (name == NULL) {
4177 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4178 "htmlParseStartTag: invalid element name\n",
4179 NULL, NULL);
4180 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004181 while ((CUR != 0) && (CUR != '>'))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004182 NEXT;
4183
4184 if (currentNode != NULL)
4185 xmlFree(currentNode);
4186 return;
4187 }
4188
4189 if (ctxt->name != NULL) {
4190 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4191 htmlAutoClose(ctxt, name);
4192 continue;
4193 }
4194 }
4195 }
4196
4197 /*
4198 * Has this node been popped out during parsing of
4199 * the next element
4200 */
4201 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4202 (!xmlStrEqual(currentNode, ctxt->name)))
4203 {
4204 if (currentNode != NULL) xmlFree(currentNode);
4205 return;
4206 }
4207
4208 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4209 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4210 /*
4211 * Handle SCRIPT/STYLE separately
4212 */
4213 htmlParseScript(ctxt);
4214 } else {
4215 /*
4216 * Sometimes DOCTYPE arrives in the middle of the document
4217 */
4218 if ((CUR == '<') && (NXT(1) == '!') &&
4219 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4220 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4221 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4222 (UPP(8) == 'E')) {
4223 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4224 "Misplaced DOCTYPE declaration\n",
4225 BAD_CAST "DOCTYPE" , NULL);
4226 htmlParseDocTypeDecl(ctxt);
4227 }
4228
4229 /*
4230 * First case : a comment
4231 */
4232 if ((CUR == '<') && (NXT(1) == '!') &&
4233 (NXT(2) == '-') && (NXT(3) == '-')) {
4234 htmlParseComment(ctxt);
4235 }
4236
4237 /*
4238 * Second case : a Processing Instruction.
4239 */
4240 else if ((CUR == '<') && (NXT(1) == '?')) {
4241 htmlParsePI(ctxt);
4242 }
4243
4244 /*
4245 * Third case : a sub-element.
4246 */
4247 else if (CUR == '<') {
4248 htmlParseElement(ctxt);
4249 }
4250
4251 /*
4252 * Fourth case : a reference. If if has not been resolved,
4253 * parsing returns it's Name, create the node
4254 */
4255 else if (CUR == '&') {
4256 htmlParseReference(ctxt);
4257 }
4258
4259 /*
4260 * Fifth case : end of the resource
4261 */
4262 else if (CUR == 0) {
4263 htmlAutoCloseOnEnd(ctxt);
4264 break;
4265 }
4266
4267 /*
4268 * Last case, text. Note that References are handled directly.
4269 */
4270 else {
4271 htmlParseCharData(ctxt);
4272 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004273 }
4274 GROW;
4275 }
4276 if (currentNode != NULL) xmlFree(currentNode);
4277}
4278
4279/**
4280 * htmlParseElement:
4281 * @ctxt: an HTML parser context
4282 *
4283 * parse an HTML element, this is highly recursive
4284 * this is kept for compatibility with previous code versions
4285 *
4286 * [39] element ::= EmptyElemTag | STag content ETag
4287 *
4288 * [41] Attribute ::= Name Eq AttValue
4289 */
4290
4291void
4292htmlParseElement(htmlParserCtxtPtr ctxt) {
4293 const xmlChar *name;
4294 xmlChar *currentNode = NULL;
4295 const htmlElemDesc * info;
4296 htmlParserNodeInfo node_info;
4297 int failed;
4298 int depth;
4299 const xmlChar *oldptr;
4300
4301 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4302 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4303 "htmlParseElement: context error\n", NULL, NULL);
4304 return;
4305 }
4306
4307 if (ctxt->instate == XML_PARSER_EOF)
4308 return;
4309
4310 /* Capture start position */
4311 if (ctxt->record_info) {
4312 node_info.begin_pos = ctxt->input->consumed +
4313 (CUR_PTR - ctxt->input->base);
4314 node_info.begin_line = ctxt->input->line;
4315 }
4316
4317 failed = htmlParseStartTag(ctxt);
4318 name = ctxt->name;
4319 if ((failed == -1) || (name == NULL)) {
4320 if (CUR == '>')
4321 NEXT;
4322 return;
4323 }
4324
4325 /*
4326 * Lookup the info for that element.
4327 */
4328 info = htmlTagLookup(name);
4329 if (info == NULL) {
4330 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4331 "Tag %s invalid\n", name, NULL);
4332 }
4333
4334 /*
4335 * Check for an Empty Element labeled the XML/SGML way
4336 */
4337 if ((CUR == '/') && (NXT(1) == '>')) {
4338 SKIP(2);
4339 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4340 ctxt->sax->endElement(ctxt->userData, name);
4341 htmlnamePop(ctxt);
4342 return;
4343 }
4344
4345 if (CUR == '>') {
4346 NEXT;
4347 } else {
4348 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4349 "Couldn't find end of Start Tag %s\n", name, NULL);
4350
4351 /*
4352 * end of parsing of this node.
4353 */
4354 if (xmlStrEqual(name, ctxt->name)) {
4355 nodePop(ctxt);
4356 htmlnamePop(ctxt);
4357 }
4358
4359 /*
4360 * Capture end position and add node
4361 */
4362 if (ctxt->record_info) {
4363 node_info.end_pos = ctxt->input->consumed +
4364 (CUR_PTR - ctxt->input->base);
4365 node_info.end_line = ctxt->input->line;
4366 node_info.node = ctxt->node;
4367 xmlParserAddNodeInfo(ctxt, &node_info);
4368 }
4369 return;
4370 }
4371
4372 /*
4373 * Check for an Empty Element from DTD definition
4374 */
4375 if ((info != NULL) && (info->empty)) {
4376 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4377 ctxt->sax->endElement(ctxt->userData, name);
4378 htmlnamePop(ctxt);
4379 return;
4380 }
4381
4382 /*
4383 * Parse the content of the element:
4384 */
4385 currentNode = xmlStrdup(ctxt->name);
4386 depth = ctxt->nameNr;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004387 while (CUR != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004388 oldptr = ctxt->input->cur;
4389 htmlParseContent(ctxt);
4390 if (oldptr==ctxt->input->cur) break;
4391 if (ctxt->nameNr < depth) break;
4392 }
4393
4394 /*
4395 * Capture end position and add node
4396 */
4397 if ( currentNode != NULL && ctxt->record_info ) {
4398 node_info.end_pos = ctxt->input->consumed +
4399 (CUR_PTR - ctxt->input->base);
4400 node_info.end_line = ctxt->input->line;
4401 node_info.node = ctxt->node;
4402 xmlParserAddNodeInfo(ctxt, &node_info);
4403 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004404 if (CUR == 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004405 htmlAutoCloseOnEnd(ctxt);
4406 }
4407
4408 if (currentNode != NULL)
4409 xmlFree(currentNode);
4410}
4411
4412static void
4413htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4414 /*
4415 * Capture end position and add node
4416 */
4417 if ( ctxt->node != NULL && ctxt->record_info ) {
4418 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4419 (CUR_PTR - ctxt->input->base);
4420 ctxt->nodeInfo->end_line = ctxt->input->line;
4421 ctxt->nodeInfo->node = ctxt->node;
4422 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4423 htmlNodeInfoPop(ctxt);
4424 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004425 if (CUR == 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004426 htmlAutoCloseOnEnd(ctxt);
4427 }
4428}
4429
4430/**
4431 * htmlParseElementInternal:
4432 * @ctxt: an HTML parser context
4433 *
4434 * parse an HTML element, new version, non recursive
4435 *
4436 * [39] element ::= EmptyElemTag | STag content ETag
4437 *
4438 * [41] Attribute ::= Name Eq AttValue
4439 */
4440
4441static void
4442htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4443 const xmlChar *name;
4444 const htmlElemDesc * info;
4445 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4446 int failed;
4447
4448 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4449 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4450 "htmlParseElementInternal: context error\n", NULL, NULL);
4451 return;
4452 }
4453
4454 if (ctxt->instate == XML_PARSER_EOF)
4455 return;
4456
4457 /* Capture start position */
4458 if (ctxt->record_info) {
4459 node_info.begin_pos = ctxt->input->consumed +
4460 (CUR_PTR - ctxt->input->base);
4461 node_info.begin_line = ctxt->input->line;
4462 }
4463
4464 failed = htmlParseStartTag(ctxt);
4465 name = ctxt->name;
4466 if ((failed == -1) || (name == NULL)) {
4467 if (CUR == '>')
4468 NEXT;
4469 return;
4470 }
4471
4472 /*
4473 * Lookup the info for that element.
4474 */
4475 info = htmlTagLookup(name);
4476 if (info == NULL) {
4477 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4478 "Tag %s invalid\n", name, NULL);
4479 }
4480
4481 /*
4482 * Check for an Empty Element labeled the XML/SGML way
4483 */
4484 if ((CUR == '/') && (NXT(1) == '>')) {
4485 SKIP(2);
4486 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4487 ctxt->sax->endElement(ctxt->userData, name);
4488 htmlnamePop(ctxt);
4489 return;
4490 }
4491
4492 if (CUR == '>') {
4493 NEXT;
4494 } else {
4495 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4496 "Couldn't find end of Start Tag %s\n", name, NULL);
4497
4498 /*
4499 * end of parsing of this node.
4500 */
4501 if (xmlStrEqual(name, ctxt->name)) {
4502 nodePop(ctxt);
4503 htmlnamePop(ctxt);
4504 }
4505
4506 if (ctxt->record_info)
4507 htmlNodeInfoPush(ctxt, &node_info);
4508 htmlParserFinishElementParsing(ctxt);
4509 return;
4510 }
4511
4512 /*
4513 * Check for an Empty Element from DTD definition
4514 */
4515 if ((info != NULL) && (info->empty)) {
4516 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4517 ctxt->sax->endElement(ctxt->userData, name);
4518 htmlnamePop(ctxt);
4519 return;
4520 }
4521
4522 if (ctxt->record_info)
4523 htmlNodeInfoPush(ctxt, &node_info);
4524}
4525
4526/**
4527 * htmlParseContentInternal:
4528 * @ctxt: an HTML parser context
4529 *
4530 * Parse a content: comment, sub-element, reference or text.
4531 * New version for non recursive htmlParseElementInternal
4532 */
4533
4534static void
4535htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4536 xmlChar *currentNode;
4537 int depth;
4538 const xmlChar *name;
4539
4540 currentNode = xmlStrdup(ctxt->name);
4541 depth = ctxt->nameNr;
4542 while (1) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004543 GROW;
4544
4545 if (ctxt->instate == XML_PARSER_EOF)
4546 break;
4547
4548 /*
4549 * Our tag or one of it's parent or children is ending.
4550 */
4551 if ((CUR == '<') && (NXT(1) == '/')) {
4552 if (htmlParseEndTag(ctxt) &&
4553 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4554 if (currentNode != NULL)
4555 xmlFree(currentNode);
4556
4557 currentNode = xmlStrdup(ctxt->name);
4558 depth = ctxt->nameNr;
4559 }
4560 continue; /* while */
4561 }
4562
4563 else if ((CUR == '<') &&
4564 ((IS_ASCII_LETTER(NXT(1))) ||
4565 (NXT(1) == '_') || (NXT(1) == ':'))) {
4566 name = htmlParseHTMLName_nonInvasive(ctxt);
4567 if (name == NULL) {
4568 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4569 "htmlParseStartTag: invalid element name\n",
4570 NULL, NULL);
4571 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004572 while ((CUR == 0) && (CUR != '>'))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004573 NEXT;
4574
4575 htmlParserFinishElementParsing(ctxt);
4576 if (currentNode != NULL)
4577 xmlFree(currentNode);
4578
4579 currentNode = xmlStrdup(ctxt->name);
4580 depth = ctxt->nameNr;
4581 continue;
4582 }
4583
4584 if (ctxt->name != NULL) {
4585 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4586 htmlAutoClose(ctxt, name);
4587 continue;
4588 }
4589 }
4590 }
4591
4592 /*
4593 * Has this node been popped out during parsing of
4594 * the next element
4595 */
4596 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4597 (!xmlStrEqual(currentNode, ctxt->name)))
4598 {
4599 htmlParserFinishElementParsing(ctxt);
4600 if (currentNode != NULL) xmlFree(currentNode);
4601
4602 currentNode = xmlStrdup(ctxt->name);
4603 depth = ctxt->nameNr;
4604 continue;
4605 }
4606
4607 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4608 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4609 /*
4610 * Handle SCRIPT/STYLE separately
4611 */
4612 htmlParseScript(ctxt);
4613 } else {
4614 /*
4615 * Sometimes DOCTYPE arrives in the middle of the document
4616 */
4617 if ((CUR == '<') && (NXT(1) == '!') &&
4618 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4619 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4620 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4621 (UPP(8) == 'E')) {
4622 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4623 "Misplaced DOCTYPE declaration\n",
4624 BAD_CAST "DOCTYPE" , NULL);
4625 htmlParseDocTypeDecl(ctxt);
4626 }
4627
4628 /*
4629 * First case : a comment
4630 */
4631 if ((CUR == '<') && (NXT(1) == '!') &&
4632 (NXT(2) == '-') && (NXT(3) == '-')) {
4633 htmlParseComment(ctxt);
4634 }
4635
4636 /*
4637 * Second case : a Processing Instruction.
4638 */
4639 else if ((CUR == '<') && (NXT(1) == '?')) {
4640 htmlParsePI(ctxt);
4641 }
4642
4643 /*
4644 * Third case : a sub-element.
4645 */
4646 else if (CUR == '<') {
4647 htmlParseElementInternal(ctxt);
4648 if (currentNode != NULL) xmlFree(currentNode);
4649
4650 currentNode = xmlStrdup(ctxt->name);
4651 depth = ctxt->nameNr;
4652 }
4653
4654 /*
4655 * Fourth case : a reference. If if has not been resolved,
4656 * parsing returns it's Name, create the node
4657 */
4658 else if (CUR == '&') {
4659 htmlParseReference(ctxt);
4660 }
4661
4662 /*
4663 * Fifth case : end of the resource
4664 */
4665 else if (CUR == 0) {
4666 htmlAutoCloseOnEnd(ctxt);
4667 break;
4668 }
4669
4670 /*
4671 * Last case, text. Note that References are handled directly.
4672 */
4673 else {
4674 htmlParseCharData(ctxt);
4675 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004676 }
4677 GROW;
4678 }
4679 if (currentNode != NULL) xmlFree(currentNode);
4680}
4681
4682/**
4683 * htmlParseContent:
4684 * @ctxt: an HTML parser context
4685 *
4686 * Parse a content: comment, sub-element, reference or text.
4687 * This is the entry point when called from parser.c
4688 */
4689
4690void
4691__htmlParseContent(void *ctxt) {
4692 if (ctxt != NULL)
4693 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4694}
4695
4696/**
4697 * htmlParseDocument:
4698 * @ctxt: an HTML parser context
4699 *
4700 * parse an HTML document (and build a tree if using the standard SAX
4701 * interface).
4702 *
4703 * Returns 0, -1 in case of error. the parser context is augmented
4704 * as a result of the parsing.
4705 */
4706
4707int
4708htmlParseDocument(htmlParserCtxtPtr ctxt) {
4709 xmlChar start[4];
4710 xmlCharEncoding enc;
4711 xmlDtdPtr dtd;
4712
4713 xmlInitParser();
4714
4715 htmlDefaultSAXHandlerInit();
4716
4717 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4718 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4719 "htmlParseDocument: context error\n", NULL, NULL);
4720 return(XML_ERR_INTERNAL_ERROR);
4721 }
4722 ctxt->html = 1;
4723 ctxt->linenumbers = 1;
4724 GROW;
4725 /*
4726 * SAX: beginning of the document processing.
4727 */
4728 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4729 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4730
4731 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4732 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4733 /*
4734 * Get the 4 first bytes and decode the charset
4735 * if enc != XML_CHAR_ENCODING_NONE
4736 * plug some encoding conversion routines.
4737 */
4738 start[0] = RAW;
4739 start[1] = NXT(1);
4740 start[2] = NXT(2);
4741 start[3] = NXT(3);
4742 enc = xmlDetectCharEncoding(&start[0], 4);
4743 if (enc != XML_CHAR_ENCODING_NONE) {
4744 xmlSwitchEncoding(ctxt, enc);
4745 }
4746 }
4747
4748 /*
4749 * Wipe out everything which is before the first '<'
4750 */
4751 SKIP_BLANKS;
4752 if (CUR == 0) {
4753 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4754 "Document is empty\n", NULL, NULL);
4755 }
4756
4757 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4758 ctxt->sax->startDocument(ctxt->userData);
4759
4760
4761 /*
4762 * Parse possible comments and PIs before any content
4763 */
4764 while (((CUR == '<') && (NXT(1) == '!') &&
4765 (NXT(2) == '-') && (NXT(3) == '-')) ||
4766 ((CUR == '<') && (NXT(1) == '?'))) {
4767 htmlParseComment(ctxt);
4768 htmlParsePI(ctxt);
4769 SKIP_BLANKS;
4770 }
4771
4772
4773 /*
4774 * Then possibly doc type declaration(s) and more Misc
4775 * (doctypedecl Misc*)?
4776 */
4777 if ((CUR == '<') && (NXT(1) == '!') &&
4778 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4779 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4780 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4781 (UPP(8) == 'E')) {
4782 htmlParseDocTypeDecl(ctxt);
4783 }
4784 SKIP_BLANKS;
4785
4786 /*
4787 * Parse possible comments and PIs before any content
4788 */
4789 while (((CUR == '<') && (NXT(1) == '!') &&
4790 (NXT(2) == '-') && (NXT(3) == '-')) ||
4791 ((CUR == '<') && (NXT(1) == '?'))) {
4792 htmlParseComment(ctxt);
4793 htmlParsePI(ctxt);
4794 SKIP_BLANKS;
4795 }
4796
4797 /*
4798 * Time to start parsing the tree itself
4799 */
4800 htmlParseContentInternal(ctxt);
4801
4802 /*
4803 * autoclose
4804 */
4805 if (CUR == 0)
4806 htmlAutoCloseOnEnd(ctxt);
4807
4808
4809 /*
4810 * SAX: end of the document processing.
4811 */
4812 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4813 ctxt->sax->endDocument(ctxt->userData);
4814
4815 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4816 dtd = xmlGetIntSubset(ctxt->myDoc);
4817 if (dtd == NULL)
4818 ctxt->myDoc->intSubset =
4819 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4820 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4821 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4822 }
4823 if (! ctxt->wellFormed) return(-1);
4824 return(0);
4825}
4826
4827
4828/************************************************************************
4829 * *
4830 * Parser contexts handling *
4831 * *
4832 ************************************************************************/
4833
4834/**
4835 * htmlInitParserCtxt:
4836 * @ctxt: an HTML parser context
4837 *
4838 * Initialize a parser context
4839 *
4840 * Returns 0 in case of success and -1 in case of error
4841 */
4842
4843static int
4844htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4845{
4846 htmlSAXHandler *sax;
4847
4848 if (ctxt == NULL) return(-1);
4849 memset(ctxt, 0, sizeof(htmlParserCtxt));
4850
4851 ctxt->dict = xmlDictCreate();
4852 if (ctxt->dict == NULL) {
4853 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4854 return(-1);
4855 }
4856 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4857 if (sax == NULL) {
4858 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4859 return(-1);
4860 }
4861 else
4862 memset(sax, 0, sizeof(htmlSAXHandler));
4863
4864 /* Allocate the Input stack */
4865 ctxt->inputTab = (htmlParserInputPtr *)
4866 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4867 if (ctxt->inputTab == NULL) {
4868 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4869 ctxt->inputNr = 0;
4870 ctxt->inputMax = 0;
4871 ctxt->input = NULL;
4872 return(-1);
4873 }
4874 ctxt->inputNr = 0;
4875 ctxt->inputMax = 5;
4876 ctxt->input = NULL;
4877 ctxt->version = NULL;
4878 ctxt->encoding = NULL;
4879 ctxt->standalone = -1;
4880 ctxt->instate = XML_PARSER_START;
4881
4882 /* Allocate the Node stack */
4883 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4884 if (ctxt->nodeTab == NULL) {
4885 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4886 ctxt->nodeNr = 0;
4887 ctxt->nodeMax = 0;
4888 ctxt->node = NULL;
4889 ctxt->inputNr = 0;
4890 ctxt->inputMax = 0;
4891 ctxt->input = NULL;
4892 return(-1);
4893 }
4894 ctxt->nodeNr = 0;
4895 ctxt->nodeMax = 10;
4896 ctxt->node = NULL;
4897
4898 /* Allocate the Name stack */
4899 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4900 if (ctxt->nameTab == NULL) {
4901 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4902 ctxt->nameNr = 0;
4903 ctxt->nameMax = 0;
4904 ctxt->name = NULL;
4905 ctxt->nodeNr = 0;
4906 ctxt->nodeMax = 0;
4907 ctxt->node = NULL;
4908 ctxt->inputNr = 0;
4909 ctxt->inputMax = 0;
4910 ctxt->input = NULL;
4911 return(-1);
4912 }
4913 ctxt->nameNr = 0;
4914 ctxt->nameMax = 10;
4915 ctxt->name = NULL;
4916
4917 ctxt->nodeInfoTab = NULL;
4918 ctxt->nodeInfoNr = 0;
4919 ctxt->nodeInfoMax = 0;
4920
4921 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4922 else {
4923 ctxt->sax = sax;
4924 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4925 }
4926 ctxt->userData = ctxt;
4927 ctxt->myDoc = NULL;
4928 ctxt->wellFormed = 1;
4929 ctxt->replaceEntities = 0;
4930 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4931 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4932 ctxt->html = 1;
4933 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4934 ctxt->vctxt.userData = ctxt;
4935 ctxt->vctxt.error = xmlParserValidityError;
4936 ctxt->vctxt.warning = xmlParserValidityWarning;
4937 ctxt->record_info = 0;
4938 ctxt->validate = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004939 ctxt->checkIndex = 0;
4940 ctxt->catalogs = NULL;
4941 xmlInitNodeInfoSeq(&ctxt->node_seq);
4942 return(0);
4943}
4944
4945/**
4946 * htmlFreeParserCtxt:
4947 * @ctxt: an HTML parser context
4948 *
4949 * Free all the memory used by a parser context. However the parsed
4950 * document in ctxt->myDoc is not freed.
4951 */
4952
4953void
4954htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4955{
4956 xmlFreeParserCtxt(ctxt);
4957}
4958
4959/**
4960 * htmlNewParserCtxt:
4961 *
4962 * Allocate and initialize a new parser context.
4963 *
4964 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4965 */
4966
4967htmlParserCtxtPtr
4968htmlNewParserCtxt(void)
4969{
4970 xmlParserCtxtPtr ctxt;
4971
4972 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4973 if (ctxt == NULL) {
4974 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4975 return(NULL);
4976 }
4977 memset(ctxt, 0, sizeof(xmlParserCtxt));
4978 if (htmlInitParserCtxt(ctxt) < 0) {
4979 htmlFreeParserCtxt(ctxt);
4980 return(NULL);
4981 }
4982 return(ctxt);
4983}
4984
4985/**
4986 * htmlCreateMemoryParserCtxt:
4987 * @buffer: a pointer to a char array
4988 * @size: the size of the array
4989 *
4990 * Create a parser context for an HTML in-memory document.
4991 *
4992 * Returns the new parser context or NULL
4993 */
4994htmlParserCtxtPtr
4995htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4996 xmlParserCtxtPtr ctxt;
4997 xmlParserInputPtr input;
4998 xmlParserInputBufferPtr buf;
4999
5000 if (buffer == NULL)
5001 return(NULL);
5002 if (size <= 0)
5003 return(NULL);
5004
5005 ctxt = htmlNewParserCtxt();
5006 if (ctxt == NULL)
5007 return(NULL);
5008
5009 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5010 if (buf == NULL) return(NULL);
5011
5012 input = xmlNewInputStream(ctxt);
5013 if (input == NULL) {
5014 xmlFreeParserCtxt(ctxt);
5015 return(NULL);
5016 }
5017
5018 input->filename = NULL;
5019 input->buf = buf;
5020 xmlBufResetInput(buf->buffer, input);
5021
5022 inputPush(ctxt, input);
5023 return(ctxt);
5024}
5025
5026/**
5027 * htmlCreateDocParserCtxt:
5028 * @cur: a pointer to an array of xmlChar
5029 * @encoding: a free form C string describing the HTML document encoding, or NULL
5030 *
5031 * Create a parser context for an HTML document.
5032 *
5033 * TODO: check the need to add encoding handling there
5034 *
5035 * Returns the new parser context or NULL
5036 */
5037static htmlParserCtxtPtr
5038htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5039 int len;
5040 htmlParserCtxtPtr ctxt;
5041
5042 if (cur == NULL)
5043 return(NULL);
5044 len = xmlStrlen(cur);
5045 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5046 if (ctxt == NULL)
5047 return(NULL);
5048
5049 if (encoding != NULL) {
5050 xmlCharEncoding enc;
5051 xmlCharEncodingHandlerPtr handler;
5052
5053 if (ctxt->input->encoding != NULL)
5054 xmlFree((xmlChar *) ctxt->input->encoding);
5055 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5056
5057 enc = xmlParseCharEncoding(encoding);
5058 /*
5059 * registered set of known encodings
5060 */
5061 if (enc != XML_CHAR_ENCODING_ERROR) {
5062 xmlSwitchEncoding(ctxt, enc);
5063 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5064 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5065 "Unsupported encoding %s\n",
5066 (const xmlChar *) encoding, NULL);
5067 }
5068 } else {
5069 /*
5070 * fallback for unknown encodings
5071 */
5072 handler = xmlFindCharEncodingHandler((const char *) encoding);
5073 if (handler != NULL) {
5074 xmlSwitchToEncoding(ctxt, handler);
5075 } else {
5076 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5077 "Unsupported encoding %s\n",
5078 (const xmlChar *) encoding, NULL);
5079 }
5080 }
5081 }
5082 return(ctxt);
5083}
5084
5085#ifdef LIBXML_PUSH_ENABLED
5086/************************************************************************
5087 * *
5088 * Progressive parsing interfaces *
5089 * *
5090 ************************************************************************/
5091
5092/**
5093 * htmlParseLookupSequence:
5094 * @ctxt: an HTML parser context
5095 * @first: the first char to lookup
5096 * @next: the next char to lookup or zero
5097 * @third: the next char to lookup or zero
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005098 * @ignoreattrval: skip over attribute values
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005099 *
5100 * Try to find if a sequence (first, next, third) or just (first next) or
5101 * (first) is available in the input stream.
5102 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5103 * to avoid rescanning sequences of bytes, it DOES change the state of the
5104 * parser, do not use liberally.
5105 * This is basically similar to xmlParseLookupSequence()
5106 *
5107 * Returns the index to the current parsing point if the full sequence
5108 * is available, -1 otherwise.
5109 */
5110static int
5111htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005112 xmlChar next, xmlChar third, int ignoreattrval)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005113{
5114 int base, len;
5115 htmlParserInputPtr in;
5116 const xmlChar *buf;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005117 int invalue = 0;
5118 char valdellim = 0x0;
5119
5120 in = ctxt->input;
5121 if (in == NULL)
5122 return (-1);
5123
5124 base = in->cur - in->base;
5125 if (base < 0)
5126 return (-1);
5127
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005128 if (ctxt->checkIndex > base) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005129 base = ctxt->checkIndex;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005130 /* Abuse hasPErefs member to restore current state. */
5131 invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5132 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005133
5134 if (in->buf == NULL) {
5135 buf = in->base;
5136 len = in->length;
5137 } else {
5138 buf = xmlBufContent(in->buf->buffer);
5139 len = xmlBufUse(in->buf->buffer);
5140 }
5141
5142 /* take into account the sequence length */
5143 if (third)
5144 len -= 2;
5145 else if (next)
5146 len--;
5147 for (; base < len; base++) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005148 if (ignoreattrval) {
5149 if (buf[base] == '"' || buf[base] == '\'') {
5150 if (invalue) {
5151 if (buf[base] == valdellim) {
5152 invalue = 0;
5153 continue;
5154 }
5155 } else {
5156 valdellim = buf[base];
5157 invalue = 1;
5158 continue;
5159 }
5160 } else if (invalue) {
5161 continue;
5162 }
5163 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005164 if (buf[base] == first) {
5165 if (third != 0) {
5166 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5167 continue;
5168 } else if (next != 0) {
5169 if (buf[base + 1] != next)
5170 continue;
5171 }
5172 ctxt->checkIndex = 0;
5173#ifdef DEBUG_PUSH
5174 if (next == 0)
5175 xmlGenericError(xmlGenericErrorContext,
5176 "HPP: lookup '%c' found at %d\n",
5177 first, base);
5178 else if (third == 0)
5179 xmlGenericError(xmlGenericErrorContext,
5180 "HPP: lookup '%c%c' found at %d\n",
5181 first, next, base);
5182 else
5183 xmlGenericError(xmlGenericErrorContext,
5184 "HPP: lookup '%c%c%c' found at %d\n",
5185 first, next, third, base);
5186#endif
5187 return (base - (in->cur - in->base));
5188 }
5189 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005190 ctxt->checkIndex = base;
5191 /* Abuse hasPErefs member to track current state. */
5192 if (invalue)
5193 ctxt->hasPErefs |= 1;
5194 else
5195 ctxt->hasPErefs &= ~1;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005196#ifdef DEBUG_PUSH
5197 if (next == 0)
5198 xmlGenericError(xmlGenericErrorContext,
5199 "HPP: lookup '%c' failed\n", first);
5200 else if (third == 0)
5201 xmlGenericError(xmlGenericErrorContext,
5202 "HPP: lookup '%c%c' failed\n", first, next);
5203 else
5204 xmlGenericError(xmlGenericErrorContext,
5205 "HPP: lookup '%c%c%c' failed\n", first, next,
5206 third);
5207#endif
5208 return (-1);
5209}
5210
5211/**
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005212 * htmlParseTryOrFinish:
5213 * @ctxt: an HTML parser context
5214 * @terminate: last chunk indicator
5215 *
5216 * Try to progress on parsing
5217 *
5218 * Returns zero if no parsing was possible
5219 */
5220static int
5221htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5222 int ret = 0;
5223 htmlParserInputPtr in;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005224 ptrdiff_t avail = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005225 xmlChar cur, next;
5226
5227 htmlParserNodeInfo node_info;
5228
5229#ifdef DEBUG_PUSH
5230 switch (ctxt->instate) {
5231 case XML_PARSER_EOF:
5232 xmlGenericError(xmlGenericErrorContext,
5233 "HPP: try EOF\n"); break;
5234 case XML_PARSER_START:
5235 xmlGenericError(xmlGenericErrorContext,
5236 "HPP: try START\n"); break;
5237 case XML_PARSER_MISC:
5238 xmlGenericError(xmlGenericErrorContext,
5239 "HPP: try MISC\n");break;
5240 case XML_PARSER_COMMENT:
5241 xmlGenericError(xmlGenericErrorContext,
5242 "HPP: try COMMENT\n");break;
5243 case XML_PARSER_PROLOG:
5244 xmlGenericError(xmlGenericErrorContext,
5245 "HPP: try PROLOG\n");break;
5246 case XML_PARSER_START_TAG:
5247 xmlGenericError(xmlGenericErrorContext,
5248 "HPP: try START_TAG\n");break;
5249 case XML_PARSER_CONTENT:
5250 xmlGenericError(xmlGenericErrorContext,
5251 "HPP: try CONTENT\n");break;
5252 case XML_PARSER_CDATA_SECTION:
5253 xmlGenericError(xmlGenericErrorContext,
5254 "HPP: try CDATA_SECTION\n");break;
5255 case XML_PARSER_END_TAG:
5256 xmlGenericError(xmlGenericErrorContext,
5257 "HPP: try END_TAG\n");break;
5258 case XML_PARSER_ENTITY_DECL:
5259 xmlGenericError(xmlGenericErrorContext,
5260 "HPP: try ENTITY_DECL\n");break;
5261 case XML_PARSER_ENTITY_VALUE:
5262 xmlGenericError(xmlGenericErrorContext,
5263 "HPP: try ENTITY_VALUE\n");break;
5264 case XML_PARSER_ATTRIBUTE_VALUE:
5265 xmlGenericError(xmlGenericErrorContext,
5266 "HPP: try ATTRIBUTE_VALUE\n");break;
5267 case XML_PARSER_DTD:
5268 xmlGenericError(xmlGenericErrorContext,
5269 "HPP: try DTD\n");break;
5270 case XML_PARSER_EPILOG:
5271 xmlGenericError(xmlGenericErrorContext,
5272 "HPP: try EPILOG\n");break;
5273 case XML_PARSER_PI:
5274 xmlGenericError(xmlGenericErrorContext,
5275 "HPP: try PI\n");break;
5276 case XML_PARSER_SYSTEM_LITERAL:
5277 xmlGenericError(xmlGenericErrorContext,
5278 "HPP: try SYSTEM_LITERAL\n");break;
5279 }
5280#endif
5281
5282 while (1) {
5283
5284 in = ctxt->input;
5285 if (in == NULL) break;
5286 if (in->buf == NULL)
5287 avail = in->length - (in->cur - in->base);
5288 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005289 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5290 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005291 if ((avail == 0) && (terminate)) {
5292 htmlAutoCloseOnEnd(ctxt);
5293 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5294 /*
5295 * SAX: end of the document processing.
5296 */
5297 ctxt->instate = XML_PARSER_EOF;
5298 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5299 ctxt->sax->endDocument(ctxt->userData);
5300 }
5301 }
5302 if (avail < 1)
5303 goto done;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005304 /*
5305 * This is done to make progress and avoid an infinite loop
5306 * if a parsing attempt was aborted by hitting a NUL byte. After
5307 * changing htmlCurrentChar, this probably isn't necessary anymore.
5308 * We should consider removing this check.
5309 */
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005310 cur = in->cur[0];
5311 if (cur == 0) {
5312 SKIP(1);
5313 continue;
5314 }
5315
5316 switch (ctxt->instate) {
5317 case XML_PARSER_EOF:
5318 /*
5319 * Document parsing is done !
5320 */
5321 goto done;
5322 case XML_PARSER_START:
5323 /*
5324 * Very first chars read from the document flow.
5325 */
5326 cur = in->cur[0];
5327 if (IS_BLANK_CH(cur)) {
5328 SKIP_BLANKS;
5329 if (in->buf == NULL)
5330 avail = in->length - (in->cur - in->base);
5331 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005332 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5333 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005334 }
5335 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5336 ctxt->sax->setDocumentLocator(ctxt->userData,
5337 &xmlDefaultSAXLocator);
5338 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5339 (!ctxt->disableSAX))
5340 ctxt->sax->startDocument(ctxt->userData);
5341
5342 cur = in->cur[0];
5343 next = in->cur[1];
5344 if ((cur == '<') && (next == '!') &&
5345 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5346 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5347 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5348 (UPP(8) == 'E')) {
5349 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005350 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005351 goto done;
5352#ifdef DEBUG_PUSH
5353 xmlGenericError(xmlGenericErrorContext,
5354 "HPP: Parsing internal subset\n");
5355#endif
5356 htmlParseDocTypeDecl(ctxt);
5357 ctxt->instate = XML_PARSER_PROLOG;
5358#ifdef DEBUG_PUSH
5359 xmlGenericError(xmlGenericErrorContext,
5360 "HPP: entering PROLOG\n");
5361#endif
5362 } else {
5363 ctxt->instate = XML_PARSER_MISC;
5364#ifdef DEBUG_PUSH
5365 xmlGenericError(xmlGenericErrorContext,
5366 "HPP: entering MISC\n");
5367#endif
5368 }
5369 break;
5370 case XML_PARSER_MISC:
5371 SKIP_BLANKS;
5372 if (in->buf == NULL)
5373 avail = in->length - (in->cur - in->base);
5374 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005375 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5376 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005377 /*
5378 * no chars in buffer
5379 */
5380 if (avail < 1)
5381 goto done;
5382 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005383 * not enough chars in buffer
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005384 */
5385 if (avail < 2) {
5386 if (!terminate)
5387 goto done;
5388 else
5389 next = ' ';
5390 } else {
5391 next = in->cur[1];
5392 }
5393 cur = in->cur[0];
5394 if ((cur == '<') && (next == '!') &&
5395 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5396 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005397 (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005398 goto done;
5399#ifdef DEBUG_PUSH
5400 xmlGenericError(xmlGenericErrorContext,
5401 "HPP: Parsing Comment\n");
5402#endif
5403 htmlParseComment(ctxt);
5404 ctxt->instate = XML_PARSER_MISC;
5405 } else if ((cur == '<') && (next == '?')) {
5406 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005407 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005408 goto done;
5409#ifdef DEBUG_PUSH
5410 xmlGenericError(xmlGenericErrorContext,
5411 "HPP: Parsing PI\n");
5412#endif
5413 htmlParsePI(ctxt);
5414 ctxt->instate = XML_PARSER_MISC;
5415 } else if ((cur == '<') && (next == '!') &&
5416 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5417 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5418 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5419 (UPP(8) == 'E')) {
5420 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005421 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005422 goto done;
5423#ifdef DEBUG_PUSH
5424 xmlGenericError(xmlGenericErrorContext,
5425 "HPP: Parsing internal subset\n");
5426#endif
5427 htmlParseDocTypeDecl(ctxt);
5428 ctxt->instate = XML_PARSER_PROLOG;
5429#ifdef DEBUG_PUSH
5430 xmlGenericError(xmlGenericErrorContext,
5431 "HPP: entering PROLOG\n");
5432#endif
5433 } else if ((cur == '<') && (next == '!') &&
5434 (avail < 9)) {
5435 goto done;
5436 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005437 ctxt->instate = XML_PARSER_CONTENT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005438#ifdef DEBUG_PUSH
5439 xmlGenericError(xmlGenericErrorContext,
5440 "HPP: entering START_TAG\n");
5441#endif
5442 }
5443 break;
5444 case XML_PARSER_PROLOG:
5445 SKIP_BLANKS;
5446 if (in->buf == NULL)
5447 avail = in->length - (in->cur - in->base);
5448 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005449 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5450 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005451 if (avail < 2)
5452 goto done;
5453 cur = in->cur[0];
5454 next = in->cur[1];
5455 if ((cur == '<') && (next == '!') &&
5456 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5457 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005458 (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005459 goto done;
5460#ifdef DEBUG_PUSH
5461 xmlGenericError(xmlGenericErrorContext,
5462 "HPP: Parsing Comment\n");
5463#endif
5464 htmlParseComment(ctxt);
5465 ctxt->instate = XML_PARSER_PROLOG;
5466 } else if ((cur == '<') && (next == '?')) {
5467 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005468 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005469 goto done;
5470#ifdef DEBUG_PUSH
5471 xmlGenericError(xmlGenericErrorContext,
5472 "HPP: Parsing PI\n");
5473#endif
5474 htmlParsePI(ctxt);
5475 ctxt->instate = XML_PARSER_PROLOG;
5476 } else if ((cur == '<') && (next == '!') &&
5477 (avail < 4)) {
5478 goto done;
5479 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005480 ctxt->instate = XML_PARSER_CONTENT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005481#ifdef DEBUG_PUSH
5482 xmlGenericError(xmlGenericErrorContext,
5483 "HPP: entering START_TAG\n");
5484#endif
5485 }
5486 break;
5487 case XML_PARSER_EPILOG:
5488 if (in->buf == NULL)
5489 avail = in->length - (in->cur - in->base);
5490 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005491 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5492 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005493 if (avail < 1)
5494 goto done;
5495 cur = in->cur[0];
5496 if (IS_BLANK_CH(cur)) {
5497 htmlParseCharData(ctxt);
5498 goto done;
5499 }
5500 if (avail < 2)
5501 goto done;
5502 next = in->cur[1];
5503 if ((cur == '<') && (next == '!') &&
5504 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5505 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005506 (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005507 goto done;
5508#ifdef DEBUG_PUSH
5509 xmlGenericError(xmlGenericErrorContext,
5510 "HPP: Parsing Comment\n");
5511#endif
5512 htmlParseComment(ctxt);
5513 ctxt->instate = XML_PARSER_EPILOG;
5514 } else if ((cur == '<') && (next == '?')) {
5515 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005516 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005517 goto done;
5518#ifdef DEBUG_PUSH
5519 xmlGenericError(xmlGenericErrorContext,
5520 "HPP: Parsing PI\n");
5521#endif
5522 htmlParsePI(ctxt);
5523 ctxt->instate = XML_PARSER_EPILOG;
5524 } else if ((cur == '<') && (next == '!') &&
5525 (avail < 4)) {
5526 goto done;
5527 } else {
5528 ctxt->errNo = XML_ERR_DOCUMENT_END;
5529 ctxt->wellFormed = 0;
5530 ctxt->instate = XML_PARSER_EOF;
5531#ifdef DEBUG_PUSH
5532 xmlGenericError(xmlGenericErrorContext,
5533 "HPP: entering EOF\n");
5534#endif
5535 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5536 ctxt->sax->endDocument(ctxt->userData);
5537 goto done;
5538 }
5539 break;
5540 case XML_PARSER_START_TAG: {
5541 const xmlChar *name;
5542 int failed;
5543 const htmlElemDesc * info;
5544
5545 /*
5546 * no chars in buffer
5547 */
5548 if (avail < 1)
5549 goto done;
5550 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005551 * not enough chars in buffer
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005552 */
5553 if (avail < 2) {
5554 if (!terminate)
5555 goto done;
5556 else
5557 next = ' ';
5558 } else {
5559 next = in->cur[1];
5560 }
5561 cur = in->cur[0];
5562 if (cur != '<') {
5563 ctxt->instate = XML_PARSER_CONTENT;
5564#ifdef DEBUG_PUSH
5565 xmlGenericError(xmlGenericErrorContext,
5566 "HPP: entering CONTENT\n");
5567#endif
5568 break;
5569 }
5570 if (next == '/') {
5571 ctxt->instate = XML_PARSER_END_TAG;
5572 ctxt->checkIndex = 0;
5573#ifdef DEBUG_PUSH
5574 xmlGenericError(xmlGenericErrorContext,
5575 "HPP: entering END_TAG\n");
5576#endif
5577 break;
5578 }
5579 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005580 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005581 goto done;
5582
5583 /* Capture start position */
5584 if (ctxt->record_info) {
5585 node_info.begin_pos = ctxt->input->consumed +
5586 (CUR_PTR - ctxt->input->base);
5587 node_info.begin_line = ctxt->input->line;
5588 }
5589
5590
5591 failed = htmlParseStartTag(ctxt);
5592 name = ctxt->name;
5593 if ((failed == -1) ||
5594 (name == NULL)) {
5595 if (CUR == '>')
5596 NEXT;
5597 break;
5598 }
5599
5600 /*
5601 * Lookup the info for that element.
5602 */
5603 info = htmlTagLookup(name);
5604 if (info == NULL) {
5605 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5606 "Tag %s invalid\n", name, NULL);
5607 }
5608
5609 /*
5610 * Check for an Empty Element labeled the XML/SGML way
5611 */
5612 if ((CUR == '/') && (NXT(1) == '>')) {
5613 SKIP(2);
5614 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5615 ctxt->sax->endElement(ctxt->userData, name);
5616 htmlnamePop(ctxt);
5617 ctxt->instate = XML_PARSER_CONTENT;
5618#ifdef DEBUG_PUSH
5619 xmlGenericError(xmlGenericErrorContext,
5620 "HPP: entering CONTENT\n");
5621#endif
5622 break;
5623 }
5624
5625 if (CUR == '>') {
5626 NEXT;
5627 } else {
5628 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5629 "Couldn't find end of Start Tag %s\n",
5630 name, NULL);
5631
5632 /*
5633 * end of parsing of this node.
5634 */
5635 if (xmlStrEqual(name, ctxt->name)) {
5636 nodePop(ctxt);
5637 htmlnamePop(ctxt);
5638 }
5639
5640 if (ctxt->record_info)
5641 htmlNodeInfoPush(ctxt, &node_info);
5642
5643 ctxt->instate = XML_PARSER_CONTENT;
5644#ifdef DEBUG_PUSH
5645 xmlGenericError(xmlGenericErrorContext,
5646 "HPP: entering CONTENT\n");
5647#endif
5648 break;
5649 }
5650
5651 /*
5652 * Check for an Empty Element from DTD definition
5653 */
5654 if ((info != NULL) && (info->empty)) {
5655 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5656 ctxt->sax->endElement(ctxt->userData, name);
5657 htmlnamePop(ctxt);
5658 }
5659
5660 if (ctxt->record_info)
5661 htmlNodeInfoPush(ctxt, &node_info);
5662
5663 ctxt->instate = XML_PARSER_CONTENT;
5664#ifdef DEBUG_PUSH
5665 xmlGenericError(xmlGenericErrorContext,
5666 "HPP: entering CONTENT\n");
5667#endif
5668 break;
5669 }
5670 case XML_PARSER_CONTENT: {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005671 xmlChar chr[2] = { 0, 0 };
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005672
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005673 /*
5674 * Handle preparsed entities and charRef
5675 */
5676 if (ctxt->token != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005677 chr[0] = (xmlChar) ctxt->token;
5678 htmlCheckParagraph(ctxt);
5679 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5680 ctxt->sax->characters(ctxt->userData, chr, 1);
5681 ctxt->token = 0;
5682 ctxt->checkIndex = 0;
5683 }
5684 if ((avail == 1) && (terminate)) {
5685 cur = in->cur[0];
5686 if ((cur != '<') && (cur != '&')) {
5687 if (ctxt->sax != NULL) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005688 chr[0] = cur;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005689 if (IS_BLANK_CH(cur)) {
5690 if (ctxt->keepBlanks) {
5691 if (ctxt->sax->characters != NULL)
5692 ctxt->sax->characters(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005693 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005694 } else {
5695 if (ctxt->sax->ignorableWhitespace != NULL)
5696 ctxt->sax->ignorableWhitespace(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005697 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005698 }
5699 } else {
5700 htmlCheckParagraph(ctxt);
5701 if (ctxt->sax->characters != NULL)
5702 ctxt->sax->characters(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005703 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005704 }
5705 }
5706 ctxt->token = 0;
5707 ctxt->checkIndex = 0;
5708 in->cur++;
5709 break;
5710 }
5711 }
5712 if (avail < 2)
5713 goto done;
5714 cur = in->cur[0];
5715 next = in->cur[1];
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005716 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5717 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5718 /*
5719 * Handle SCRIPT/STYLE separately
5720 */
5721 if (!terminate) {
5722 int idx;
5723 xmlChar val;
5724
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005725 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005726 if (idx < 0)
5727 goto done;
5728 val = in->cur[idx + 2];
5729 if (val == 0) /* bad cut of input */
5730 goto done;
5731 }
5732 htmlParseScript(ctxt);
5733 if ((cur == '<') && (next == '/')) {
5734 ctxt->instate = XML_PARSER_END_TAG;
5735 ctxt->checkIndex = 0;
5736#ifdef DEBUG_PUSH
5737 xmlGenericError(xmlGenericErrorContext,
5738 "HPP: entering END_TAG\n");
5739#endif
5740 break;
5741 }
5742 } else {
5743 /*
5744 * Sometimes DOCTYPE arrives in the middle of the document
5745 */
5746 if ((cur == '<') && (next == '!') &&
5747 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5748 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5749 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5750 (UPP(8) == 'E')) {
5751 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005752 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005753 goto done;
5754 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5755 "Misplaced DOCTYPE declaration\n",
5756 BAD_CAST "DOCTYPE" , NULL);
5757 htmlParseDocTypeDecl(ctxt);
5758 } else if ((cur == '<') && (next == '!') &&
5759 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5760 if ((!terminate) &&
5761 (htmlParseLookupSequence(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005762 ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005763 goto done;
5764#ifdef DEBUG_PUSH
5765 xmlGenericError(xmlGenericErrorContext,
5766 "HPP: Parsing Comment\n");
5767#endif
5768 htmlParseComment(ctxt);
5769 ctxt->instate = XML_PARSER_CONTENT;
5770 } else if ((cur == '<') && (next == '?')) {
5771 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005772 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005773 goto done;
5774#ifdef DEBUG_PUSH
5775 xmlGenericError(xmlGenericErrorContext,
5776 "HPP: Parsing PI\n");
5777#endif
5778 htmlParsePI(ctxt);
5779 ctxt->instate = XML_PARSER_CONTENT;
5780 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5781 goto done;
5782 } else if ((cur == '<') && (next == '/')) {
5783 ctxt->instate = XML_PARSER_END_TAG;
5784 ctxt->checkIndex = 0;
5785#ifdef DEBUG_PUSH
5786 xmlGenericError(xmlGenericErrorContext,
5787 "HPP: entering END_TAG\n");
5788#endif
5789 break;
5790 } else if (cur == '<') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005791 if ((!terminate) && (next == 0))
5792 goto done;
5793 /*
5794 * Only switch to START_TAG if the next character
5795 * starts a valid name. Otherwise, htmlParseStartTag
5796 * might return without consuming all characters
5797 * up to the final '>'.
5798 */
5799 if ((IS_ASCII_LETTER(next)) ||
5800 (next == '_') || (next == ':') || (next == '.')) {
5801 ctxt->instate = XML_PARSER_START_TAG;
5802 ctxt->checkIndex = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005803#ifdef DEBUG_PUSH
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005804 xmlGenericError(xmlGenericErrorContext,
5805 "HPP: entering START_TAG\n");
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005806#endif
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005807 } else {
5808 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
5809 "htmlParseTryOrFinish: "
5810 "invalid element name\n",
5811 NULL, NULL);
5812 htmlCheckParagraph(ctxt);
5813 if ((ctxt->sax != NULL) &&
5814 (ctxt->sax->characters != NULL))
5815 ctxt->sax->characters(ctxt->userData,
5816 in->cur, 1);
5817 NEXT;
5818 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005819 break;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005820 } else {
5821 /*
5822 * check that the text sequence is complete
5823 * before handing out the data to the parser
5824 * to avoid problems with erroneous end of
5825 * data detection.
5826 */
5827 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005828 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005829 goto done;
5830 ctxt->checkIndex = 0;
5831#ifdef DEBUG_PUSH
5832 xmlGenericError(xmlGenericErrorContext,
5833 "HPP: Parsing char data\n");
5834#endif
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005835 while ((cur != '<') && (cur != 0)) {
5836 if (cur == '&') {
5837 htmlParseReference(ctxt);
5838 } else {
5839 htmlParseCharData(ctxt);
5840 }
5841 cur = in->cur[0];
5842 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005843 }
5844 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005845
5846 break;
5847 }
5848 case XML_PARSER_END_TAG:
5849 if (avail < 2)
5850 goto done;
5851 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005852 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005853 goto done;
5854 htmlParseEndTag(ctxt);
5855 if (ctxt->nameNr == 0) {
5856 ctxt->instate = XML_PARSER_EPILOG;
5857 } else {
5858 ctxt->instate = XML_PARSER_CONTENT;
5859 }
5860 ctxt->checkIndex = 0;
5861#ifdef DEBUG_PUSH
5862 xmlGenericError(xmlGenericErrorContext,
5863 "HPP: entering CONTENT\n");
5864#endif
5865 break;
5866 case XML_PARSER_CDATA_SECTION:
5867 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5868 "HPP: internal error, state == CDATA\n",
5869 NULL, NULL);
5870 ctxt->instate = XML_PARSER_CONTENT;
5871 ctxt->checkIndex = 0;
5872#ifdef DEBUG_PUSH
5873 xmlGenericError(xmlGenericErrorContext,
5874 "HPP: entering CONTENT\n");
5875#endif
5876 break;
5877 case XML_PARSER_DTD:
5878 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5879 "HPP: internal error, state == DTD\n",
5880 NULL, NULL);
5881 ctxt->instate = XML_PARSER_CONTENT;
5882 ctxt->checkIndex = 0;
5883#ifdef DEBUG_PUSH
5884 xmlGenericError(xmlGenericErrorContext,
5885 "HPP: entering CONTENT\n");
5886#endif
5887 break;
5888 case XML_PARSER_COMMENT:
5889 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5890 "HPP: internal error, state == COMMENT\n",
5891 NULL, NULL);
5892 ctxt->instate = XML_PARSER_CONTENT;
5893 ctxt->checkIndex = 0;
5894#ifdef DEBUG_PUSH
5895 xmlGenericError(xmlGenericErrorContext,
5896 "HPP: entering CONTENT\n");
5897#endif
5898 break;
5899 case XML_PARSER_PI:
5900 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5901 "HPP: internal error, state == PI\n",
5902 NULL, NULL);
5903 ctxt->instate = XML_PARSER_CONTENT;
5904 ctxt->checkIndex = 0;
5905#ifdef DEBUG_PUSH
5906 xmlGenericError(xmlGenericErrorContext,
5907 "HPP: entering CONTENT\n");
5908#endif
5909 break;
5910 case XML_PARSER_ENTITY_DECL:
5911 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5912 "HPP: internal error, state == ENTITY_DECL\n",
5913 NULL, NULL);
5914 ctxt->instate = XML_PARSER_CONTENT;
5915 ctxt->checkIndex = 0;
5916#ifdef DEBUG_PUSH
5917 xmlGenericError(xmlGenericErrorContext,
5918 "HPP: entering CONTENT\n");
5919#endif
5920 break;
5921 case XML_PARSER_ENTITY_VALUE:
5922 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5923 "HPP: internal error, state == ENTITY_VALUE\n",
5924 NULL, NULL);
5925 ctxt->instate = XML_PARSER_CONTENT;
5926 ctxt->checkIndex = 0;
5927#ifdef DEBUG_PUSH
5928 xmlGenericError(xmlGenericErrorContext,
5929 "HPP: entering DTD\n");
5930#endif
5931 break;
5932 case XML_PARSER_ATTRIBUTE_VALUE:
5933 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5934 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5935 NULL, NULL);
5936 ctxt->instate = XML_PARSER_START_TAG;
5937 ctxt->checkIndex = 0;
5938#ifdef DEBUG_PUSH
5939 xmlGenericError(xmlGenericErrorContext,
5940 "HPP: entering START_TAG\n");
5941#endif
5942 break;
5943 case XML_PARSER_SYSTEM_LITERAL:
5944 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5945 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5946 NULL, NULL);
5947 ctxt->instate = XML_PARSER_CONTENT;
5948 ctxt->checkIndex = 0;
5949#ifdef DEBUG_PUSH
5950 xmlGenericError(xmlGenericErrorContext,
5951 "HPP: entering CONTENT\n");
5952#endif
5953 break;
5954 case XML_PARSER_IGNORE:
5955 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5956 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5957 NULL, NULL);
5958 ctxt->instate = XML_PARSER_CONTENT;
5959 ctxt->checkIndex = 0;
5960#ifdef DEBUG_PUSH
5961 xmlGenericError(xmlGenericErrorContext,
5962 "HPP: entering CONTENT\n");
5963#endif
5964 break;
5965 case XML_PARSER_PUBLIC_LITERAL:
5966 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5967 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5968 NULL, NULL);
5969 ctxt->instate = XML_PARSER_CONTENT;
5970 ctxt->checkIndex = 0;
5971#ifdef DEBUG_PUSH
5972 xmlGenericError(xmlGenericErrorContext,
5973 "HPP: entering CONTENT\n");
5974#endif
5975 break;
5976
5977 }
5978 }
5979done:
5980 if ((avail == 0) && (terminate)) {
5981 htmlAutoCloseOnEnd(ctxt);
5982 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5983 /*
5984 * SAX: end of the document processing.
5985 */
5986 ctxt->instate = XML_PARSER_EOF;
5987 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5988 ctxt->sax->endDocument(ctxt->userData);
5989 }
5990 }
5991 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5992 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5993 (ctxt->instate == XML_PARSER_EPILOG))) {
5994 xmlDtdPtr dtd;
5995 dtd = xmlGetIntSubset(ctxt->myDoc);
5996 if (dtd == NULL)
5997 ctxt->myDoc->intSubset =
5998 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5999 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6000 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6001 }
6002#ifdef DEBUG_PUSH
6003 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6004#endif
6005 return(ret);
6006}
6007
6008/**
6009 * htmlParseChunk:
6010 * @ctxt: an HTML parser context
6011 * @chunk: an char array
6012 * @size: the size in byte of the chunk
6013 * @terminate: last chunk indicator
6014 *
6015 * Parse a Chunk of memory
6016 *
6017 * Returns zero if no error, the xmlParserErrors otherwise.
6018 */
6019int
6020htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6021 int terminate) {
6022 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6023 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6024 "htmlParseChunk: context error\n", NULL, NULL);
6025 return(XML_ERR_INTERNAL_ERROR);
6026 }
6027 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6028 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6029 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6030 size_t cur = ctxt->input->cur - ctxt->input->base;
6031 int res;
6032
6033 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006034 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006035 if (res < 0) {
6036 ctxt->errNo = XML_PARSER_EOF;
6037 ctxt->disableSAX = 1;
6038 return (XML_PARSER_EOF);
6039 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006040#ifdef DEBUG_PUSH
6041 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6042#endif
6043
6044#if 0
6045 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6046 htmlParseTryOrFinish(ctxt, terminate);
6047#endif
6048 } else if (ctxt->instate != XML_PARSER_EOF) {
6049 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6050 xmlParserInputBufferPtr in = ctxt->input->buf;
6051 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6052 (in->raw != NULL)) {
6053 int nbchars;
6054 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6055 size_t current = ctxt->input->cur - ctxt->input->base;
6056
6057 nbchars = xmlCharEncInput(in, terminate);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006058 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006059 if (nbchars < 0) {
6060 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6061 "encoder error\n", NULL, NULL);
6062 return(XML_ERR_INVALID_ENCODING);
6063 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006064 }
6065 }
6066 }
6067 htmlParseTryOrFinish(ctxt, terminate);
6068 if (terminate) {
6069 if ((ctxt->instate != XML_PARSER_EOF) &&
6070 (ctxt->instate != XML_PARSER_EPILOG) &&
6071 (ctxt->instate != XML_PARSER_MISC)) {
6072 ctxt->errNo = XML_ERR_DOCUMENT_END;
6073 ctxt->wellFormed = 0;
6074 }
6075 if (ctxt->instate != XML_PARSER_EOF) {
6076 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6077 ctxt->sax->endDocument(ctxt->userData);
6078 }
6079 ctxt->instate = XML_PARSER_EOF;
6080 }
6081 return((xmlParserErrors) ctxt->errNo);
6082}
6083
6084/************************************************************************
6085 * *
6086 * User entry points *
6087 * *
6088 ************************************************************************/
6089
6090/**
6091 * htmlCreatePushParserCtxt:
6092 * @sax: a SAX handler
6093 * @user_data: The user data returned on SAX callbacks
6094 * @chunk: a pointer to an array of chars
6095 * @size: number of chars in the array
6096 * @filename: an optional file name or URI
6097 * @enc: an optional encoding
6098 *
6099 * Create a parser context for using the HTML parser in push mode
6100 * The value of @filename is used for fetching external entities
6101 * and error/warning reports.
6102 *
6103 * Returns the new parser context or NULL
6104 */
6105htmlParserCtxtPtr
6106htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6107 const char *chunk, int size, const char *filename,
6108 xmlCharEncoding enc) {
6109 htmlParserCtxtPtr ctxt;
6110 htmlParserInputPtr inputStream;
6111 xmlParserInputBufferPtr buf;
6112
6113 xmlInitParser();
6114
6115 buf = xmlAllocParserInputBuffer(enc);
6116 if (buf == NULL) return(NULL);
6117
6118 ctxt = htmlNewParserCtxt();
6119 if (ctxt == NULL) {
6120 xmlFreeParserInputBuffer(buf);
6121 return(NULL);
6122 }
6123 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6124 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6125 if (sax != NULL) {
6126 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6127 xmlFree(ctxt->sax);
6128 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6129 if (ctxt->sax == NULL) {
6130 xmlFree(buf);
6131 xmlFree(ctxt);
6132 return(NULL);
6133 }
6134 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6135 if (user_data != NULL)
6136 ctxt->userData = user_data;
6137 }
6138 if (filename == NULL) {
6139 ctxt->directory = NULL;
6140 } else {
6141 ctxt->directory = xmlParserGetDirectory(filename);
6142 }
6143
6144 inputStream = htmlNewInputStream(ctxt);
6145 if (inputStream == NULL) {
6146 xmlFreeParserCtxt(ctxt);
6147 xmlFree(buf);
6148 return(NULL);
6149 }
6150
6151 if (filename == NULL)
6152 inputStream->filename = NULL;
6153 else
6154 inputStream->filename = (char *)
6155 xmlCanonicPath((const xmlChar *) filename);
6156 inputStream->buf = buf;
6157 xmlBufResetInput(buf->buffer, inputStream);
6158
6159 inputPush(ctxt, inputStream);
6160
6161 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6162 (ctxt->input->buf != NULL)) {
6163 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6164 size_t cur = ctxt->input->cur - ctxt->input->base;
6165
6166 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6167
6168 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6169#ifdef DEBUG_PUSH
6170 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6171#endif
6172 }
6173 ctxt->progressive = 1;
6174
6175 return(ctxt);
6176}
6177#endif /* LIBXML_PUSH_ENABLED */
6178
6179/**
6180 * htmlSAXParseDoc:
6181 * @cur: a pointer to an array of xmlChar
6182 * @encoding: a free form C string describing the HTML document encoding, or NULL
6183 * @sax: the SAX handler block
6184 * @userData: if using SAX, this pointer will be provided on callbacks.
6185 *
6186 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6187 * to handle parse events. If sax is NULL, fallback to the default DOM
6188 * behavior and return a tree.
6189 *
6190 * Returns the resulting document tree unless SAX is NULL or the document is
6191 * not well formed.
6192 */
6193
6194htmlDocPtr
6195htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6196 htmlSAXHandlerPtr sax, void *userData) {
6197 htmlDocPtr ret;
6198 htmlParserCtxtPtr ctxt;
6199
6200 xmlInitParser();
6201
6202 if (cur == NULL) return(NULL);
6203
6204
6205 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6206 if (ctxt == NULL) return(NULL);
6207 if (sax != NULL) {
6208 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6209 ctxt->sax = sax;
6210 ctxt->userData = userData;
6211 }
6212
6213 htmlParseDocument(ctxt);
6214 ret = ctxt->myDoc;
6215 if (sax != NULL) {
6216 ctxt->sax = NULL;
6217 ctxt->userData = NULL;
6218 }
6219 htmlFreeParserCtxt(ctxt);
6220
6221 return(ret);
6222}
6223
6224/**
6225 * htmlParseDoc:
6226 * @cur: a pointer to an array of xmlChar
6227 * @encoding: a free form C string describing the HTML document encoding, or NULL
6228 *
6229 * parse an HTML in-memory document and build a tree.
6230 *
6231 * Returns the resulting document tree
6232 */
6233
6234htmlDocPtr
6235htmlParseDoc(const xmlChar *cur, const char *encoding) {
6236 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6237}
6238
6239
6240/**
6241 * htmlCreateFileParserCtxt:
6242 * @filename: the filename
6243 * @encoding: a free form C string describing the HTML document encoding, or NULL
6244 *
6245 * Create a parser context for a file content.
6246 * Automatic support for ZLIB/Compress compressed document is provided
6247 * by default if found at compile-time.
6248 *
6249 * Returns the new parser context or NULL
6250 */
6251htmlParserCtxtPtr
6252htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6253{
6254 htmlParserCtxtPtr ctxt;
6255 htmlParserInputPtr inputStream;
6256 char *canonicFilename;
6257 /* htmlCharEncoding enc; */
6258 xmlChar *content, *content_line = (xmlChar *) "charset=";
6259
6260 if (filename == NULL)
6261 return(NULL);
6262
6263 ctxt = htmlNewParserCtxt();
6264 if (ctxt == NULL) {
6265 return(NULL);
6266 }
6267 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6268 if (canonicFilename == NULL) {
6269#ifdef LIBXML_SAX1_ENABLED
6270 if (xmlDefaultSAXHandler.error != NULL) {
6271 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6272 }
6273#endif
6274 xmlFreeParserCtxt(ctxt);
6275 return(NULL);
6276 }
6277
6278 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6279 xmlFree(canonicFilename);
6280 if (inputStream == NULL) {
6281 xmlFreeParserCtxt(ctxt);
6282 return(NULL);
6283 }
6284
6285 inputPush(ctxt, inputStream);
6286
6287 /* set encoding */
6288 if (encoding) {
6289 size_t l = strlen(encoding);
6290
6291 if (l < 1000) {
6292 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6293 if (content) {
6294 strcpy ((char *)content, (char *)content_line);
6295 strcat ((char *)content, (char *)encoding);
6296 htmlCheckEncoding (ctxt, content);
6297 xmlFree (content);
6298 }
6299 }
6300 }
6301
6302 return(ctxt);
6303}
6304
6305/**
6306 * htmlSAXParseFile:
6307 * @filename: the filename
6308 * @encoding: a free form C string describing the HTML document encoding, or NULL
6309 * @sax: the SAX handler block
6310 * @userData: if using SAX, this pointer will be provided on callbacks.
6311 *
6312 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6313 * compressed document is provided by default if found at compile-time.
6314 * It use the given SAX function block to handle the parsing callback.
6315 * If sax is NULL, fallback to the default DOM tree building routines.
6316 *
6317 * Returns the resulting document tree unless SAX is NULL or the document is
6318 * not well formed.
6319 */
6320
6321htmlDocPtr
6322htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6323 void *userData) {
6324 htmlDocPtr ret;
6325 htmlParserCtxtPtr ctxt;
6326 htmlSAXHandlerPtr oldsax = NULL;
6327
6328 xmlInitParser();
6329
6330 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6331 if (ctxt == NULL) return(NULL);
6332 if (sax != NULL) {
6333 oldsax = ctxt->sax;
6334 ctxt->sax = sax;
6335 ctxt->userData = userData;
6336 }
6337
6338 htmlParseDocument(ctxt);
6339
6340 ret = ctxt->myDoc;
6341 if (sax != NULL) {
6342 ctxt->sax = oldsax;
6343 ctxt->userData = NULL;
6344 }
6345 htmlFreeParserCtxt(ctxt);
6346
6347 return(ret);
6348}
6349
6350/**
6351 * htmlParseFile:
6352 * @filename: the filename
6353 * @encoding: a free form C string describing the HTML document encoding, or NULL
6354 *
6355 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6356 * compressed document is provided by default if found at compile-time.
6357 *
6358 * Returns the resulting document tree
6359 */
6360
6361htmlDocPtr
6362htmlParseFile(const char *filename, const char *encoding) {
6363 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6364}
6365
6366/**
6367 * htmlHandleOmittedElem:
6368 * @val: int 0 or 1
6369 *
6370 * Set and return the previous value for handling HTML omitted tags.
6371 *
6372 * Returns the last value for 0 for no handling, 1 for auto insertion.
6373 */
6374
6375int
6376htmlHandleOmittedElem(int val) {
6377 int old = htmlOmittedDefaultValue;
6378
6379 htmlOmittedDefaultValue = val;
6380 return(old);
6381}
6382
6383/**
6384 * htmlElementAllowedHere:
6385 * @parent: HTML parent element
6386 * @elt: HTML element
6387 *
6388 * Checks whether an HTML element may be a direct child of a parent element.
6389 * Note - doesn't check for deprecated elements
6390 *
6391 * Returns 1 if allowed; 0 otherwise.
6392 */
6393int
6394htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6395 const char** p ;
6396
6397 if ( ! elt || ! parent || ! parent->subelts )
6398 return 0 ;
6399
6400 for ( p = parent->subelts; *p; ++p )
6401 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6402 return 1 ;
6403
6404 return 0 ;
6405}
6406/**
6407 * htmlElementStatusHere:
6408 * @parent: HTML parent element
6409 * @elt: HTML element
6410 *
6411 * Checks whether an HTML element may be a direct child of a parent element.
6412 * and if so whether it is valid or deprecated.
6413 *
6414 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6415 */
6416htmlStatus
6417htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6418 if ( ! parent || ! elt )
6419 return HTML_INVALID ;
6420 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6421 return HTML_INVALID ;
6422
6423 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6424}
6425/**
6426 * htmlAttrAllowed:
6427 * @elt: HTML element
6428 * @attr: HTML attribute
6429 * @legacy: whether to allow deprecated attributes
6430 *
6431 * Checks whether an attribute is valid for an element
6432 * Has full knowledge of Required and Deprecated attributes
6433 *
6434 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6435 */
6436htmlStatus
6437htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6438 const char** p ;
6439
6440 if ( !elt || ! attr )
6441 return HTML_INVALID ;
6442
6443 if ( elt->attrs_req )
6444 for ( p = elt->attrs_req; *p; ++p)
6445 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6446 return HTML_REQUIRED ;
6447
6448 if ( elt->attrs_opt )
6449 for ( p = elt->attrs_opt; *p; ++p)
6450 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6451 return HTML_VALID ;
6452
6453 if ( legacy && elt->attrs_depr )
6454 for ( p = elt->attrs_depr; *p; ++p)
6455 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6456 return HTML_DEPRECATED ;
6457
6458 return HTML_INVALID ;
6459}
6460/**
6461 * htmlNodeStatus:
6462 * @node: an htmlNodePtr in a tree
6463 * @legacy: whether to allow deprecated elements (YES is faster here
6464 * for Element nodes)
6465 *
6466 * Checks whether the tree node is valid. Experimental (the author
6467 * only uses the HTML enhancements in a SAX parser)
6468 *
6469 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6470 * legacy allowed) or htmlElementStatusHere (otherwise).
6471 * for Attribute nodes, a return from htmlAttrAllowed
6472 * for other nodes, HTML_NA (no checks performed)
6473 */
6474htmlStatus
6475htmlNodeStatus(const htmlNodePtr node, int legacy) {
6476 if ( ! node )
6477 return HTML_INVALID ;
6478
6479 switch ( node->type ) {
6480 case XML_ELEMENT_NODE:
6481 return legacy
6482 ? ( htmlElementAllowedHere (
6483 htmlTagLookup(node->parent->name) , node->name
6484 ) ? HTML_VALID : HTML_INVALID )
6485 : htmlElementStatusHere(
6486 htmlTagLookup(node->parent->name) ,
6487 htmlTagLookup(node->name) )
6488 ;
6489 case XML_ATTRIBUTE_NODE:
6490 return htmlAttrAllowed(
6491 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6492 default: return HTML_NA ;
6493 }
6494}
6495/************************************************************************
6496 * *
6497 * New set (2.6.0) of simpler and more flexible APIs *
6498 * *
6499 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is expanded. The macro expands to a single statement and must be
 * followed by a semicolon, which makes it safe inside unbraced
 * if/else bodies.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6511
6512/**
6513 * htmlCtxtReset:
6514 * @ctxt: an HTML parser context
6515 *
6516 * Reset a parser context
6517 */
6518void
6519htmlCtxtReset(htmlParserCtxtPtr ctxt)
6520{
6521 xmlParserInputPtr input;
6522 xmlDictPtr dict;
6523
6524 if (ctxt == NULL)
6525 return;
6526
6527 xmlInitParser();
6528 dict = ctxt->dict;
6529
6530 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6531 xmlFreeInputStream(input);
6532 }
6533 ctxt->inputNr = 0;
6534 ctxt->input = NULL;
6535
6536 ctxt->spaceNr = 0;
6537 if (ctxt->spaceTab != NULL) {
6538 ctxt->spaceTab[0] = -1;
6539 ctxt->space = &ctxt->spaceTab[0];
6540 } else {
6541 ctxt->space = NULL;
6542 }
6543
6544
6545 ctxt->nodeNr = 0;
6546 ctxt->node = NULL;
6547
6548 ctxt->nameNr = 0;
6549 ctxt->name = NULL;
6550
6551 DICT_FREE(ctxt->version);
6552 ctxt->version = NULL;
6553 DICT_FREE(ctxt->encoding);
6554 ctxt->encoding = NULL;
6555 DICT_FREE(ctxt->directory);
6556 ctxt->directory = NULL;
6557 DICT_FREE(ctxt->extSubURI);
6558 ctxt->extSubURI = NULL;
6559 DICT_FREE(ctxt->extSubSystem);
6560 ctxt->extSubSystem = NULL;
6561 if (ctxt->myDoc != NULL)
6562 xmlFreeDoc(ctxt->myDoc);
6563 ctxt->myDoc = NULL;
6564
6565 ctxt->standalone = -1;
6566 ctxt->hasExternalSubset = 0;
6567 ctxt->hasPErefs = 0;
6568 ctxt->html = 1;
6569 ctxt->external = 0;
6570 ctxt->instate = XML_PARSER_START;
6571 ctxt->token = 0;
6572
6573 ctxt->wellFormed = 1;
6574 ctxt->nsWellFormed = 1;
6575 ctxt->disableSAX = 0;
6576 ctxt->valid = 1;
6577 ctxt->vctxt.userData = ctxt;
6578 ctxt->vctxt.error = xmlParserValidityError;
6579 ctxt->vctxt.warning = xmlParserValidityWarning;
6580 ctxt->record_info = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006581 ctxt->checkIndex = 0;
6582 ctxt->inSubset = 0;
6583 ctxt->errNo = XML_ERR_OK;
6584 ctxt->depth = 0;
6585 ctxt->charset = XML_CHAR_ENCODING_NONE;
6586 ctxt->catalogs = NULL;
6587 xmlInitNodeInfoSeq(&ctxt->node_seq);
6588
6589 if (ctxt->attsDefault != NULL) {
6590 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6591 ctxt->attsDefault = NULL;
6592 }
6593 if (ctxt->attsSpecial != NULL) {
6594 xmlHashFree(ctxt->attsSpecial, NULL);
6595 ctxt->attsSpecial = NULL;
6596 }
6597}
6598
6599/**
6600 * htmlCtxtUseOptions:
6601 * @ctxt: an HTML parser context
6602 * @options: a combination of htmlParserOption(s)
6603 *
6604 * Applies the options to the parser context
6605 *
6606 * Returns 0 in case of success, the set of unknown or unimplemented options
6607 * in case of error.
6608 */
6609int
6610htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6611{
6612 if (ctxt == NULL)
6613 return(-1);
6614
6615 if (options & HTML_PARSE_NOWARNING) {
6616 ctxt->sax->warning = NULL;
6617 ctxt->vctxt.warning = NULL;
6618 options -= XML_PARSE_NOWARNING;
6619 ctxt->options |= XML_PARSE_NOWARNING;
6620 }
6621 if (options & HTML_PARSE_NOERROR) {
6622 ctxt->sax->error = NULL;
6623 ctxt->vctxt.error = NULL;
6624 ctxt->sax->fatalError = NULL;
6625 options -= XML_PARSE_NOERROR;
6626 ctxt->options |= XML_PARSE_NOERROR;
6627 }
6628 if (options & HTML_PARSE_PEDANTIC) {
6629 ctxt->pedantic = 1;
6630 options -= XML_PARSE_PEDANTIC;
6631 ctxt->options |= XML_PARSE_PEDANTIC;
6632 } else
6633 ctxt->pedantic = 0;
6634 if (options & XML_PARSE_NOBLANKS) {
6635 ctxt->keepBlanks = 0;
6636 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6637 options -= XML_PARSE_NOBLANKS;
6638 ctxt->options |= XML_PARSE_NOBLANKS;
6639 } else
6640 ctxt->keepBlanks = 1;
6641 if (options & HTML_PARSE_RECOVER) {
6642 ctxt->recovery = 1;
6643 options -= HTML_PARSE_RECOVER;
6644 } else
6645 ctxt->recovery = 0;
6646 if (options & HTML_PARSE_COMPACT) {
6647 ctxt->options |= HTML_PARSE_COMPACT;
6648 options -= HTML_PARSE_COMPACT;
6649 }
6650 if (options & XML_PARSE_HUGE) {
6651 ctxt->options |= XML_PARSE_HUGE;
6652 options -= XML_PARSE_HUGE;
6653 }
6654 if (options & HTML_PARSE_NODEFDTD) {
6655 ctxt->options |= HTML_PARSE_NODEFDTD;
6656 options -= HTML_PARSE_NODEFDTD;
6657 }
6658 if (options & HTML_PARSE_IGNORE_ENC) {
6659 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6660 options -= HTML_PARSE_IGNORE_ENC;
6661 }
6662 if (options & HTML_PARSE_NOIMPLIED) {
6663 ctxt->options |= HTML_PARSE_NOIMPLIED;
6664 options -= HTML_PARSE_NOIMPLIED;
6665 }
6666 ctxt->dictNames = 0;
6667 return (options);
6668}
6669
6670/**
6671 * htmlDoRead:
6672 * @ctxt: an HTML parser context
6673 * @URL: the base URL to use for the document
6674 * @encoding: the document encoding, or NULL
6675 * @options: a combination of htmlParserOption(s)
6676 * @reuse: keep the context for reuse
6677 *
6678 * Common front-end for the htmlRead functions
6679 *
6680 * Returns the resulting document tree or NULL
6681 */
6682static htmlDocPtr
6683htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6684 int options, int reuse)
6685{
6686 htmlDocPtr ret;
6687
6688 htmlCtxtUseOptions(ctxt, options);
6689 ctxt->html = 1;
6690 if (encoding != NULL) {
6691 xmlCharEncodingHandlerPtr hdlr;
6692
6693 hdlr = xmlFindCharEncodingHandler(encoding);
6694 if (hdlr != NULL) {
6695 xmlSwitchToEncoding(ctxt, hdlr);
6696 if (ctxt->input->encoding != NULL)
6697 xmlFree((xmlChar *) ctxt->input->encoding);
6698 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6699 }
6700 }
6701 if ((URL != NULL) && (ctxt->input != NULL) &&
6702 (ctxt->input->filename == NULL))
6703 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6704 htmlParseDocument(ctxt);
6705 ret = ctxt->myDoc;
6706 ctxt->myDoc = NULL;
6707 if (!reuse) {
6708 if ((ctxt->dictNames) &&
6709 (ret != NULL) &&
6710 (ret->dict == ctxt->dict))
6711 ctxt->dict = NULL;
6712 xmlFreeParserCtxt(ctxt);
6713 }
6714 return (ret);
6715}
6716
6717/**
6718 * htmlReadDoc:
6719 * @cur: a pointer to a zero terminated string
6720 * @URL: the base URL to use for the document
6721 * @encoding: the document encoding, or NULL
6722 * @options: a combination of htmlParserOption(s)
6723 *
6724 * parse an XML in-memory document and build a tree.
6725 *
6726 * Returns the resulting document tree
6727 */
6728htmlDocPtr
6729htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6730{
6731 htmlParserCtxtPtr ctxt;
6732
6733 if (cur == NULL)
6734 return (NULL);
6735
6736 xmlInitParser();
6737 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6738 if (ctxt == NULL)
6739 return (NULL);
6740 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6741}
6742
6743/**
6744 * htmlReadFile:
6745 * @filename: a file or URL
6746 * @encoding: the document encoding, or NULL
6747 * @options: a combination of htmlParserOption(s)
6748 *
6749 * parse an XML file from the filesystem or the network.
6750 *
6751 * Returns the resulting document tree
6752 */
6753htmlDocPtr
6754htmlReadFile(const char *filename, const char *encoding, int options)
6755{
6756 htmlParserCtxtPtr ctxt;
6757
6758 xmlInitParser();
6759 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6760 if (ctxt == NULL)
6761 return (NULL);
6762 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6763}
6764
6765/**
6766 * htmlReadMemory:
6767 * @buffer: a pointer to a char array
6768 * @size: the size of the array
6769 * @URL: the base URL to use for the document
6770 * @encoding: the document encoding, or NULL
6771 * @options: a combination of htmlParserOption(s)
6772 *
6773 * parse an XML in-memory document and build a tree.
6774 *
6775 * Returns the resulting document tree
6776 */
6777htmlDocPtr
6778htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6779{
6780 htmlParserCtxtPtr ctxt;
6781
6782 xmlInitParser();
6783 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6784 if (ctxt == NULL)
6785 return (NULL);
6786 htmlDefaultSAXHandlerInit();
6787 if (ctxt->sax != NULL)
6788 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6789 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6790}
6791
6792/**
6793 * htmlReadFd:
6794 * @fd: an open file descriptor
6795 * @URL: the base URL to use for the document
6796 * @encoding: the document encoding, or NULL
6797 * @options: a combination of htmlParserOption(s)
6798 *
6799 * parse an XML from a file descriptor and build a tree.
6800 *
6801 * Returns the resulting document tree
6802 */
6803htmlDocPtr
6804htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6805{
6806 htmlParserCtxtPtr ctxt;
6807 xmlParserInputBufferPtr input;
6808 xmlParserInputPtr stream;
6809
6810 if (fd < 0)
6811 return (NULL);
6812 xmlInitParser();
6813
6814 xmlInitParser();
6815 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6816 if (input == NULL)
6817 return (NULL);
6818 ctxt = xmlNewParserCtxt();
6819 if (ctxt == NULL) {
6820 xmlFreeParserInputBuffer(input);
6821 return (NULL);
6822 }
6823 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6824 if (stream == NULL) {
6825 xmlFreeParserInputBuffer(input);
6826 xmlFreeParserCtxt(ctxt);
6827 return (NULL);
6828 }
6829 inputPush(ctxt, stream);
6830 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6831}
6832
6833/**
6834 * htmlReadIO:
6835 * @ioread: an I/O read function
6836 * @ioclose: an I/O close function
6837 * @ioctx: an I/O handler
6838 * @URL: the base URL to use for the document
6839 * @encoding: the document encoding, or NULL
6840 * @options: a combination of htmlParserOption(s)
6841 *
6842 * parse an HTML document from I/O functions and source and build a tree.
6843 *
6844 * Returns the resulting document tree
6845 */
6846htmlDocPtr
6847htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6848 void *ioctx, const char *URL, const char *encoding, int options)
6849{
6850 htmlParserCtxtPtr ctxt;
6851 xmlParserInputBufferPtr input;
6852 xmlParserInputPtr stream;
6853
6854 if (ioread == NULL)
6855 return (NULL);
6856 xmlInitParser();
6857
6858 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6859 XML_CHAR_ENCODING_NONE);
6860 if (input == NULL) {
6861 if (ioclose != NULL)
6862 ioclose(ioctx);
6863 return (NULL);
6864 }
6865 ctxt = htmlNewParserCtxt();
6866 if (ctxt == NULL) {
6867 xmlFreeParserInputBuffer(input);
6868 return (NULL);
6869 }
6870 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6871 if (stream == NULL) {
6872 xmlFreeParserInputBuffer(input);
6873 xmlFreeParserCtxt(ctxt);
6874 return (NULL);
6875 }
6876 inputPush(ctxt, stream);
6877 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6878}
6879
6880/**
6881 * htmlCtxtReadDoc:
6882 * @ctxt: an HTML parser context
6883 * @cur: a pointer to a zero terminated string
6884 * @URL: the base URL to use for the document
6885 * @encoding: the document encoding, or NULL
6886 * @options: a combination of htmlParserOption(s)
6887 *
6888 * parse an XML in-memory document and build a tree.
6889 * This reuses the existing @ctxt parser context
6890 *
6891 * Returns the resulting document tree
6892 */
6893htmlDocPtr
6894htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6895 const char *URL, const char *encoding, int options)
6896{
6897 xmlParserInputPtr stream;
6898
6899 if (cur == NULL)
6900 return (NULL);
6901 if (ctxt == NULL)
6902 return (NULL);
6903 xmlInitParser();
6904
6905 htmlCtxtReset(ctxt);
6906
6907 stream = xmlNewStringInputStream(ctxt, cur);
6908 if (stream == NULL) {
6909 return (NULL);
6910 }
6911 inputPush(ctxt, stream);
6912 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6913}
6914
6915/**
6916 * htmlCtxtReadFile:
6917 * @ctxt: an HTML parser context
6918 * @filename: a file or URL
6919 * @encoding: the document encoding, or NULL
6920 * @options: a combination of htmlParserOption(s)
6921 *
6922 * parse an XML file from the filesystem or the network.
6923 * This reuses the existing @ctxt parser context
6924 *
6925 * Returns the resulting document tree
6926 */
6927htmlDocPtr
6928htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6929 const char *encoding, int options)
6930{
6931 xmlParserInputPtr stream;
6932
6933 if (filename == NULL)
6934 return (NULL);
6935 if (ctxt == NULL)
6936 return (NULL);
6937 xmlInitParser();
6938
6939 htmlCtxtReset(ctxt);
6940
6941 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6942 if (stream == NULL) {
6943 return (NULL);
6944 }
6945 inputPush(ctxt, stream);
6946 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6947}
6948
6949/**
6950 * htmlCtxtReadMemory:
6951 * @ctxt: an HTML parser context
6952 * @buffer: a pointer to a char array
6953 * @size: the size of the array
6954 * @URL: the base URL to use for the document
6955 * @encoding: the document encoding, or NULL
6956 * @options: a combination of htmlParserOption(s)
6957 *
6958 * parse an XML in-memory document and build a tree.
6959 * This reuses the existing @ctxt parser context
6960 *
6961 * Returns the resulting document tree
6962 */
6963htmlDocPtr
6964htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6965 const char *URL, const char *encoding, int options)
6966{
6967 xmlParserInputBufferPtr input;
6968 xmlParserInputPtr stream;
6969
6970 if (ctxt == NULL)
6971 return (NULL);
6972 if (buffer == NULL)
6973 return (NULL);
6974 xmlInitParser();
6975
6976 htmlCtxtReset(ctxt);
6977
6978 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6979 if (input == NULL) {
6980 return(NULL);
6981 }
6982
6983 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6984 if (stream == NULL) {
6985 xmlFreeParserInputBuffer(input);
6986 return(NULL);
6987 }
6988
6989 inputPush(ctxt, stream);
6990 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6991}
6992
6993/**
6994 * htmlCtxtReadFd:
6995 * @ctxt: an HTML parser context
6996 * @fd: an open file descriptor
6997 * @URL: the base URL to use for the document
6998 * @encoding: the document encoding, or NULL
6999 * @options: a combination of htmlParserOption(s)
7000 *
7001 * parse an XML from a file descriptor and build a tree.
7002 * This reuses the existing @ctxt parser context
7003 *
7004 * Returns the resulting document tree
7005 */
7006htmlDocPtr
7007htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7008 const char *URL, const char *encoding, int options)
7009{
7010 xmlParserInputBufferPtr input;
7011 xmlParserInputPtr stream;
7012
7013 if (fd < 0)
7014 return (NULL);
7015 if (ctxt == NULL)
7016 return (NULL);
7017 xmlInitParser();
7018
7019 htmlCtxtReset(ctxt);
7020
7021
7022 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7023 if (input == NULL)
7024 return (NULL);
7025 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7026 if (stream == NULL) {
7027 xmlFreeParserInputBuffer(input);
7028 return (NULL);
7029 }
7030 inputPush(ctxt, stream);
7031 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7032}
7033
7034/**
7035 * htmlCtxtReadIO:
7036 * @ctxt: an HTML parser context
7037 * @ioread: an I/O read function
7038 * @ioclose: an I/O close function
7039 * @ioctx: an I/O handler
7040 * @URL: the base URL to use for the document
7041 * @encoding: the document encoding, or NULL
7042 * @options: a combination of htmlParserOption(s)
7043 *
7044 * parse an HTML document from I/O functions and source and build a tree.
7045 * This reuses the existing @ctxt parser context
7046 *
7047 * Returns the resulting document tree
7048 */
7049htmlDocPtr
7050htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7051 xmlInputCloseCallback ioclose, void *ioctx,
7052 const char *URL,
7053 const char *encoding, int options)
7054{
7055 xmlParserInputBufferPtr input;
7056 xmlParserInputPtr stream;
7057
7058 if (ioread == NULL)
7059 return (NULL);
7060 if (ctxt == NULL)
7061 return (NULL);
7062 xmlInitParser();
7063
7064 htmlCtxtReset(ctxt);
7065
7066 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7067 XML_CHAR_ENCODING_NONE);
7068 if (input == NULL) {
7069 if (ioclose != NULL)
7070 ioclose(ioctx);
7071 return (NULL);
7072 }
7073 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7074 if (stream == NULL) {
7075 xmlFreeParserInputBuffer(input);
7076 return (NULL);
7077 }
7078 inputPush(ctxt, stream);
7079 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7080}
7081
7082#define bottom_HTMLparser
7083#include "elfgcchack.h"
7084#endif /* LIBXML_HTML_ENABLED */