blob: e72f41853c48e9caaebb628489193c7e59499267 [file] [log] [blame]
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef LIBXML_ZLIB_ENABLED
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
39#include <libxml/HTMLtree.h>
40#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
44#include <libxml/globals.h>
45#include <libxml/uri.h>
46
47#include "buf.h"
48#include "enc.h"
49
50#define HTML_MAX_NAMELEN 1000
51#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52#define HTML_PARSER_BUFFER_SIZE 100
53
54/* #define DEBUG */
55/* #define DEBUG_PUSH */
56
57static int htmlOmittedDefaultValue = 1;
58
59xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
61static void htmlParseComment(htmlParserCtxtPtr ctxt);
62
63/************************************************************************
64 * *
65 * Some factorized error routines *
66 * *
67 ************************************************************************/
68
69/**
70 * htmlErrMemory:
71 * @ctxt: an HTML parser context
Haibo Huangcfd91dc2020-07-30 23:01:33 -070072 * @extra: extra information
Elliott Hughes7fbecab2019-01-10 16:42:03 -080073 *
74 * Handle a redefinition of attribute error
75 */
76static void
77htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78{
79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
82 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96}
97
98/**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
108static void LIBXML_ATTR_FORMAT(3,0)
109htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111{
112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
115 if (ctxt != NULL)
116 ctxt->errNo = error;
117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
124}
125
126/**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
135static void LIBXML_ATTR_FORMAT(3,0)
136htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138{
139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
142 if (ctxt != NULL)
143 ctxt->errNo = error;
144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
149}
150
151/************************************************************************
152 * *
153 * Parser stacks related functions and macros *
154 * *
155 ************************************************************************/
156
157/**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
165 */
166static int
167htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168{
169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
175 ctxt->nameTab = (const xmlChar * *)
176 xmlRealloc((xmlChar * *)ctxt->nameTab,
177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
180 htmlErrMemory(ctxt, NULL);
181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187}
188/**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
196static const xmlChar *
197htmlnamePop(htmlParserCtxtPtr ctxt)
198{
199 const xmlChar *ret;
200
201 if (ctxt->nameNr <= 0)
202 return (NULL);
203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
205 return (NULL);
206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
211 ctxt->nameTab[ctxt->nameNr] = NULL;
212 return (ret);
213}
214
215/**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224static int
225htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226{
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243}
244
245/**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253static htmlParserNodeInfo *
254htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255{
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266}
267
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` (the parser
 * context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them:
 *
 *   CUR_PTR  the current pointer into the input being parsed.
 *   BASE_PTR the base pointer of the current input.
 *   CUR      the current input byte as an int. It should only be compared
 *            against ASCII values.
 *   NXT(n)   the n'th next input byte. Like CUR, it should only be used to
 *            compare against ASCII based substrings.
 *   UPPER    the current input byte converted to uppercase.
 *   UPP(n)   the n'th next input byte converted to uppercase. Like CUR, it
 *            should only be used to compare against ASCII based substrings.
 *   SKIP(n)  advance the input pointer and the column counter by n. It does
 *            no line accounting, so it must only skip ASCII text without
 *            newlines.
 *   SHRINK / GROW
 *            conditionally shrink / grow the input buffer. Both expand to
 *            a bare `if` statement: use them as a full statement and never
 *            directly in front of an `else`.
 *
 * Clean macros:
 *
 *   CURRENT  the current input byte as an int.
 *   RAW      -1 when a token is pending in the context, the current input
 *            byte otherwise.
 *   NEXT     skip to the next character through xmlNextChar().
 *   NEXTL(l) skip the current character, which is l bytes long, updating
 *            line/column and clearing the pending token.
 *   CUR_CHAR(l) / CUR_SCHAR(s, l)
 *            decode the current character of the input / of string s and
 *            store its byte length in l.
 *   COPY_BUF(l,b,i,v)
 *            append character v (l bytes long) to buffer b at index i and
 *            advance i. Expands to a bare `if`/`else` statement, see the
 *            note on SHRINK / GROW.
 *
 * Every macro argument is parenthesized in the expansion so that arbitrary
 * expressions can be passed safely.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l);				\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
348
349/**
350 * htmlFindEncoding:
351 * @the HTML parser context
352 *
353 * Ty to find and encoding in the current data available in the input
354 * buffer this is needed to try to switch to the proper encoding when
355 * one face a character error.
356 * That's an heuristic, since it's operating outside of parsing it could
357 * try to use a meta which had been commented out, that's the reason it
358 * should only be used in case of error, not as a default.
359 *
360 * Returns an encoding string or NULL if not found, the string need to
361 * be freed
362 */
363static xmlChar *
364htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365 const xmlChar *start, *cur, *end;
366
367 if ((ctxt == NULL) || (ctxt->input == NULL) ||
368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369 (ctxt->input->buf->encoder != NULL))
370 return(NULL);
371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372 return(NULL);
373
374 start = ctxt->input->cur;
375 end = ctxt->input->end;
376 /* we also expect the input buffer to be zero terminated */
377 if (*end != 0)
378 return(NULL);
379
380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381 if (cur == NULL)
382 return(NULL);
383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384 if (cur == NULL)
385 return(NULL);
386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387 if (cur == NULL)
388 return(NULL);
389 cur += 8;
390 start = cur;
391 while (((*cur >= 'A') && (*cur <= 'Z')) ||
392 ((*cur >= 'a') && (*cur <= 'z')) ||
393 ((*cur >= '0') && (*cur <= '9')) ||
394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395 cur++;
396 if (cur == start)
397 return(NULL);
398 return(xmlStrndup(start, cur - start));
399}
400
401/**
402 * htmlCurrentChar:
403 * @ctxt: the HTML parser context
404 * @len: pointer to the length of the char read
405 *
406 * The current char value, if using UTF-8 this may actually span multiple
407 * bytes in the input buffer. Implement the end of line normalization:
408 * 2.11 End-of-Line Handling
409 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410 * char, then the encoding converter is plugged in automatically.
411 *
412 * Returns the current char value and its length
413 */
414
415static int
416htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700417 const unsigned char *cur;
418 unsigned char c;
419 unsigned int val;
420
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800421 if (ctxt->instate == XML_PARSER_EOF)
422 return(0);
423
424 if (ctxt->token != 0) {
425 *len = 0;
426 return(ctxt->token);
427 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700428 if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800429 xmlChar * guess;
430 xmlCharEncodingHandlerPtr handler;
431
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700432 /*
433 * Assume it's a fixed length encoding (1) with
434 * a compatible encoding for the ASCII set, since
435 * HTML constructs only use < 128 chars
436 */
437 if ((int) *ctxt->input->cur < 0x80) {
438 *len = 1;
439 if ((*ctxt->input->cur == 0) &&
440 (ctxt->input->cur < ctxt->input->end)) {
441 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442 "Char 0x%X out of allowed range\n", 0);
443 return(' ');
444 }
445 return((int) *ctxt->input->cur);
446 }
447
448 /*
449 * Humm this is bad, do an automatic flow conversion
450 */
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800451 guess = htmlFindEncoding(ctxt);
452 if (guess == NULL) {
453 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
454 } else {
455 if (ctxt->input->encoding != NULL)
456 xmlFree((xmlChar *) ctxt->input->encoding);
457 ctxt->input->encoding = guess;
458 handler = xmlFindCharEncodingHandler((const char *) guess);
459 if (handler != NULL) {
Haibo Huang735158e2021-02-23 17:48:08 -0800460 /*
461 * Don't use UTF-8 encoder which isn't required and
462 * can produce invalid UTF-8.
463 */
464 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
465 xmlSwitchToEncoding(ctxt, handler);
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800466 } else {
467 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
468 "Unsupported encoding %s", guess, NULL);
469 }
470 }
471 ctxt->charset = XML_CHAR_ENCODING_UTF8;
472 }
473
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700474 /*
475 * We are supposed to handle UTF8, check it's valid
476 * From rfc2044: encoding of the Unicode values on UTF-8:
477 *
478 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
479 * 0000 0000-0000 007F 0xxxxxxx
480 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
481 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
482 *
483 * Check for the 0x110000 limit too
484 */
485 cur = ctxt->input->cur;
486 c = *cur;
487 if (c & 0x80) {
488 if ((c & 0x40) == 0)
489 goto encoding_error;
490 if (cur[1] == 0) {
491 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
492 cur = ctxt->input->cur;
493 }
494 if ((cur[1] & 0xc0) != 0x80)
495 goto encoding_error;
496 if ((c & 0xe0) == 0xe0) {
497
498 if (cur[2] == 0) {
499 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
500 cur = ctxt->input->cur;
501 }
502 if ((cur[2] & 0xc0) != 0x80)
503 goto encoding_error;
504 if ((c & 0xf0) == 0xf0) {
505 if (cur[3] == 0) {
506 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
507 cur = ctxt->input->cur;
508 }
509 if (((c & 0xf8) != 0xf0) ||
510 ((cur[3] & 0xc0) != 0x80))
511 goto encoding_error;
512 /* 4-byte code */
513 *len = 4;
514 val = (cur[0] & 0x7) << 18;
515 val |= (cur[1] & 0x3f) << 12;
516 val |= (cur[2] & 0x3f) << 6;
517 val |= cur[3] & 0x3f;
518 if (val < 0x10000)
519 goto encoding_error;
520 } else {
521 /* 3-byte code */
522 *len = 3;
523 val = (cur[0] & 0xf) << 12;
524 val |= (cur[1] & 0x3f) << 6;
525 val |= cur[2] & 0x3f;
526 if (val < 0x800)
527 goto encoding_error;
528 }
529 } else {
530 /* 2-byte code */
531 *len = 2;
532 val = (cur[0] & 0x1f) << 6;
533 val |= cur[1] & 0x3f;
534 if (val < 0x80)
535 goto encoding_error;
536 }
537 if (!IS_CHAR(val)) {
538 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
539 "Char 0x%X out of allowed range\n", val);
540 }
541 return(val);
542 } else {
543 if ((*ctxt->input->cur == 0) &&
544 (ctxt->input->cur < ctxt->input->end)) {
545 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
546 "Char 0x%X out of allowed range\n", 0);
547 *len = 1;
548 return(' ');
549 }
550 /* 1-byte code */
551 *len = 1;
552 return((int) *ctxt->input->cur);
553 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800554
555encoding_error:
556 /*
557 * If we detect an UTF8 error that probably mean that the
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700558 * input encoding didn't get properly advertised in the
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800559 * declaration header. Report the error and switch the encoding
560 * to ISO-Latin-1 (if you don't like this policy, just declare the
561 * encoding !)
562 */
563 {
564 char buffer[150];
565
566 if (ctxt->input->end - ctxt->input->cur >= 4) {
567 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
568 ctxt->input->cur[0], ctxt->input->cur[1],
569 ctxt->input->cur[2], ctxt->input->cur[3]);
570 } else {
571 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
572 }
573 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
574 "Input is not proper UTF-8, indicate encoding !\n",
575 BAD_CAST buffer, NULL);
576 }
577
Haibo Huang735158e2021-02-23 17:48:08 -0800578 /*
579 * Don't switch encodings twice. Note that if there's an encoder, we
580 * shouldn't receive invalid UTF-8 anyway.
581 *
582 * Note that if ctxt->input->buf == NULL, switching encodings is
583 * impossible, see Gitlab issue #34.
584 */
585 if ((ctxt->input->buf != NULL) &&
586 (ctxt->input->buf->encoder == NULL))
587 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800588 *len = 1;
589 return((int) *ctxt->input->cur);
590}
591
592/**
593 * htmlSkipBlankChars:
594 * @ctxt: the HTML parser context
595 *
596 * skip all blanks character found at that point in the input streams.
597 *
598 * Returns the number of space chars skipped
599 */
600
601static int
602htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
603 int res = 0;
604
605 while (IS_BLANK_CH(*(ctxt->input->cur))) {
606 if ((*ctxt->input->cur == 0) &&
607 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
608 xmlPopInput(ctxt);
609 } else {
610 if (*(ctxt->input->cur) == '\n') {
611 ctxt->input->line++; ctxt->input->col = 1;
612 } else ctxt->input->col++;
613 ctxt->input->cur++;
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800614 if (*ctxt->input->cur == 0)
615 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
616 }
617 res++;
618 }
619 return(res);
620}
621
622
623
624/************************************************************************
625 * *
626 * The list of HTML elements and their properties *
627 * *
628 ************************************************************************/
629
630/*
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700631 * Start Tag: 1 means the start tag can be omitted
632 * End Tag: 1 means the end tag can be omitted
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800633 * 2 means it's forbidden (empty elements)
634 * 3 means the tag is stylistic and should be closed easily
635 * Depr: this element is deprecated
636 * DTD: 1 means that this element is valid only in the Loose DTD
637 * 2 means that this element is valid only in the Frameset DTD
638 *
639 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
640 , subElements , impliedsubelt , Attributes, userdata
641 */
642
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each list macro expands to a comma separated run of element names and
 * has a matching NB_* macro giving the number of names. The NB_* sums are
 * parenthesized so that they can be used inside larger expressions.
 * PCDATA and MODIFIER expand to nothing.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* Every list below is terminated by a NULL entry. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
/* NULL terminated like every other list in this file. */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;

/* Cast used to store the const lists above in the element table. */
#define DECL (const char**)
794
/*
 * Static description of the HTML 4.0 elements known to the parser.
 *
 * htmlTagLookup() searches this array with bsearch() and a
 * case-insensitive comparison on the element name, so the entries
 * must stay sorted by name.
 *
 * NOTE(review): the meaning and order of the fields is given by the
 * htmlElemDesc structure, which is declared outside this part of the
 * file (presumably in libxml/HTMLparser.h) -- confirm there before
 * editing an entry.
 */
795static const htmlElemDesc
796html40ElementTable[] = {
797{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
798 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
799},
800{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
801 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
802},
803{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
804 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
805},
806{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
807 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
808},
809{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
810 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
811},
812{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
813 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
814},
815{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
816 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
817},
818{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
819 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
820},
821{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
822 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
823},
824{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
825 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
826},
827{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
828 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
829},
830{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
831 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
832},
833{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
834 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
835},
836{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
837 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
838},
839{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
840 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
841},
842{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
843 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
844},
845{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
846 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
847},
848{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
849 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
850},
851{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
852 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
853},
854{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
855 EMPTY , NULL , DECL col_attrs , NULL, NULL
856},
857{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
858 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
859},
860{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
861 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
862},
863{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
864 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
865},
866{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
867 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
868},
869{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
870 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
871},
872{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
873 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
874},
875{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
876 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
877},
878{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
879 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
880},
881{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
882 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
883},
884{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
885 EMPTY, NULL, DECL embed_attrs, NULL, NULL
886},
887{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
888 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
889},
890{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
891 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
892},
893{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
894 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
895},
896{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
897 EMPTY, NULL, NULL, DECL frame_attrs, NULL
898},
899{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
900 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
901},
902{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
903 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
904},
905{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
906 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
907},
908{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
909 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
910},
911{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
912 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
913},
914{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
915 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
916},
917{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
918 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
919},
920{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
921 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
922},
923{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
924 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
925},
926{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
927 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
928},
929{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
930 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
931},
932{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
933 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
934},
935{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
936 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
937},
938{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
939 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
940},
941{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
942 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
943},
944{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
945 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
946},
947{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
948 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
949},
950{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
951 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
952},
953{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
954 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
955},
956{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
957 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
958},
959{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
960 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
961},
962{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
963 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
964},
965{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
966 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
967},
968{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
969 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
970},
971{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
972 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
973},
974{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
975 DECL html_flow, "div", DECL html_attrs, NULL, NULL
976},
977{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
978 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
979},
980{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
981 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
982},
983{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
984 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
985},
986{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
987 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
988},
989{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
990 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
991},
992{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
993 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
994},
995{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
996 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
997},
998{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
999 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
1000},
1001{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
1002 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1003},
1004{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
1005 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1006},
1007{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
1008 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
1009},
1010{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
1011 DECL select_content, NULL, DECL select_attrs, NULL, NULL
1012},
1013{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
1014 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1015},
1016{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1017 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018},
1019{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
1020 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1021},
1022{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1023 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1024},
1025{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1026 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1027},
1028{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1029 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1030},
1031{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1032 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033},
1034{ "table", 0, 0, 0, 0, 0, 0, 0, "",
1035 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1036},
1037{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1038 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1039},
1040{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1041 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1042},
1043{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1044 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1045},
1046{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1047 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1048},
1049{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1050 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1051},
1052{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1053 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1054},
1055{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1056 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1057},
1058{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1059 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1060},
1061{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1062 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1063},
1064{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1065 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1066},
1067{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1068 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1069},
1070{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1071 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1072}
1073};
1074
/*
 * One auto-close rule: oldTag is the name of an element that is
 * currently open, newTag the name of a start tag that implicitly
 * closes it.
 */
Elliott Hughes5cefca72021-05-06 13:23:15 -07001075typedef struct {
1076 const char *oldTag;
1077 const char *newTag;
1078} htmlStartCloseEntry;
1079
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001080/*
1081 * start tags that imply the end of current element
1082 */
/*
 * htmlCheckAutoClose() looks a pair up with bsearch() through
 * htmlCompareStartClose(), i.e. strcmp() on oldTag and then on newTag.
 * The entries must therefore stay sorted in that order when new rules
 * are added.
 */
Elliott Hughes5cefca72021-05-06 13:23:15 -07001083static const htmlStartCloseEntry htmlStartClose[] = {
1084 { "a", "a" },
1085 { "a", "fieldset" },
1086 { "a", "table" },
1087 { "a", "td" },
1088 { "a", "th" },
1089 { "address", "dd" },
1090 { "address", "dl" },
1091 { "address", "dt" },
1092 { "address", "form" },
1093 { "address", "li" },
1094 { "address", "ul" },
1095 { "b", "center" },
1096 { "b", "p" },
1097 { "b", "td" },
1098 { "b", "th" },
1099 { "big", "p" },
1100 { "caption", "col" },
1101 { "caption", "colgroup" },
1102 { "caption", "tbody" },
1103 { "caption", "tfoot" },
1104 { "caption", "thead" },
1105 { "caption", "tr" },
1106 { "col", "col" },
1107 { "col", "colgroup" },
1108 { "col", "tbody" },
1109 { "col", "tfoot" },
1110 { "col", "thead" },
1111 { "col", "tr" },
1112 { "colgroup", "colgroup" },
1113 { "colgroup", "tbody" },
1114 { "colgroup", "tfoot" },
1115 { "colgroup", "thead" },
1116 { "colgroup", "tr" },
1117 { "dd", "dt" },
1118 { "dir", "dd" },
1119 { "dir", "dl" },
1120 { "dir", "dt" },
1121 { "dir", "form" },
1122 { "dir", "ul" },
1123 { "dl", "form" },
1124 { "dl", "li" },
1125 { "dt", "dd" },
1126 { "dt", "dl" },
1127 { "font", "center" },
1128 { "font", "td" },
1129 { "font", "th" },
1130 { "form", "form" },
1131 { "h1", "fieldset" },
1132 { "h1", "form" },
1133 { "h1", "li" },
1134 { "h1", "p" },
1135 { "h1", "table" },
1136 { "h2", "fieldset" },
1137 { "h2", "form" },
1138 { "h2", "li" },
1139 { "h2", "p" },
1140 { "h2", "table" },
1141 { "h3", "fieldset" },
1142 { "h3", "form" },
1143 { "h3", "li" },
1144 { "h3", "p" },
1145 { "h3", "table" },
1146 { "h4", "fieldset" },
1147 { "h4", "form" },
1148 { "h4", "li" },
1149 { "h4", "p" },
1150 { "h4", "table" },
1151 { "h5", "fieldset" },
1152 { "h5", "form" },
1153 { "h5", "li" },
1154 { "h5", "p" },
1155 { "h5", "table" },
1156 { "h6", "fieldset" },
1157 { "h6", "form" },
1158 { "h6", "li" },
1159 { "h6", "p" },
1160 { "h6", "table" },
1161 { "head", "a" },
1162 { "head", "abbr" },
1163 { "head", "acronym" },
1164 { "head", "address" },
1165 { "head", "b" },
1166 { "head", "bdo" },
1167 { "head", "big" },
1168 { "head", "blockquote" },
1169 { "head", "body" },
1170 { "head", "br" },
1171 { "head", "center" },
1172 { "head", "cite" },
1173 { "head", "code" },
1174 { "head", "dd" },
1175 { "head", "dfn" },
1176 { "head", "dir" },
1177 { "head", "div" },
1178 { "head", "dl" },
1179 { "head", "dt" },
1180 { "head", "em" },
1181 { "head", "fieldset" },
1182 { "head", "font" },
1183 { "head", "form" },
1184 { "head", "frameset" },
1185 { "head", "h1" },
1186 { "head", "h2" },
1187 { "head", "h3" },
1188 { "head", "h4" },
1189 { "head", "h5" },
1190 { "head", "h6" },
1191 { "head", "hr" },
1192 { "head", "i" },
1193 { "head", "iframe" },
1194 { "head", "img" },
1195 { "head", "kbd" },
1196 { "head", "li" },
1197 { "head", "listing" },
1198 { "head", "map" },
1199 { "head", "menu" },
1200 { "head", "ol" },
1201 { "head", "p" },
1202 { "head", "pre" },
1203 { "head", "q" },
1204 { "head", "s" },
1205 { "head", "samp" },
1206 { "head", "small" },
1207 { "head", "span" },
1208 { "head", "strike" },
1209 { "head", "strong" },
1210 { "head", "sub" },
1211 { "head", "sup" },
1212 { "head", "table" },
1213 { "head", "tt" },
1214 { "head", "u" },
1215 { "head", "ul" },
1216 { "head", "var" },
1217 { "head", "xmp" },
1218 { "hr", "form" },
1219 { "i", "center" },
1220 { "i", "p" },
1221 { "i", "td" },
1222 { "i", "th" },
1223 { "legend", "fieldset" },
1224 { "li", "li" },
1225 { "link", "body" },
1226 { "link", "frameset" },
1227 { "listing", "dd" },
1228 { "listing", "dl" },
1229 { "listing", "dt" },
1230 { "listing", "fieldset" },
1231 { "listing", "form" },
1232 { "listing", "li" },
1233 { "listing", "table" },
1234 { "listing", "ul" },
1235 { "menu", "dd" },
1236 { "menu", "dl" },
1237 { "menu", "dt" },
1238 { "menu", "form" },
1239 { "menu", "ul" },
1240 { "ol", "form" },
1241 { "ol", "ul" },
1242 { "option", "optgroup" },
1243 { "option", "option" },
1244 { "p", "address" },
1245 { "p", "blockquote" },
1246 { "p", "body" },
1247 { "p", "caption" },
1248 { "p", "center" },
1249 { "p", "col" },
1250 { "p", "colgroup" },
1251 { "p", "dd" },
1252 { "p", "dir" },
1253 { "p", "div" },
1254 { "p", "dl" },
1255 { "p", "dt" },
1256 { "p", "fieldset" },
1257 { "p", "form" },
1258 { "p", "frameset" },
1259 { "p", "h1" },
1260 { "p", "h2" },
1261 { "p", "h3" },
1262 { "p", "h4" },
1263 { "p", "h5" },
1264 { "p", "h6" },
1265 { "p", "head" },
1266 { "p", "hr" },
1267 { "p", "li" },
1268 { "p", "listing" },
1269 { "p", "menu" },
1270 { "p", "ol" },
1271 { "p", "p" },
1272 { "p", "pre" },
1273 { "p", "table" },
1274 { "p", "tbody" },
1275 { "p", "td" },
1276 { "p", "tfoot" },
1277 { "p", "th" },
1278 { "p", "title" },
1279 { "p", "tr" },
1280 { "p", "ul" },
1281 { "p", "xmp" },
1282 { "pre", "dd" },
1283 { "pre", "dl" },
1284 { "pre", "dt" },
1285 { "pre", "fieldset" },
1286 { "pre", "form" },
1287 { "pre", "li" },
1288 { "pre", "table" },
1289 { "pre", "ul" },
1290 { "s", "p" },
1291 { "script", "noscript" },
1292 { "small", "p" },
1293 { "span", "td" },
1294 { "span", "th" },
1295 { "strike", "p" },
1296 { "style", "body" },
1297 { "style", "frameset" },
1298 { "tbody", "tbody" },
1299 { "tbody", "tfoot" },
1300 { "td", "tbody" },
1301 { "td", "td" },
1302 { "td", "tfoot" },
1303 { "td", "th" },
1304 { "td", "tr" },
1305 { "tfoot", "tbody" },
1306 { "th", "tbody" },
1307 { "th", "td" },
1308 { "th", "tfoot" },
1309 { "th", "th" },
1310 { "th", "tr" },
1311 { "thead", "tbody" },
1312 { "thead", "tfoot" },
1313 { "title", "body" },
1314 { "title", "frameset" },
1315 { "tr", "tbody" },
1316 { "tr", "tfoot" },
1317 { "tr", "tr" },
1318 { "tt", "p" },
1319 { "u", "p" },
1320 { "u", "td" },
1321 { "u", "th" },
1322 { "ul", "address" },
1323 { "ul", "form" },
1324 { "ul", "menu" },
1325 { "ul", "ol" },
1326 { "ul", "pre" },
1327 { "xmp", "dd" },
1328 { "xmp", "dl" },
1329 { "xmp", "dt" },
1330 { "xmp", "fieldset" },
1331 { "xmp", "form" },
1332 { "xmp", "li" },
1333 { "xmp", "table" },
1334 { "xmp", "ul" }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001335};
1336
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly: htmlCheckParagraph() implies a p element
 * when characters show up while one of them is the current element.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1349
/*
 * The HTML attributes whose content is of type %Script;.
 * The array has no terminator, its length is taken with sizeof.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form events */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1375
/*
 * Priorities used by the parser to cope with broken HTML pages: an end
 * tag may only close open elements whose priority is lower than or equal
 * to its own, which lets the parser decide what to do with extra end
 * tags.
 */

typedef struct {
    const char *name;   /* element name, NULL in the last entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1403
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001404/************************************************************************
1405 * *
1406 * functions to handle HTML specific data *
1407 * *
1408 ************************************************************************/
1409
/**
 * htmlInitAutoClose:
 *
 * This function does nothing anymore: its body is empty.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
}
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001418
Elliott Hughes5cefca72021-05-06 13:23:15 -07001419static int
1420htmlCompareTags(const void *key, const void *member) {
1421 const xmlChar *tag = (const xmlChar *) key;
1422 const htmlElemDesc *desc = (const htmlElemDesc *) member;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001423
Elliott Hughes5cefca72021-05-06 13:23:15 -07001424 return(xmlStrcasecmp(tag, BAD_CAST desc->name));
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001425}
1426
1427/**
1428 * htmlTagLookup:
1429 * @tag: The tag name in lowercase
1430 *
1431 * Lookup the HTML tag in the ElementTable
1432 *
1433 * Returns the related htmlElemDescPtr or NULL if not found.
1434 */
1435const htmlElemDesc *
1436htmlTagLookup(const xmlChar *tag) {
Elliott Hughes5cefca72021-05-06 13:23:15 -07001437 if (tag == NULL)
1438 return(NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001439
Elliott Hughes5cefca72021-05-06 13:23:15 -07001440 return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1441 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1442 sizeof(htmlElemDesc), htmlCompareTags));
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001443}
1444
1445/**
1446 * htmlGetEndPriority:
1447 * @name: The name of the element to look up the priority for.
1448 *
1449 * Return value: The "endtag" priority.
1450 **/
1451static int
1452htmlGetEndPriority (const xmlChar *name) {
1453 int i = 0;
1454
1455 while ((htmlEndPriority[i].name != NULL) &&
1456 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1457 i++;
1458
1459 return(htmlEndPriority[i].priority);
1460}
1461
1462
Elliott Hughes5cefca72021-05-06 13:23:15 -07001463static int
1464htmlCompareStartClose(const void *vkey, const void *member) {
1465 const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1466 const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1467 int ret;
1468
1469 ret = strcmp(key->oldTag, entry->oldTag);
1470 if (ret == 0)
1471 ret = strcmp(key->newTag, entry->newTag);
1472
1473 return(ret);
1474}
1475
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001476/**
1477 * htmlCheckAutoClose:
1478 * @newtag: The new tag name
1479 * @oldtag: The old tag name
1480 *
1481 * Checks whether the new tag is one of the registered valid tags for
1482 * closing old.
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001483 *
1484 * Returns 0 if no, 1 if yes.
1485 */
1486static int
1487htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1488{
Elliott Hughes5cefca72021-05-06 13:23:15 -07001489 htmlStartCloseEntry key;
1490 void *res;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001491
Elliott Hughes5cefca72021-05-06 13:23:15 -07001492 key.oldTag = (const char *) oldtag;
1493 key.newTag = (const char *) newtag;
1494 res = bsearch(&key, htmlStartClose,
1495 sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1496 sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1497 return(res != NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001498}
1499
1500/**
1501 * htmlAutoCloseOnClose:
1502 * @ctxt: an HTML parser context
1503 * @newtag: The new tag name
1504 * @force: force the tag closure
1505 *
1506 * The HTML DTD allows an ending tag to implicitly close other tags.
1507 */
1508static void
1509htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1510{
1511 const htmlElemDesc *info;
1512 int i, priority;
1513
1514 priority = htmlGetEndPriority(newtag);
1515
1516 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1517
1518 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1519 break;
1520 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07001521 * A misplaced endtag can only close elements with lower
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001522 * or equal priority, so if we find an element with higher
1523 * priority before we find an element with
1524 * matching name, we just ignore this endtag
1525 */
1526 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1527 return;
1528 }
1529 if (i < 0)
1530 return;
1531
1532 while (!xmlStrEqual(newtag, ctxt->name)) {
1533 info = htmlTagLookup(ctxt->name);
1534 if ((info != NULL) && (info->endTag == 3)) {
1535 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1536 "Opening and ending tag mismatch: %s and %s\n",
1537 newtag, ctxt->name);
1538 }
1539 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1540 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1541 htmlnamePop(ctxt);
1542 }
1543}
1544
1545/**
1546 * htmlAutoCloseOnEnd:
1547 * @ctxt: an HTML parser context
1548 *
1549 * Close all remaining tags at the end of the stream
1550 */
1551static void
1552htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1553{
1554 int i;
1555
1556 if (ctxt->nameNr == 0)
1557 return;
1558 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1559 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1560 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1561 htmlnamePop(ctxt);
1562 }
1563}
1564
1565/**
1566 * htmlAutoClose:
1567 * @ctxt: an HTML parser context
1568 * @newtag: The new tag name or NULL
1569 *
1570 * The HTML DTD allows a tag to implicitly close other tags.
1571 * The list is kept in htmlStartClose array. This function is
1572 * called when a new tag has been detected and generates the
1573 * appropriates closes if possible/needed.
1574 * If newtag is NULL this mean we are at the end of the resource
1575 * and we should check
1576 */
1577static void
1578htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1579{
1580 while ((newtag != NULL) && (ctxt->name != NULL) &&
1581 (htmlCheckAutoClose(newtag, ctxt->name))) {
1582 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584 htmlnamePop(ctxt);
1585 }
1586 if (newtag == NULL) {
1587 htmlAutoCloseOnEnd(ctxt);
1588 return;
1589 }
1590 while ((newtag == NULL) && (ctxt->name != NULL) &&
1591 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1592 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1593 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1594 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1595 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1596 htmlnamePop(ctxt);
1597 }
1598}
1599
1600/**
1601 * htmlAutoCloseTag:
1602 * @doc: the HTML document
1603 * @name: The tag name
1604 * @elem: the HTML element
1605 *
1606 * The HTML DTD allows a tag to implicitly close other tags.
1607 * The list is kept in htmlStartClose array. This function checks
1608 * if the element or one of it's children would autoclose the
1609 * given tag.
1610 *
1611 * Returns 1 if autoclose, 0 otherwise
1612 */
1613int
1614htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1615 htmlNodePtr child;
1616
1617 if (elem == NULL) return(1);
1618 if (xmlStrEqual(name, elem->name)) return(0);
1619 if (htmlCheckAutoClose(elem->name, name)) return(1);
1620 child = elem->children;
1621 while (child != NULL) {
1622 if (htmlAutoCloseTag(doc, name, child)) return(1);
1623 child = child->next;
1624 }
1625 return(0);
1626}
1627
1628/**
1629 * htmlIsAutoClosed:
1630 * @doc: the HTML document
1631 * @elem: the HTML element
1632 *
1633 * The HTML DTD allows a tag to implicitly close other tags.
1634 * The list is kept in htmlStartClose array. This function checks
1635 * if a tag is autoclosed by one of it's child
1636 *
1637 * Returns 1 if autoclosed, 0 otherwise
1638 */
1639int
1640htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1641 htmlNodePtr child;
1642
1643 if (elem == NULL) return(1);
1644 child = elem->children;
1645 while (child != NULL) {
1646 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1647 child = child->next;
1648 }
1649 return(0);
1650}
1651
1652/**
1653 * htmlCheckImplied:
1654 * @ctxt: an HTML parser context
1655 * @newtag: The new tag name
1656 *
1657 * The HTML DTD allows a tag to exists only implicitly
1658 * called when a new tag has been detected and generates the
1659 * appropriates implicit tags if missing
1660 */
1661static void
1662htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1663 int i;
1664
1665 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1666 return;
1667 if (!htmlOmittedDefaultValue)
1668 return;
1669 if (xmlStrEqual(newtag, BAD_CAST"html"))
1670 return;
1671 if (ctxt->nameNr <= 0) {
1672 htmlnamePush(ctxt, BAD_CAST"html");
1673 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1674 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1675 }
1676 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1677 return;
1678 if ((ctxt->nameNr <= 1) &&
1679 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1680 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1681 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1682 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1683 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1684 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1685 if (ctxt->html >= 3) {
1686 /* we already saw or generated an <head> before */
1687 return;
1688 }
1689 /*
1690 * dropped OBJECT ... i you put it first BODY will be
1691 * assumed !
1692 */
1693 htmlnamePush(ctxt, BAD_CAST"head");
1694 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1695 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1696 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1697 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1698 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1699 if (ctxt->html >= 10) {
1700 /* we already saw or generated a <body> before */
1701 return;
1702 }
1703 for (i = 0;i < ctxt->nameNr;i++) {
1704 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1705 return;
1706 }
1707 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1708 return;
1709 }
1710 }
1711
1712 htmlnamePush(ctxt, BAD_CAST"body");
1713 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1714 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1715 }
1716}
1717
1718/**
1719 * htmlCheckParagraph
1720 * @ctxt: an HTML parser context
1721 *
1722 * Check whether a p element need to be implied before inserting
1723 * characters in the current element.
1724 *
1725 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1726 * in case of error.
1727 */
1728
1729static int
1730htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1731 const xmlChar *tag;
1732 int i;
1733
1734 if (ctxt == NULL)
1735 return(-1);
1736 tag = ctxt->name;
1737 if (tag == NULL) {
1738 htmlAutoClose(ctxt, BAD_CAST"p");
1739 htmlCheckImplied(ctxt, BAD_CAST"p");
1740 htmlnamePush(ctxt, BAD_CAST"p");
1741 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1742 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1743 return(1);
1744 }
1745 if (!htmlOmittedDefaultValue)
1746 return(0);
1747 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1748 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1749 htmlAutoClose(ctxt, BAD_CAST"p");
1750 htmlCheckImplied(ctxt, BAD_CAST"p");
1751 htmlnamePush(ctxt, BAD_CAST"p");
1752 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1753 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1754 return(1);
1755 }
1756 }
1757 return(0);
1758}
1759
1760/**
1761 * htmlIsScriptAttribute:
1762 * @name: an attribute name
1763 *
1764 * Check if an attribute is of content type Script
1765 *
1766 * Returns 1 is the attribute is a script 0 otherwise
1767 */
1768int
1769htmlIsScriptAttribute(const xmlChar *name) {
1770 unsigned int i;
1771
1772 if (name == NULL)
1773 return(0);
1774 /*
1775 * all script attributes start with 'on'
1776 */
1777 if ((name[0] != 'o') || (name[1] != 'n'))
1778 return(0);
1779 for (i = 0;
1780 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1781 i++) {
1782 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1783 return(1);
1784 }
1785 return(0);
1786}
1787
1788/************************************************************************
1789 * *
1790 * The list of HTML predefined entities *
1791 * *
1792 ************************************************************************/
1793
1794
/*
 * Table of the named character entities known to the HTML parser.
 * Each row holds the Unicode code point, the entity name and a
 * descriptive text.
 *
 * The rows are listed in ascending code point order.
 * htmlEntityValueLookup() depends on that order (it gives up as soon as it
 * meets a row whose value is larger than the one searched for), so any new
 * row has to be inserted at its sorted position.
 */
static const htmlEntityDesc html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound","pound sign, U+00A3 ISOnum" },
{ 164, "curren","currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro","micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide","division sign, U+00F7 ISOnum" },
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

/* Latin letters and ligatures above U+00FF */
{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references
 */
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde","small tilde, U+02DC ISOdia" },

/* Greek capital letters */
{ 913, "Alpha","greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa","greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron","greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

/* Greek small letters and symbols */
{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

/* Spaces, joiners, directional marks, dashes and quotation marks */
{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

/* Letter-like symbols and arrows */
{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

/* Mathematical operators and technical symbols */
{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

/* Card suits */
{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
2077
2078/************************************************************************
2079 * *
2080 * Commodity functions to handle entities *
2081 * *
2082 ************************************************************************/
2083
/*
 * growBuffer:
 *
 * Double the capacity of the xmlChar array @buffer. The capacity is kept
 * in the companion variable named <buffer>_size, which is updated first.
 *
 * The macro expects a parser context called `ctxt` to be in scope.
 * If the reallocation fails the error is reported through htmlErrMemory(),
 * the old array is released and the *enclosing function* returns NULL.
 */
#define growBuffer(buffer) {                                                \
    xmlChar *grown;                                                         \
    buffer##_size *= 2;                                                     \
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (grown != NULL) {                                                    \
        buffer = grown;                                                     \
    } else {                                                                \
        htmlErrMemory(ctxt, "growing buffer\n");                            \
        xmlFree(buffer);                                                    \
        return(NULL);                                                       \
    }                                                                       \
}
2098
2099/**
2100 * htmlEntityLookup:
2101 * @name: the entity name
2102 *
2103 * Lookup the given entity in EntitiesTable
2104 *
2105 * TODO: the linear scan is really ugly, an hash table is really needed.
2106 *
2107 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2108 */
2109const htmlEntityDesc *
2110htmlEntityLookup(const xmlChar *name) {
2111 unsigned int i;
2112
2113 for (i = 0;i < (sizeof(html40EntitiesTable)/
2114 sizeof(html40EntitiesTable[0]));i++) {
2115 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2116 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2117 }
2118 }
2119 return(NULL);
2120}
2121
2122/**
2123 * htmlEntityValueLookup:
2124 * @value: the entity's unicode value
2125 *
2126 * Lookup the given entity in EntitiesTable
2127 *
2128 * TODO: the linear scan is really ugly, an hash table is really needed.
2129 *
2130 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2131 */
2132const htmlEntityDesc *
2133htmlEntityValueLookup(unsigned int value) {
2134 unsigned int i;
2135
2136 for (i = 0;i < (sizeof(html40EntitiesTable)/
2137 sizeof(html40EntitiesTable[0]));i++) {
2138 if (html40EntitiesTable[i].value >= value) {
2139 if (html40EntitiesTable[i].value > value)
2140 break;
2141 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2142 }
2143 }
2144 return(NULL);
2145}
2146
2147/**
2148 * UTF8ToHtml:
2149 * @out: a pointer to an array of bytes to store the result
2150 * @outlen: the length of @out
2151 * @in: a pointer to an array of UTF-8 chars
2152 * @inlen: the length of @in
2153 *
2154 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2155 * plus HTML entities block of chars out.
2156 *
2157 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2158 * The value of @inlen after return is the number of octets consumed
2159 * as the return value is positive, else unpredictable.
2160 * The value of @outlen after return is the number of octets consumed.
2161 */
2162int
2163UTF8ToHtml(unsigned char* out, int *outlen,
2164 const unsigned char* in, int *inlen) {
2165 const unsigned char* processed = in;
2166 const unsigned char* outend;
2167 const unsigned char* outstart = out;
2168 const unsigned char* instart = in;
2169 const unsigned char* inend;
2170 unsigned int c, d;
2171 int trailing;
2172
2173 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2174 if (in == NULL) {
2175 /*
2176 * initialization nothing to do
2177 */
2178 *outlen = 0;
2179 *inlen = 0;
2180 return(0);
2181 }
2182 inend = in + (*inlen);
2183 outend = out + (*outlen);
2184 while (in < inend) {
2185 d = *in++;
2186 if (d < 0x80) { c= d; trailing= 0; }
2187 else if (d < 0xC0) {
2188 /* trailing byte in leading position */
2189 *outlen = out - outstart;
2190 *inlen = processed - instart;
2191 return(-2);
2192 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2193 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2194 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2195 else {
2196 /* no chance for this in Ascii */
2197 *outlen = out - outstart;
2198 *inlen = processed - instart;
2199 return(-2);
2200 }
2201
2202 if (inend - in < trailing) {
2203 break;
2204 }
2205
2206 for ( ; trailing; trailing--) {
2207 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2208 break;
2209 c <<= 6;
2210 c |= d & 0x3F;
2211 }
2212
2213 /* assertion: c is a single UTF-4 value */
2214 if (c < 0x80) {
2215 if (out + 1 >= outend)
2216 break;
2217 *out++ = c;
2218 } else {
2219 int len;
2220 const htmlEntityDesc * ent;
2221 const char *cp;
2222 char nbuf[16];
2223
2224 /*
2225 * Try to lookup a predefined HTML entity for it
2226 */
2227
2228 ent = htmlEntityValueLookup(c);
2229 if (ent == NULL) {
2230 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2231 cp = nbuf;
2232 }
2233 else
2234 cp = ent->name;
2235 len = strlen(cp);
2236 if (out + 2 + len >= outend)
2237 break;
2238 *out++ = '&';
2239 memcpy(out, cp, len);
2240 out += len;
2241 *out++ = ';';
2242 }
2243 processed = in;
2244 }
2245 *outlen = out - outstart;
2246 *inlen = processed - instart;
2247 return(0);
2248}
2249
2250/**
2251 * htmlEncodeEntities:
2252 * @out: a pointer to an array of bytes to store the result
2253 * @outlen: the length of @out
2254 * @in: a pointer to an array of UTF-8 chars
2255 * @inlen: the length of @in
2256 * @quoteChar: the quote character to escape (' or ") or zero.
2257 *
2258 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2259 * plus HTML entities block of chars out.
2260 *
2261 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2262 * The value of @inlen after return is the number of octets consumed
2263 * as the return value is positive, else unpredictable.
2264 * The value of @outlen after return is the number of octets consumed.
2265 */
2266int
2267htmlEncodeEntities(unsigned char* out, int *outlen,
2268 const unsigned char* in, int *inlen, int quoteChar) {
2269 const unsigned char* processed = in;
2270 const unsigned char* outend;
2271 const unsigned char* outstart = out;
2272 const unsigned char* instart = in;
2273 const unsigned char* inend;
2274 unsigned int c, d;
2275 int trailing;
2276
2277 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2278 return(-1);
2279 outend = out + (*outlen);
2280 inend = in + (*inlen);
2281 while (in < inend) {
2282 d = *in++;
2283 if (d < 0x80) { c= d; trailing= 0; }
2284 else if (d < 0xC0) {
2285 /* trailing byte in leading position */
2286 *outlen = out - outstart;
2287 *inlen = processed - instart;
2288 return(-2);
2289 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2290 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2291 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2292 else {
2293 /* no chance for this in Ascii */
2294 *outlen = out - outstart;
2295 *inlen = processed - instart;
2296 return(-2);
2297 }
2298
2299 if (inend - in < trailing)
2300 break;
2301
2302 while (trailing--) {
2303 if (((d= *in++) & 0xC0) != 0x80) {
2304 *outlen = out - outstart;
2305 *inlen = processed - instart;
2306 return(-2);
2307 }
2308 c <<= 6;
2309 c |= d & 0x3F;
2310 }
2311
2312 /* assertion: c is a single UTF-4 value */
2313 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2314 (c != '&') && (c != '<') && (c != '>')) {
2315 if (out >= outend)
2316 break;
2317 *out++ = c;
2318 } else {
2319 const htmlEntityDesc * ent;
2320 const char *cp;
2321 char nbuf[16];
2322 int len;
2323
2324 /*
2325 * Try to lookup a predefined HTML entity for it
2326 */
2327 ent = htmlEntityValueLookup(c);
2328 if (ent == NULL) {
2329 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2330 cp = nbuf;
2331 }
2332 else
2333 cp = ent->name;
2334 len = strlen(cp);
2335 if (out + 2 + len > outend)
2336 break;
2337 *out++ = '&';
2338 memcpy(out, cp, len);
2339 out += len;
2340 *out++ = ';';
2341 }
2342 processed = in;
2343 }
2344 *outlen = out - outstart;
2345 *inlen = processed - instart;
2346 return(0);
2347}
2348
2349/************************************************************************
2350 * *
2351 * Commodity functions to handle streams *
2352 * *
2353 ************************************************************************/
2354
#ifdef LIBXML_PUSH_ENABLED
/**
 * htmlNewInputStream:
 * @ctxt:  an HTML parser context
 *
 * Allocate a parser input structure and reset it: every field is cleared,
 * except the position which starts at line 1, column 1.
 * An allocation failure is reported through htmlErrMemory().
 *
 * Returns the new input stream or NULL
 */
static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
    htmlParserInputPtr stream;

    stream = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
    if (stream == NULL) {
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
        return(NULL);
    }
    memset(stream, 0, sizeof(htmlParserInput));

    /* pointers */
    stream->filename = NULL;
    stream->directory = NULL;
    stream->base = NULL;
    stream->cur = NULL;
    stream->buf = NULL;
    stream->free = NULL;
    stream->version = NULL;

    /* position and counters */
    stream->line = 1;
    stream->col = 1;
    stream->consumed = 0;
    stream->length = 0;

    return(stream);
}
#endif
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002388
2389
2390/************************************************************************
2391 * *
2392 * Commodity functions, cleanup needed ? *
2393 * *
2394 ************************************************************************/
/*
 * Names of all the elements allowing PCDATA, taken from the HTML 4.01
 * loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2409
2410/**
2411 * areBlanks:
2412 * @ctxt: an HTML parser context
2413 * @str: a xmlChar *
2414 * @len: the size of @str
2415 *
2416 * Is this a sequence of blank chars that one can ignore ?
2417 *
2418 * Returns 1 if ignorable 0 otherwise.
2419 */
2420
2421static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2422 unsigned int i;
2423 int j;
2424 xmlNodePtr lastChild;
2425 xmlDtdPtr dtd;
2426
2427 for (j = 0;j < len;j++)
2428 if (!(IS_BLANK_CH(str[j]))) return(0);
2429
2430 if (CUR == 0) return(1);
2431 if (CUR != '<') return(0);
2432 if (ctxt->name == NULL)
2433 return(1);
2434 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2435 return(1);
2436 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2437 return(1);
2438
2439 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2440 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2441 dtd = xmlGetIntSubset(ctxt->myDoc);
2442 if (dtd != NULL && dtd->ExternalID != NULL) {
2443 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2444 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2445 return(1);
2446 }
2447 }
2448
2449 if (ctxt->node == NULL) return(0);
2450 lastChild = xmlGetLastChild(ctxt->node);
2451 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2452 lastChild = lastChild->prev;
2453 if (lastChild == NULL) {
2454 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2455 (ctxt->node->content != NULL)) return(0);
2456 /* keep ws in constructs like ...<b> </b>...
2457 for all tags "b" allowing PCDATA */
2458 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2459 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2460 return(0);
2461 }
2462 }
2463 } else if (xmlNodeIsText(lastChild)) {
2464 return(0);
2465 } else {
2466 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2467 for all tags "p" allowing PCDATA */
2468 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2469 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2470 return(0);
2471 }
2472 }
2473 }
2474 return(1);
2475}
2476
2477/**
2478 * htmlNewDocNoDtD:
2479 * @URI: URI for the dtd, or NULL
2480 * @ExternalID: the external ID of the DTD, or NULL
2481 *
2482 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2483 * are NULL
2484 *
2485 * Returns a new document, do not initialize the DTD if not provided
2486 */
2487htmlDocPtr
2488htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2489 xmlDocPtr cur;
2490
2491 /*
2492 * Allocate a new document and fill the fields.
2493 */
2494 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2495 if (cur == NULL) {
2496 htmlErrMemory(NULL, "HTML document creation failed\n");
2497 return(NULL);
2498 }
2499 memset(cur, 0, sizeof(xmlDoc));
2500
2501 cur->type = XML_HTML_DOCUMENT_NODE;
2502 cur->version = NULL;
2503 cur->intSubset = NULL;
2504 cur->doc = cur;
2505 cur->name = NULL;
2506 cur->children = NULL;
2507 cur->extSubset = NULL;
2508 cur->oldNs = NULL;
2509 cur->encoding = NULL;
2510 cur->standalone = 1;
2511 cur->compression = 0;
2512 cur->ids = NULL;
2513 cur->refs = NULL;
2514 cur->_private = NULL;
2515 cur->charset = XML_CHAR_ENCODING_UTF8;
2516 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2517 if ((ExternalID != NULL) ||
2518 (URI != NULL))
2519 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Elliott Hughesecdab2a2022-02-23 14:33:50 -08002520 if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2521 xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002522 return(cur);
2523}
2524
2525/**
2526 * htmlNewDoc:
2527 * @URI: URI for the dtd, or NULL
2528 * @ExternalID: the external ID of the DTD, or NULL
2529 *
2530 * Creates a new HTML document
2531 *
2532 * Returns a new document
2533 */
2534htmlDocPtr
2535htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2536 if ((URI == NULL) && (ExternalID == NULL))
2537 return(htmlNewDocNoDtD(
2538 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2539 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2540
2541 return(htmlNewDocNoDtD(URI, ExternalID));
2542}
2543
2544
2545/************************************************************************
2546 * *
2547 * The parser itself *
2548 * Relates to http://www.w3.org/TR/html40 *
2549 * *
2550 ************************************************************************/
2551
2552/************************************************************************
2553 * *
2554 * The parser itself *
2555 * *
2556 ************************************************************************/
2557
2558static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2559
2560/**
2561 * htmlParseHTMLName:
2562 * @ctxt: an HTML parser context
2563 *
2564 * parse an HTML tag or attribute name, note that we convert it to lowercase
2565 * since HTML names are not case-sensitive.
2566 *
2567 * Returns the Tag Name parsed or NULL
2568 */
2569
2570static const xmlChar *
2571htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2572 int i = 0;
2573 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2574
2575 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2576 (CUR != ':') && (CUR != '.')) return(NULL);
2577
2578 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2579 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2580 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2581 (CUR == '.'))) {
2582 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2583 else loc[i] = CUR;
2584 i++;
2585
2586 NEXT;
2587 }
2588
2589 return(xmlDictLookup(ctxt->dict, loc, i));
2590}
2591
2592
2593/**
2594 * htmlParseHTMLName_nonInvasive:
2595 * @ctxt: an HTML parser context
2596 *
2597 * parse an HTML tag or attribute name, note that we convert it to lowercase
2598 * since HTML names are not case-sensitive, this doesn't consume the data
2599 * from the stream, it's a look-ahead
2600 *
2601 * Returns the Tag Name parsed or NULL
2602 */
2603
2604static const xmlChar *
2605htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2606 int i = 0;
2607 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2608
2609 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2610 (NXT(1) != ':')) return(NULL);
2611
2612 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2613 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2614 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2615 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2616 else loc[i] = NXT(1+i);
2617 i++;
2618 }
2619
2620 return(xmlDictLookup(ctxt->dict, loc, i));
2621}
2622
2623
2624/**
2625 * htmlParseName:
2626 * @ctxt: an HTML parser context
2627 *
2628 * parse an HTML name, this routine is case sensitive.
2629 *
2630 * Returns the Name parsed or NULL
2631 */
2632
2633static const xmlChar *
2634htmlParseName(htmlParserCtxtPtr ctxt) {
2635 const xmlChar *in;
2636 const xmlChar *ret;
2637 int count = 0;
2638
2639 GROW;
2640
2641 /*
2642 * Accelerator for simple ASCII names
2643 */
2644 in = ctxt->input->cur;
2645 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2646 ((*in >= 0x41) && (*in <= 0x5A)) ||
2647 (*in == '_') || (*in == ':')) {
2648 in++;
2649 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2650 ((*in >= 0x41) && (*in <= 0x5A)) ||
2651 ((*in >= 0x30) && (*in <= 0x39)) ||
2652 (*in == '_') || (*in == '-') ||
2653 (*in == ':') || (*in == '.'))
2654 in++;
2655
2656 if (in == ctxt->input->end)
2657 return(NULL);
2658
2659 if ((*in > 0) && (*in < 0x80)) {
2660 count = in - ctxt->input->cur;
2661 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2662 ctxt->input->cur = in;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002663 ctxt->input->col += count;
2664 return(ret);
2665 }
2666 }
2667 return(htmlParseNameComplex(ctxt));
2668}
2669
2670static const xmlChar *
2671htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2672 int len = 0, l;
2673 int c;
2674 int count = 0;
2675 const xmlChar *base = ctxt->input->base;
2676
2677 /*
2678 * Handler for more complex cases
2679 */
2680 GROW;
2681 c = CUR_CHAR(l);
2682 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2683 (!IS_LETTER(c) && (c != '_') &&
2684 (c != ':'))) {
2685 return(NULL);
2686 }
2687
2688 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2689 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2690 (c == '.') || (c == '-') ||
2691 (c == '_') || (c == ':') ||
2692 (IS_COMBINING(c)) ||
2693 (IS_EXTENDER(c)))) {
2694 if (count++ > 100) {
2695 count = 0;
2696 GROW;
2697 }
2698 len += l;
2699 NEXTL(l);
2700 c = CUR_CHAR(l);
2701 if (ctxt->input->base != base) {
2702 /*
2703 * We changed encoding from an unknown encoding
2704 * Input buffer changed location, so we better start again
2705 */
2706 return(htmlParseNameComplex(ctxt));
2707 }
2708 }
2709
2710 if (ctxt->input->cur - ctxt->input->base < len) {
2711 /* Sanity check */
2712 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2713 "unexpected change of input buffer", NULL, NULL);
2714 return (NULL);
2715 }
2716
2717 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2718}
2719
2720
2721/**
2722 * htmlParseHTMLAttribute:
2723 * @ctxt: an HTML parser context
2724 * @stop: a char stop value
2725 *
2726 * parse an HTML attribute value till the stop (quote), if
2727 * stop is 0 then it stops at the first space
2728 *
2729 * Returns the attribute parsed or NULL
2730 */
2731
2732static xmlChar *
2733htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2734 xmlChar *buffer = NULL;
2735 int buffer_size = 0;
2736 xmlChar *out = NULL;
2737 const xmlChar *name = NULL;
2738 const xmlChar *cur = NULL;
2739 const htmlEntityDesc * ent;
2740
2741 /*
2742 * allocate a translation buffer.
2743 */
2744 buffer_size = HTML_PARSER_BUFFER_SIZE;
2745 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2746 if (buffer == NULL) {
2747 htmlErrMemory(ctxt, "buffer allocation failed\n");
2748 return(NULL);
2749 }
2750 out = buffer;
2751
2752 /*
2753 * Ok loop until we reach one of the ending chars
2754 */
2755 while ((CUR != 0) && (CUR != stop)) {
2756 if ((stop == 0) && (CUR == '>')) break;
2757 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2758 if (CUR == '&') {
2759 if (NXT(1) == '#') {
2760 unsigned int c;
2761 int bits;
2762
2763 c = htmlParseCharRef(ctxt);
2764 if (c < 0x80)
2765 { *out++ = c; bits= -6; }
2766 else if (c < 0x800)
2767 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2768 else if (c < 0x10000)
2769 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2770 else
2771 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2772
2773 for ( ; bits >= 0; bits-= 6) {
2774 *out++ = ((c >> bits) & 0x3F) | 0x80;
2775 }
2776
2777 if (out - buffer > buffer_size - 100) {
2778 int indx = out - buffer;
2779
2780 growBuffer(buffer);
2781 out = &buffer[indx];
2782 }
2783 } else {
2784 ent = htmlParseEntityRef(ctxt, &name);
2785 if (name == NULL) {
2786 *out++ = '&';
2787 if (out - buffer > buffer_size - 100) {
2788 int indx = out - buffer;
2789
2790 growBuffer(buffer);
2791 out = &buffer[indx];
2792 }
2793 } else if (ent == NULL) {
2794 *out++ = '&';
2795 cur = name;
2796 while (*cur != 0) {
2797 if (out - buffer > buffer_size - 100) {
2798 int indx = out - buffer;
2799
2800 growBuffer(buffer);
2801 out = &buffer[indx];
2802 }
2803 *out++ = *cur++;
2804 }
2805 } else {
2806 unsigned int c;
2807 int bits;
2808
2809 if (out - buffer > buffer_size - 100) {
2810 int indx = out - buffer;
2811
2812 growBuffer(buffer);
2813 out = &buffer[indx];
2814 }
2815 c = ent->value;
2816 if (c < 0x80)
2817 { *out++ = c; bits= -6; }
2818 else if (c < 0x800)
2819 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2820 else if (c < 0x10000)
2821 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2822 else
2823 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2824
2825 for ( ; bits >= 0; bits-= 6) {
2826 *out++ = ((c >> bits) & 0x3F) | 0x80;
2827 }
2828 }
2829 }
2830 } else {
2831 unsigned int c;
2832 int bits, l;
2833
2834 if (out - buffer > buffer_size - 100) {
2835 int indx = out - buffer;
2836
2837 growBuffer(buffer);
2838 out = &buffer[indx];
2839 }
2840 c = CUR_CHAR(l);
2841 if (c < 0x80)
2842 { *out++ = c; bits= -6; }
2843 else if (c < 0x800)
2844 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2845 else if (c < 0x10000)
2846 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2847 else
2848 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2849
2850 for ( ; bits >= 0; bits-= 6) {
2851 *out++ = ((c >> bits) & 0x3F) | 0x80;
2852 }
2853 NEXT;
2854 }
2855 }
2856 *out = 0;
2857 return(buffer);
2858}
2859
2860/**
2861 * htmlParseEntityRef:
2862 * @ctxt: an HTML parser context
2863 * @str: location to store the entity name
2864 *
2865 * parse an HTML ENTITY references
2866 *
2867 * [68] EntityRef ::= '&' Name ';'
2868 *
2869 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2870 * if non-NULL *str will have to be freed by the caller.
2871 */
2872const htmlEntityDesc *
2873htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2874 const xmlChar *name;
2875 const htmlEntityDesc * ent = NULL;
2876
2877 if (str != NULL) *str = NULL;
2878 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2879
2880 if (CUR == '&') {
2881 NEXT;
2882 name = htmlParseName(ctxt);
2883 if (name == NULL) {
2884 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2885 "htmlParseEntityRef: no name\n", NULL, NULL);
2886 } else {
2887 GROW;
2888 if (CUR == ';') {
2889 if (str != NULL)
2890 *str = name;
2891
2892 /*
2893 * Lookup the entity in the table.
2894 */
2895 ent = htmlEntityLookup(name);
2896 if (ent != NULL) /* OK that's ugly !!! */
2897 NEXT;
2898 } else {
2899 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2900 "htmlParseEntityRef: expecting ';'\n",
2901 NULL, NULL);
2902 if (str != NULL)
2903 *str = name;
2904 }
2905 }
2906 }
2907 return(ent);
2908}
2909
2910/**
2911 * htmlParseAttValue:
2912 * @ctxt: an HTML parser context
2913 *
2914 * parse a value for an attribute
2915 * Note: the parser won't do substitution of entities here, this
2916 * will be handled later in xmlStringGetNodeList, unless it was
2917 * asked for ctxt->replaceEntities != 0
2918 *
2919 * Returns the AttValue parsed or NULL.
2920 */
2921
2922static xmlChar *
2923htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2924 xmlChar *ret = NULL;
2925
2926 if (CUR == '"') {
2927 NEXT;
2928 ret = htmlParseHTMLAttribute(ctxt, '"');
2929 if (CUR != '"') {
2930 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2931 "AttValue: \" expected\n", NULL, NULL);
2932 } else
2933 NEXT;
2934 } else if (CUR == '\'') {
2935 NEXT;
2936 ret = htmlParseHTMLAttribute(ctxt, '\'');
2937 if (CUR != '\'') {
2938 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2939 "AttValue: ' expected\n", NULL, NULL);
2940 } else
2941 NEXT;
2942 } else {
2943 /*
2944 * That's an HTMLism, the attribute value may not be quoted
2945 */
2946 ret = htmlParseHTMLAttribute(ctxt, 0);
2947 if (ret == NULL) {
2948 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2949 "AttValue: no value found\n", NULL, NULL);
2950 }
2951 }
2952 return(ret);
2953}
2954
2955/**
2956 * htmlParseSystemLiteral:
2957 * @ctxt: an HTML parser context
2958 *
2959 * parse an HTML Literal
2960 *
2961 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2962 *
2963 * Returns the SystemLiteral parsed or NULL
2964 */
2965
2966static xmlChar *
2967htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2968 size_t len = 0, startPosition = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002969 int err = 0;
2970 int quote;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002971 xmlChar *ret = NULL;
2972
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002973 if ((CUR != '"') && (CUR != '\'')) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002974 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002975 "SystemLiteral \" or ' expected\n", NULL, NULL);
2976 return(NULL);
2977 }
2978 quote = CUR;
2979 NEXT;
2980
2981 if (CUR_PTR < BASE_PTR)
2982 return(ret);
2983 startPosition = CUR_PTR - BASE_PTR;
2984
2985 while ((CUR != 0) && (CUR != quote)) {
2986 /* TODO: Handle UTF-8 */
2987 if (!IS_CHAR_CH(CUR)) {
2988 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2989 "Invalid char in SystemLiteral 0x%X\n", CUR);
2990 err = 1;
2991 }
2992 NEXT;
2993 len++;
2994 }
2995 if (CUR != quote) {
2996 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2997 "Unfinished SystemLiteral\n", NULL, NULL);
2998 } else {
2999 NEXT;
3000 if (err == 0)
3001 ret = xmlStrndup((BASE_PTR+startPosition), len);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003002 }
3003
3004 return(ret);
3005}
3006
3007/**
3008 * htmlParsePubidLiteral:
3009 * @ctxt: an HTML parser context
3010 *
3011 * parse an HTML public literal
3012 *
3013 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3014 *
3015 * Returns the PubidLiteral parsed or NULL.
3016 */
3017
3018static xmlChar *
3019htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3020 size_t len = 0, startPosition = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003021 int err = 0;
3022 int quote;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003023 xmlChar *ret = NULL;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003024
3025 if ((CUR != '"') && (CUR != '\'')) {
3026 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3027 "PubidLiteral \" or ' expected\n", NULL, NULL);
3028 return(NULL);
3029 }
3030 quote = CUR;
3031 NEXT;
3032
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003033 /*
3034 * Name ::= (Letter | '_') (NameChar)*
3035 */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003036 if (CUR_PTR < BASE_PTR)
3037 return(ret);
3038 startPosition = CUR_PTR - BASE_PTR;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003039
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003040 while ((CUR != 0) && (CUR != quote)) {
3041 if (!IS_PUBIDCHAR_CH(CUR)) {
3042 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3043 "Invalid char in PubidLiteral 0x%X\n", CUR);
3044 err = 1;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003045 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003046 len++;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003047 NEXT;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003048 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003049
Elliott Hughesecdab2a2022-02-23 14:33:50 -08003050 if (CUR != quote) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003051 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3052 "Unfinished PubidLiteral\n", NULL, NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003053 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003054 NEXT;
3055 if (err == 0)
3056 ret = xmlStrndup((BASE_PTR + startPosition), len);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003057 }
3058
3059 return(ret);
3060}
3061
3062/**
3063 * htmlParseScript:
3064 * @ctxt: an HTML parser context
3065 *
3066 * parse the content of an HTML SCRIPT or STYLE element
3067 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3068 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3069 * http://www.w3.org/TR/html4/types.html#type-script
3070 * http://www.w3.org/TR/html4/types.html#h-6.15
3071 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3072 *
3073 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3074 * element and the value of intrinsic event attributes. User agents must
3075 * not evaluate script data as HTML markup but instead must pass it on as
3076 * data to a script engine.
3077 * NOTES:
3078 * - The content is passed like CDATA
3079 * - the attributes for style and scripting "onXXX" are also described
3080 * as CDATA but SGML allows entities references in attributes so their
3081 * processing is identical as other attributes
3082 */
3083static void
3084htmlParseScript(htmlParserCtxtPtr ctxt) {
3085 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3086 int nbchar = 0;
3087 int cur,l;
3088
3089 SHRINK;
3090 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003091 while (cur != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003092 if ((cur == '<') && (NXT(1) == '/')) {
3093 /*
3094 * One should break here, the specification is clear:
3095 * Authors should therefore escape "</" within the content.
3096 * Escape mechanisms are specific to each scripting or
3097 * style sheet language.
3098 *
3099 * In recovery mode, only break if end tag match the
3100 * current tag, effectively ignoring all tags inside the
3101 * script/style block and treating the entire block as
3102 * CDATA.
3103 */
3104 if (ctxt->recovery) {
3105 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3106 xmlStrlen(ctxt->name)) == 0)
3107 {
3108 break; /* while */
3109 } else {
3110 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3111 "Element %s embeds close tag\n",
3112 ctxt->name, NULL);
3113 }
3114 } else {
3115 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3116 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3117 {
3118 break; /* while */
3119 }
3120 }
3121 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003122 if (IS_CHAR(cur)) {
3123 COPY_BUF(l,buf,nbchar,cur);
3124 } else {
3125 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3126 "Invalid char in CDATA 0x%X\n", cur);
3127 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003128 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003129 buf[nbchar] = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003130 if (ctxt->sax->cdataBlock!= NULL) {
3131 /*
3132 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3133 */
3134 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3135 } else if (ctxt->sax->characters != NULL) {
3136 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3137 }
3138 nbchar = 0;
3139 }
3140 GROW;
3141 NEXTL(l);
3142 cur = CUR_CHAR(l);
3143 }
3144
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003145 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003146 buf[nbchar] = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003147 if (ctxt->sax->cdataBlock!= NULL) {
3148 /*
3149 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3150 */
3151 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3152 } else if (ctxt->sax->characters != NULL) {
3153 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3154 }
3155 }
3156}
3157
3158
3159/**
3160 * htmlParseCharDataInternal:
3161 * @ctxt: an HTML parser context
3162 * @readahead: optional read ahead character in ascii range
3163 *
3164 * parse a CharData section.
3165 * if we are within a CDATA section ']]>' marks an end of section.
3166 *
3167 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3168 */
3169
3170static void
3171htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3172 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3173 int nbchar = 0;
3174 int cur, l;
3175 int chunk = 0;
3176
3177 if (readahead)
3178 buf[nbchar++] = readahead;
3179
3180 SHRINK;
3181 cur = CUR_CHAR(l);
3182 while (((cur != '<') || (ctxt->token == '<')) &&
3183 ((cur != '&') || (ctxt->token == '&')) &&
3184 (cur != 0)) {
3185 if (!(IS_CHAR(cur))) {
3186 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3187 "Invalid char in CDATA 0x%X\n", cur);
3188 } else {
3189 COPY_BUF(l,buf,nbchar,cur);
3190 }
3191 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003192 buf[nbchar] = 0;
3193
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003194 /*
3195 * Ok the segment is to be consumed as chars.
3196 */
3197 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3198 if (areBlanks(ctxt, buf, nbchar)) {
3199 if (ctxt->keepBlanks) {
3200 if (ctxt->sax->characters != NULL)
3201 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3202 } else {
3203 if (ctxt->sax->ignorableWhitespace != NULL)
3204 ctxt->sax->ignorableWhitespace(ctxt->userData,
3205 buf, nbchar);
3206 }
3207 } else {
3208 htmlCheckParagraph(ctxt);
3209 if (ctxt->sax->characters != NULL)
3210 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3211 }
3212 }
3213 nbchar = 0;
3214 }
3215 NEXTL(l);
3216 chunk++;
3217 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3218 chunk = 0;
3219 SHRINK;
3220 GROW;
3221 }
3222 cur = CUR_CHAR(l);
3223 if (cur == 0) {
3224 SHRINK;
3225 GROW;
3226 cur = CUR_CHAR(l);
3227 }
3228 }
3229 if (nbchar != 0) {
3230 buf[nbchar] = 0;
3231
3232 /*
3233 * Ok the segment is to be consumed as chars.
3234 */
3235 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3236 if (areBlanks(ctxt, buf, nbchar)) {
3237 if (ctxt->keepBlanks) {
3238 if (ctxt->sax->characters != NULL)
3239 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3240 } else {
3241 if (ctxt->sax->ignorableWhitespace != NULL)
3242 ctxt->sax->ignorableWhitespace(ctxt->userData,
3243 buf, nbchar);
3244 }
3245 } else {
3246 htmlCheckParagraph(ctxt);
3247 if (ctxt->sax->characters != NULL)
3248 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3249 }
3250 }
3251 } else {
3252 /*
3253 * Loop detection
3254 */
3255 if (cur == 0)
3256 ctxt->instate = XML_PARSER_EOF;
3257 }
3258}
3259
3260/**
3261 * htmlParseCharData:
3262 * @ctxt: an HTML parser context
3263 *
3264 * parse a CharData section.
3265 * if we are within a CDATA section ']]>' marks an end of section.
3266 *
3267 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3268 */
3269
3270static void
3271htmlParseCharData(htmlParserCtxtPtr ctxt) {
3272 htmlParseCharDataInternal(ctxt, 0);
3273}
3274
3275/**
3276 * htmlParseExternalID:
3277 * @ctxt: an HTML parser context
3278 * @publicID: a xmlChar** receiving PubidLiteral
3279 *
3280 * Parse an External ID or a Public ID
3281 *
3282 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3283 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3284 *
3285 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3286 *
3287 * Returns the function returns SystemLiteral and in the second
3288 * case publicID receives PubidLiteral, is strict is off
3289 * it is possible to return NULL and have publicID set.
3290 */
3291
3292static xmlChar *
3293htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3294 xmlChar *URI = NULL;
3295
3296 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3297 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3298 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3299 SKIP(6);
3300 if (!IS_BLANK_CH(CUR)) {
3301 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3302 "Space required after 'SYSTEM'\n", NULL, NULL);
3303 }
3304 SKIP_BLANKS;
3305 URI = htmlParseSystemLiteral(ctxt);
3306 if (URI == NULL) {
3307 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3308 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3309 }
3310 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3311 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3312 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3313 SKIP(6);
3314 if (!IS_BLANK_CH(CUR)) {
3315 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3316 "Space required after 'PUBLIC'\n", NULL, NULL);
3317 }
3318 SKIP_BLANKS;
3319 *publicID = htmlParsePubidLiteral(ctxt);
3320 if (*publicID == NULL) {
3321 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3322 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3323 NULL, NULL);
3324 }
3325 SKIP_BLANKS;
3326 if ((CUR == '"') || (CUR == '\'')) {
3327 URI = htmlParseSystemLiteral(ctxt);
3328 }
3329 }
3330 return(URI);
3331}
3332
3333/**
3334 * xmlParsePI:
3335 * @ctxt: an XML parser context
3336 *
3337 * parse an XML Processing Instruction.
3338 *
3339 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3340 */
3341static void
3342htmlParsePI(htmlParserCtxtPtr ctxt) {
3343 xmlChar *buf = NULL;
3344 int len = 0;
3345 int size = HTML_PARSER_BUFFER_SIZE;
3346 int cur, l;
3347 const xmlChar *target;
3348 xmlParserInputState state;
3349 int count = 0;
3350
3351 if ((RAW == '<') && (NXT(1) == '?')) {
3352 state = ctxt->instate;
3353 ctxt->instate = XML_PARSER_PI;
3354 /*
3355 * this is a Processing Instruction.
3356 */
3357 SKIP(2);
3358 SHRINK;
3359
3360 /*
3361 * Parse the target name and check for special support like
3362 * namespace.
3363 */
3364 target = htmlParseName(ctxt);
3365 if (target != NULL) {
3366 if (RAW == '>') {
3367 SKIP(1);
3368
3369 /*
3370 * SAX: PI detected.
3371 */
3372 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3373 (ctxt->sax->processingInstruction != NULL))
3374 ctxt->sax->processingInstruction(ctxt->userData,
3375 target, NULL);
3376 ctxt->instate = state;
3377 return;
3378 }
3379 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3380 if (buf == NULL) {
3381 htmlErrMemory(ctxt, NULL);
3382 ctxt->instate = state;
3383 return;
3384 }
3385 cur = CUR;
3386 if (!IS_BLANK(cur)) {
3387 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3388 "ParsePI: PI %s space expected\n", target, NULL);
3389 }
3390 SKIP_BLANKS;
3391 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003392 while ((cur != 0) && (cur != '>')) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003393 if (len + 5 >= size) {
3394 xmlChar *tmp;
3395
3396 size *= 2;
3397 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3398 if (tmp == NULL) {
3399 htmlErrMemory(ctxt, NULL);
3400 xmlFree(buf);
3401 ctxt->instate = state;
3402 return;
3403 }
3404 buf = tmp;
3405 }
3406 count++;
3407 if (count > 50) {
3408 GROW;
3409 count = 0;
3410 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003411 if (IS_CHAR(cur)) {
3412 COPY_BUF(l,buf,len,cur);
3413 } else {
3414 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3415 "Invalid char in processing instruction "
3416 "0x%X\n", cur);
3417 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003418 NEXTL(l);
3419 cur = CUR_CHAR(l);
3420 if (cur == 0) {
3421 SHRINK;
3422 GROW;
3423 cur = CUR_CHAR(l);
3424 }
3425 }
3426 buf[len] = 0;
3427 if (cur != '>') {
3428 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3429 "ParsePI: PI %s never end ...\n", target, NULL);
3430 } else {
3431 SKIP(1);
3432
3433 /*
3434 * SAX: PI detected.
3435 */
3436 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3437 (ctxt->sax->processingInstruction != NULL))
3438 ctxt->sax->processingInstruction(ctxt->userData,
3439 target, buf);
3440 }
3441 xmlFree(buf);
3442 } else {
3443 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3444 "PI is not started correctly", NULL, NULL);
3445 }
3446 ctxt->instate = state;
3447 }
3448}
3449
3450/**
3451 * htmlParseComment:
3452 * @ctxt: an HTML parser context
3453 *
3454 * Parse an XML (SGML) comment <!-- .... -->
3455 *
3456 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3457 */
3458static void
3459htmlParseComment(htmlParserCtxtPtr ctxt) {
3460 xmlChar *buf = NULL;
3461 int len;
3462 int size = HTML_PARSER_BUFFER_SIZE;
3463 int q, ql;
3464 int r, rl;
3465 int cur, l;
Haibo Huangd75f3892021-01-05 21:34:50 -08003466 int next, nl;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003467 xmlParserInputState state;
3468
3469 /*
3470 * Check that there is a comment right here.
3471 */
3472 if ((RAW != '<') || (NXT(1) != '!') ||
3473 (NXT(2) != '-') || (NXT(3) != '-')) return;
3474
3475 state = ctxt->instate;
3476 ctxt->instate = XML_PARSER_COMMENT;
3477 SHRINK;
3478 SKIP(4);
3479 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3480 if (buf == NULL) {
3481 htmlErrMemory(ctxt, "buffer allocation failed\n");
3482 ctxt->instate = state;
3483 return;
3484 }
3485 len = 0;
3486 buf[len] = 0;
3487 q = CUR_CHAR(ql);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003488 if (q == 0)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003489 goto unfinished;
3490 NEXTL(ql);
3491 r = CUR_CHAR(rl);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003492 if (r == 0)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003493 goto unfinished;
3494 NEXTL(rl);
3495 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003496 while ((cur != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003497 ((cur != '>') ||
3498 (r != '-') || (q != '-'))) {
Haibo Huangd75f3892021-01-05 21:34:50 -08003499 NEXTL(l);
3500 next = CUR_CHAR(nl);
3501 if (next == 0) {
3502 SHRINK;
3503 GROW;
3504 next = CUR_CHAR(nl);
3505 }
3506
3507 if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3508 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3509 "Comment incorrectly closed by '--!>'", NULL, NULL);
3510 cur = '>';
3511 break;
3512 }
3513
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003514 if (len + 5 >= size) {
3515 xmlChar *tmp;
3516
3517 size *= 2;
3518 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3519 if (tmp == NULL) {
3520 xmlFree(buf);
3521 htmlErrMemory(ctxt, "growing buffer failed\n");
3522 ctxt->instate = state;
3523 return;
3524 }
3525 buf = tmp;
3526 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003527 if (IS_CHAR(q)) {
3528 COPY_BUF(ql,buf,len,q);
3529 } else {
3530 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3531 "Invalid char in comment 0x%X\n", q);
3532 }
Haibo Huangd75f3892021-01-05 21:34:50 -08003533
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003534 q = r;
3535 ql = rl;
3536 r = cur;
3537 rl = l;
Haibo Huangd75f3892021-01-05 21:34:50 -08003538 cur = next;
3539 l = nl;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003540 }
3541 buf[len] = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003542 if (cur == '>') {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003543 NEXT;
3544 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3545 (!ctxt->disableSAX))
3546 ctxt->sax->comment(ctxt->userData, buf);
3547 xmlFree(buf);
3548 ctxt->instate = state;
3549 return;
3550 }
3551
3552unfinished:
3553 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3554 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3555 xmlFree(buf);
3556}
3557
3558/**
3559 * htmlParseCharRef:
3560 * @ctxt: an HTML parser context
3561 *
3562 * parse Reference declarations
3563 *
3564 * [66] CharRef ::= '&#' [0-9]+ ';' |
3565 * '&#x' [0-9a-fA-F]+ ';'
3566 *
3567 * Returns the value parsed (as an int)
3568 */
3569int
3570htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3571 int val = 0;
3572
3573 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3574 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3575 "htmlParseCharRef: context error\n",
3576 NULL, NULL);
3577 return(0);
3578 }
3579 if ((CUR == '&') && (NXT(1) == '#') &&
3580 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3581 SKIP(3);
3582 while (CUR != ';') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003583 if ((CUR >= '0') && (CUR <= '9')) {
3584 if (val < 0x110000)
3585 val = val * 16 + (CUR - '0');
3586 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3587 if (val < 0x110000)
3588 val = val * 16 + (CUR - 'a') + 10;
3589 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3590 if (val < 0x110000)
3591 val = val * 16 + (CUR - 'A') + 10;
3592 } else {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003593 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3594 "htmlParseCharRef: missing semicolon\n",
3595 NULL, NULL);
3596 break;
3597 }
3598 NEXT;
3599 }
3600 if (CUR == ';')
3601 NEXT;
3602 } else if ((CUR == '&') && (NXT(1) == '#')) {
3603 SKIP(2);
3604 while (CUR != ';') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003605 if ((CUR >= '0') && (CUR <= '9')) {
3606 if (val < 0x110000)
3607 val = val * 10 + (CUR - '0');
3608 } else {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003609 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3610 "htmlParseCharRef: missing semicolon\n",
3611 NULL, NULL);
3612 break;
3613 }
3614 NEXT;
3615 }
3616 if (CUR == ';')
3617 NEXT;
3618 } else {
3619 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3620 "htmlParseCharRef: invalid value\n", NULL, NULL);
3621 }
3622 /*
3623 * Check the value IS_CHAR ...
3624 */
3625 if (IS_CHAR(val)) {
3626 return(val);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003627 } else if (val >= 0x110000) {
3628 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3629 "htmlParseCharRef: value too large\n", NULL, NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003630 } else {
3631 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3632 "htmlParseCharRef: invalid xmlChar value %d\n",
3633 val);
3634 }
3635 return(0);
3636}
3637
3638
3639/**
3640 * htmlParseDocTypeDecl:
3641 * @ctxt: an HTML parser context
3642 *
3643 * parse a DOCTYPE declaration
3644 *
3645 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3646 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3647 */
3648
3649static void
3650htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3651 const xmlChar *name;
3652 xmlChar *ExternalID = NULL;
3653 xmlChar *URI = NULL;
3654
3655 /*
3656 * We know that '<!DOCTYPE' has been detected.
3657 */
3658 SKIP(9);
3659
3660 SKIP_BLANKS;
3661
3662 /*
3663 * Parse the DOCTYPE name.
3664 */
3665 name = htmlParseName(ctxt);
3666 if (name == NULL) {
3667 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3668 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3669 NULL, NULL);
3670 }
3671 /*
3672 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3673 */
3674
3675 SKIP_BLANKS;
3676
3677 /*
3678 * Check for SystemID and ExternalID
3679 */
3680 URI = htmlParseExternalID(ctxt, &ExternalID);
3681 SKIP_BLANKS;
3682
3683 /*
3684 * We should be at the end of the DOCTYPE declaration.
3685 */
3686 if (CUR != '>') {
3687 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3688 "DOCTYPE improperly terminated\n", NULL, NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003689 /* Ignore bogus content */
3690 while ((CUR != 0) && (CUR != '>'))
3691 NEXT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003692 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003693 if (CUR == '>')
3694 NEXT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003695
3696 /*
3697 * Create or update the document accordingly to the DOCTYPE
3698 */
3699 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3700 (!ctxt->disableSAX))
3701 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3702
3703 /*
3704 * Cleanup, since we don't use all those identifiers
3705 */
3706 if (URI != NULL) xmlFree(URI);
3707 if (ExternalID != NULL) xmlFree(ExternalID);
3708}
3709
3710/**
3711 * htmlParseAttribute:
3712 * @ctxt: an HTML parser context
3713 * @value: a xmlChar ** used to store the value of the attribute
3714 *
3715 * parse an attribute
3716 *
3717 * [41] Attribute ::= Name Eq AttValue
3718 *
3719 * [25] Eq ::= S? '=' S?
3720 *
3721 * With namespace:
3722 *
3723 * [NS 11] Attribute ::= QName Eq AttValue
3724 *
3725 * Also the case QName == xmlns:??? is handled independently as a namespace
3726 * definition.
3727 *
3728 * Returns the attribute name, and the value in *value.
3729 */
3730
3731static const xmlChar *
3732htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3733 const xmlChar *name;
3734 xmlChar *val = NULL;
3735
3736 *value = NULL;
3737 name = htmlParseHTMLName(ctxt);
3738 if (name == NULL) {
3739 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3740 "error parsing attribute name\n", NULL, NULL);
3741 return(NULL);
3742 }
3743
3744 /*
3745 * read the value
3746 */
3747 SKIP_BLANKS;
3748 if (CUR == '=') {
3749 NEXT;
3750 SKIP_BLANKS;
3751 val = htmlParseAttValue(ctxt);
3752 }
3753
3754 *value = val;
3755 return(name);
3756}
3757
3758/**
3759 * htmlCheckEncodingDirect:
3760 * @ctxt: an HTML parser context
3761 * @attvalue: the attribute value
3762 *
3763 * Checks an attribute value to detect
3764 * the encoding
3765 * If a new encoding is detected the parser is switched to decode
3766 * it and pass UTF8
3767 */
3768static void
3769htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3770
3771 if ((ctxt == NULL) || (encoding == NULL) ||
3772 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3773 return;
3774
3775 /* do not change encoding */
3776 if (ctxt->input->encoding != NULL)
3777 return;
3778
3779 if (encoding != NULL) {
3780 xmlCharEncoding enc;
3781 xmlCharEncodingHandlerPtr handler;
3782
3783 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3784
3785 if (ctxt->input->encoding != NULL)
3786 xmlFree((xmlChar *) ctxt->input->encoding);
3787 ctxt->input->encoding = xmlStrdup(encoding);
3788
3789 enc = xmlParseCharEncoding((const char *) encoding);
3790 /*
3791 * registered set of known encodings
3792 */
3793 if (enc != XML_CHAR_ENCODING_ERROR) {
3794 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3795 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3796 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3797 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3798 (ctxt->input->buf != NULL) &&
3799 (ctxt->input->buf->encoder == NULL)) {
3800 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3801 "htmlCheckEncoding: wrong encoding meta\n",
3802 NULL, NULL);
3803 } else {
3804 xmlSwitchEncoding(ctxt, enc);
3805 }
3806 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3807 } else {
3808 /*
3809 * fallback for unknown encodings
3810 */
3811 handler = xmlFindCharEncodingHandler((const char *) encoding);
3812 if (handler != NULL) {
3813 xmlSwitchToEncoding(ctxt, handler);
3814 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3815 } else {
3816 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3817 "htmlCheckEncoding: unknown encoding %s\n",
3818 encoding, NULL);
3819 }
3820 }
3821
3822 if ((ctxt->input->buf != NULL) &&
3823 (ctxt->input->buf->encoder != NULL) &&
3824 (ctxt->input->buf->raw != NULL) &&
3825 (ctxt->input->buf->buffer != NULL)) {
3826 int nbchars;
3827 int processed;
3828
3829 /*
3830 * convert as much as possible to the parser reading buffer.
3831 */
3832 processed = ctxt->input->cur - ctxt->input->base;
3833 xmlBufShrink(ctxt->input->buf->buffer, processed);
3834 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3835 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3836 if (nbchars < 0) {
3837 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3838 "htmlCheckEncoding: encoder error\n",
3839 NULL, NULL);
3840 }
3841 }
3842 }
3843}
3844
3845/**
3846 * htmlCheckEncoding:
3847 * @ctxt: an HTML parser context
3848 * @attvalue: the attribute value
3849 *
3850 * Checks an http-equiv attribute from a Meta tag to detect
3851 * the encoding
3852 * If a new encoding is detected the parser is switched to decode
3853 * it and pass UTF8
3854 */
3855static void
3856htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3857 const xmlChar *encoding;
3858
3859 if (!attvalue)
3860 return;
3861
3862 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3863 if (encoding != NULL) {
3864 encoding += 7;
3865 }
3866 /*
3867 * skip blank
3868 */
3869 if (encoding && IS_BLANK_CH(*encoding))
3870 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3871 if (encoding && *encoding == '=') {
3872 encoding ++;
3873 htmlCheckEncodingDirect(ctxt, encoding);
3874 }
3875}
3876
3877/**
3878 * htmlCheckMeta:
3879 * @ctxt: an HTML parser context
3880 * @atts: the attributes values
3881 *
3882 * Checks an attributes from a Meta tag
3883 */
3884static void
3885htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3886 int i;
3887 const xmlChar *att, *value;
3888 int http = 0;
3889 const xmlChar *content = NULL;
3890
3891 if ((ctxt == NULL) || (atts == NULL))
3892 return;
3893
3894 i = 0;
3895 att = atts[i++];
3896 while (att != NULL) {
3897 value = atts[i++];
3898 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3899 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3900 http = 1;
3901 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3902 htmlCheckEncodingDirect(ctxt, value);
3903 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3904 content = value;
3905 att = atts[i++];
3906 }
3907 if ((http) && (content != NULL))
3908 htmlCheckEncoding(ctxt, content);
3909
3910}
3911
3912/**
3913 * htmlParseStartTag:
3914 * @ctxt: an HTML parser context
3915 *
3916 * parse a start of tag either for rule element or
3917 * EmptyElement. In both case we don't parse the tag closing chars.
3918 *
3919 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3920 *
3921 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3922 *
3923 * With namespace:
3924 *
3925 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3926 *
3927 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3928 *
3929 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3930 */
3931
3932static int
3933htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3934 const xmlChar *name;
3935 const xmlChar *attname;
3936 xmlChar *attvalue;
3937 const xmlChar **atts;
3938 int nbatts = 0;
3939 int maxatts;
3940 int meta = 0;
3941 int i;
3942 int discardtag = 0;
3943
3944 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3945 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3946 "htmlParseStartTag: context error\n", NULL, NULL);
3947 return -1;
3948 }
3949 if (ctxt->instate == XML_PARSER_EOF)
3950 return(-1);
3951 if (CUR != '<') return -1;
3952 NEXT;
3953
3954 atts = ctxt->atts;
3955 maxatts = ctxt->maxatts;
3956
3957 GROW;
3958 name = htmlParseHTMLName(ctxt);
3959 if (name == NULL) {
3960 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3961 "htmlParseStartTag: invalid element name\n",
3962 NULL, NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003963 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003964 while ((CUR != 0) && (CUR != '>') &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003965 (ctxt->instate != XML_PARSER_EOF))
3966 NEXT;
3967 return -1;
3968 }
3969 if (xmlStrEqual(name, BAD_CAST"meta"))
3970 meta = 1;
3971
3972 /*
3973 * Check for auto-closure of HTML elements.
3974 */
3975 htmlAutoClose(ctxt, name);
3976
3977 /*
3978 * Check for implied HTML elements.
3979 */
3980 htmlCheckImplied(ctxt, name);
3981
3982 /*
3983 * Avoid html at any level > 0, head at any level != 1
3984 * or any attempt to recurse body
3985 */
3986 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3987 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3988 "htmlParseStartTag: misplaced <html> tag\n",
3989 name, NULL);
3990 discardtag = 1;
3991 ctxt->depth++;
3992 }
3993 if ((ctxt->nameNr != 1) &&
3994 (xmlStrEqual(name, BAD_CAST"head"))) {
3995 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3996 "htmlParseStartTag: misplaced <head> tag\n",
3997 name, NULL);
3998 discardtag = 1;
3999 ctxt->depth++;
4000 }
4001 if (xmlStrEqual(name, BAD_CAST"body")) {
4002 int indx;
4003 for (indx = 0;indx < ctxt->nameNr;indx++) {
4004 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4005 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4006 "htmlParseStartTag: misplaced <body> tag\n",
4007 name, NULL);
4008 discardtag = 1;
4009 ctxt->depth++;
4010 }
4011 }
4012 }
4013
4014 /*
4015 * Now parse the attributes, it ends up with the ending
4016 *
4017 * (S Attribute)* S?
4018 */
4019 SKIP_BLANKS;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004020 while ((CUR != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004021 (CUR != '>') &&
4022 ((CUR != '/') || (NXT(1) != '>'))) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004023 GROW;
4024 attname = htmlParseAttribute(ctxt, &attvalue);
4025 if (attname != NULL) {
4026
4027 /*
4028 * Well formedness requires at most one declaration of an attribute
4029 */
4030 for (i = 0; i < nbatts;i += 2) {
4031 if (xmlStrEqual(atts[i], attname)) {
4032 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4033 "Attribute %s redefined\n", attname, NULL);
4034 if (attvalue != NULL)
4035 xmlFree(attvalue);
4036 goto failed;
4037 }
4038 }
4039
4040 /*
4041 * Add the pair to atts
4042 */
4043 if (atts == NULL) {
4044 maxatts = 22; /* allow for 10 attrs by default */
4045 atts = (const xmlChar **)
4046 xmlMalloc(maxatts * sizeof(xmlChar *));
4047 if (atts == NULL) {
4048 htmlErrMemory(ctxt, NULL);
4049 if (attvalue != NULL)
4050 xmlFree(attvalue);
4051 goto failed;
4052 }
4053 ctxt->atts = atts;
4054 ctxt->maxatts = maxatts;
4055 } else if (nbatts + 4 > maxatts) {
4056 const xmlChar **n;
4057
4058 maxatts *= 2;
4059 n = (const xmlChar **) xmlRealloc((void *) atts,
4060 maxatts * sizeof(const xmlChar *));
4061 if (n == NULL) {
4062 htmlErrMemory(ctxt, NULL);
4063 if (attvalue != NULL)
4064 xmlFree(attvalue);
4065 goto failed;
4066 }
4067 atts = n;
4068 ctxt->atts = atts;
4069 ctxt->maxatts = maxatts;
4070 }
4071 atts[nbatts++] = attname;
4072 atts[nbatts++] = attvalue;
4073 atts[nbatts] = NULL;
4074 atts[nbatts + 1] = NULL;
4075 }
4076 else {
4077 if (attvalue != NULL)
4078 xmlFree(attvalue);
4079 /* Dump the bogus attribute string up to the next blank or
4080 * the end of the tag. */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004081 while ((CUR != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004082 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4083 ((CUR != '/') || (NXT(1) != '>')))
4084 NEXT;
4085 }
4086
4087failed:
4088 SKIP_BLANKS;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004089 }
4090
4091 /*
4092 * Handle specific association to the META tag
4093 */
4094 if (meta && (nbatts != 0))
4095 htmlCheckMeta(ctxt, atts);
4096
4097 /*
4098 * SAX: Start of Element !
4099 */
4100 if (!discardtag) {
4101 htmlnamePush(ctxt, name);
4102 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4103 if (nbatts != 0)
4104 ctxt->sax->startElement(ctxt->userData, name, atts);
4105 else
4106 ctxt->sax->startElement(ctxt->userData, name, NULL);
4107 }
4108 }
4109
4110 if (atts != NULL) {
4111 for (i = 1;i < nbatts;i += 2) {
4112 if (atts[i] != NULL)
4113 xmlFree((xmlChar *) atts[i]);
4114 }
4115 }
4116
4117 return(discardtag);
4118}
4119
4120/**
4121 * htmlParseEndTag:
4122 * @ctxt: an HTML parser context
4123 *
4124 * parse an end of tag
4125 *
4126 * [42] ETag ::= '</' Name S? '>'
4127 *
4128 * With namespace
4129 *
4130 * [NS 9] ETag ::= '</' QName S? '>'
4131 *
4132 * Returns 1 if the current level should be closed.
4133 */
4134
4135static int
4136htmlParseEndTag(htmlParserCtxtPtr ctxt)
4137{
4138 const xmlChar *name;
4139 const xmlChar *oldname;
4140 int i, ret;
4141
4142 if ((CUR != '<') || (NXT(1) != '/')) {
4143 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4144 "htmlParseEndTag: '</' not found\n", NULL, NULL);
4145 return (0);
4146 }
4147 SKIP(2);
4148
4149 name = htmlParseHTMLName(ctxt);
4150 if (name == NULL)
4151 return (0);
4152 /*
4153 * We should definitely be at the ending "S? '>'" part
4154 */
4155 SKIP_BLANKS;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004156 if (CUR != '>') {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004157 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4158 "End tag : expected '>'\n", NULL, NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004159 /* Skip to next '>' */
4160 while ((CUR != 0) && (CUR != '>'))
4161 NEXT;
4162 }
4163 if (CUR == '>')
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004164 NEXT;
4165
4166 /*
4167 * if we ignored misplaced tags in htmlParseStartTag don't pop them
4168 * out now.
4169 */
4170 if ((ctxt->depth > 0) &&
4171 (xmlStrEqual(name, BAD_CAST "html") ||
4172 xmlStrEqual(name, BAD_CAST "body") ||
4173 xmlStrEqual(name, BAD_CAST "head"))) {
4174 ctxt->depth--;
4175 return (0);
4176 }
4177
4178 /*
4179 * If the name read is not one of the element in the parsing stack
4180 * then return, it's just an error.
4181 */
4182 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4183 if (xmlStrEqual(name, ctxt->nameTab[i]))
4184 break;
4185 }
4186 if (i < 0) {
4187 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4188 "Unexpected end tag : %s\n", name, NULL);
4189 return (0);
4190 }
4191
4192
4193 /*
4194 * Check for auto-closure of HTML elements.
4195 */
4196
4197 htmlAutoCloseOnClose(ctxt, name);
4198
4199 /*
4200 * Well formedness constraints, opening and closing must match.
4201 * With the exception that the autoclose may have popped stuff out
4202 * of the stack.
4203 */
Elliott Hughes5cefca72021-05-06 13:23:15 -07004204 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4205 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4206 "Opening and ending tag mismatch: %s and %s\n",
4207 name, ctxt->name);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004208 }
4209
4210 /*
4211 * SAX: End of Tag
4212 */
4213 oldname = ctxt->name;
4214 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4215 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4216 ctxt->sax->endElement(ctxt->userData, name);
4217 htmlNodeInfoPop(ctxt);
4218 htmlnamePop(ctxt);
4219 ret = 1;
4220 } else {
4221 ret = 0;
4222 }
4223
4224 return (ret);
4225}
4226
4227
4228/**
4229 * htmlParseReference:
4230 * @ctxt: an HTML parser context
4231 *
4232 * parse and handle entity references in content,
4233 * this will end-up in a call to character() since this is either a
4234 * CharRef, or a predefined entity.
4235 */
4236static void
4237htmlParseReference(htmlParserCtxtPtr ctxt) {
4238 const htmlEntityDesc * ent;
4239 xmlChar out[6];
4240 const xmlChar *name;
4241 if (CUR != '&') return;
4242
4243 if (NXT(1) == '#') {
4244 unsigned int c;
4245 int bits, i = 0;
4246
4247 c = htmlParseCharRef(ctxt);
4248 if (c == 0)
4249 return;
4250
4251 if (c < 0x80) { out[i++]= c; bits= -6; }
4252 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4253 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4254 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4255
4256 for ( ; bits >= 0; bits-= 6) {
4257 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4258 }
4259 out[i] = 0;
4260
4261 htmlCheckParagraph(ctxt);
4262 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4263 ctxt->sax->characters(ctxt->userData, out, i);
4264 } else {
4265 ent = htmlParseEntityRef(ctxt, &name);
4266 if (name == NULL) {
4267 htmlCheckParagraph(ctxt);
4268 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4269 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4270 return;
4271 }
4272 if ((ent == NULL) || !(ent->value > 0)) {
4273 htmlCheckParagraph(ctxt);
4274 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4275 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4276 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4277 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4278 }
4279 } else {
4280 unsigned int c;
4281 int bits, i = 0;
4282
4283 c = ent->value;
4284 if (c < 0x80)
4285 { out[i++]= c; bits= -6; }
4286 else if (c < 0x800)
4287 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4288 else if (c < 0x10000)
4289 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4290 else
4291 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4292
4293 for ( ; bits >= 0; bits-= 6) {
4294 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4295 }
4296 out[i] = 0;
4297
4298 htmlCheckParagraph(ctxt);
4299 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4300 ctxt->sax->characters(ctxt->userData, out, i);
4301 }
4302 }
4303}
4304
4305/**
4306 * htmlParseContent:
4307 * @ctxt: an HTML parser context
4308 *
4309 * Parse a content: comment, sub-element, reference or text.
4310 * Kept for compatibility with old code
4311 */
4312
4313static void
4314htmlParseContent(htmlParserCtxtPtr ctxt) {
4315 xmlChar *currentNode;
4316 int depth;
4317 const xmlChar *name;
4318
4319 currentNode = xmlStrdup(ctxt->name);
4320 depth = ctxt->nameNr;
4321 while (1) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004322 GROW;
4323
4324 if (ctxt->instate == XML_PARSER_EOF)
4325 break;
4326
4327 /*
4328 * Our tag or one of it's parent or children is ending.
4329 */
4330 if ((CUR == '<') && (NXT(1) == '/')) {
4331 if (htmlParseEndTag(ctxt) &&
4332 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4333 if (currentNode != NULL)
4334 xmlFree(currentNode);
4335 return;
4336 }
4337 continue; /* while */
4338 }
4339
4340 else if ((CUR == '<') &&
4341 ((IS_ASCII_LETTER(NXT(1))) ||
4342 (NXT(1) == '_') || (NXT(1) == ':'))) {
4343 name = htmlParseHTMLName_nonInvasive(ctxt);
4344 if (name == NULL) {
4345 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4346 "htmlParseStartTag: invalid element name\n",
4347 NULL, NULL);
4348 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004349 while ((CUR != 0) && (CUR != '>'))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004350 NEXT;
4351
4352 if (currentNode != NULL)
4353 xmlFree(currentNode);
4354 return;
4355 }
4356
4357 if (ctxt->name != NULL) {
4358 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4359 htmlAutoClose(ctxt, name);
4360 continue;
4361 }
4362 }
4363 }
4364
4365 /*
4366 * Has this node been popped out during parsing of
4367 * the next element
4368 */
4369 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4370 (!xmlStrEqual(currentNode, ctxt->name)))
4371 {
4372 if (currentNode != NULL) xmlFree(currentNode);
4373 return;
4374 }
4375
4376 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4377 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4378 /*
4379 * Handle SCRIPT/STYLE separately
4380 */
4381 htmlParseScript(ctxt);
4382 } else {
4383 /*
4384 * Sometimes DOCTYPE arrives in the middle of the document
4385 */
4386 if ((CUR == '<') && (NXT(1) == '!') &&
4387 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4388 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4389 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4390 (UPP(8) == 'E')) {
4391 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4392 "Misplaced DOCTYPE declaration\n",
4393 BAD_CAST "DOCTYPE" , NULL);
4394 htmlParseDocTypeDecl(ctxt);
4395 }
4396
4397 /*
4398 * First case : a comment
4399 */
4400 if ((CUR == '<') && (NXT(1) == '!') &&
4401 (NXT(2) == '-') && (NXT(3) == '-')) {
4402 htmlParseComment(ctxt);
4403 }
4404
4405 /*
4406 * Second case : a Processing Instruction.
4407 */
4408 else if ((CUR == '<') && (NXT(1) == '?')) {
4409 htmlParsePI(ctxt);
4410 }
4411
4412 /*
4413 * Third case : a sub-element.
4414 */
Elliott Hughesecdab2a2022-02-23 14:33:50 -08004415 else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004416 htmlParseElement(ctxt);
4417 }
Elliott Hughesecdab2a2022-02-23 14:33:50 -08004418 else if (CUR == '<') {
4419 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4420 (ctxt->sax->characters != NULL))
4421 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4422 NEXT;
4423 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004424
4425 /*
4426 * Fourth case : a reference. If if has not been resolved,
4427 * parsing returns it's Name, create the node
4428 */
4429 else if (CUR == '&') {
4430 htmlParseReference(ctxt);
4431 }
4432
4433 /*
4434 * Fifth case : end of the resource
4435 */
4436 else if (CUR == 0) {
4437 htmlAutoCloseOnEnd(ctxt);
4438 break;
4439 }
4440
4441 /*
4442 * Last case, text. Note that References are handled directly.
4443 */
4444 else {
4445 htmlParseCharData(ctxt);
4446 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004447 }
4448 GROW;
4449 }
4450 if (currentNode != NULL) xmlFree(currentNode);
4451}
4452
4453/**
4454 * htmlParseElement:
4455 * @ctxt: an HTML parser context
4456 *
4457 * parse an HTML element, this is highly recursive
4458 * this is kept for compatibility with previous code versions
4459 *
4460 * [39] element ::= EmptyElemTag | STag content ETag
4461 *
4462 * [41] Attribute ::= Name Eq AttValue
4463 */
4464
4465void
4466htmlParseElement(htmlParserCtxtPtr ctxt) {
4467 const xmlChar *name;
4468 xmlChar *currentNode = NULL;
4469 const htmlElemDesc * info;
4470 htmlParserNodeInfo node_info;
4471 int failed;
4472 int depth;
4473 const xmlChar *oldptr;
4474
4475 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4476 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4477 "htmlParseElement: context error\n", NULL, NULL);
4478 return;
4479 }
4480
4481 if (ctxt->instate == XML_PARSER_EOF)
4482 return;
4483
4484 /* Capture start position */
4485 if (ctxt->record_info) {
4486 node_info.begin_pos = ctxt->input->consumed +
4487 (CUR_PTR - ctxt->input->base);
4488 node_info.begin_line = ctxt->input->line;
4489 }
4490
4491 failed = htmlParseStartTag(ctxt);
4492 name = ctxt->name;
4493 if ((failed == -1) || (name == NULL)) {
4494 if (CUR == '>')
4495 NEXT;
4496 return;
4497 }
4498
4499 /*
4500 * Lookup the info for that element.
4501 */
4502 info = htmlTagLookup(name);
4503 if (info == NULL) {
4504 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4505 "Tag %s invalid\n", name, NULL);
4506 }
4507
4508 /*
4509 * Check for an Empty Element labeled the XML/SGML way
4510 */
4511 if ((CUR == '/') && (NXT(1) == '>')) {
4512 SKIP(2);
4513 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4514 ctxt->sax->endElement(ctxt->userData, name);
4515 htmlnamePop(ctxt);
4516 return;
4517 }
4518
4519 if (CUR == '>') {
4520 NEXT;
4521 } else {
4522 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4523 "Couldn't find end of Start Tag %s\n", name, NULL);
4524
4525 /*
4526 * end of parsing of this node.
4527 */
4528 if (xmlStrEqual(name, ctxt->name)) {
4529 nodePop(ctxt);
4530 htmlnamePop(ctxt);
4531 }
4532
4533 /*
4534 * Capture end position and add node
4535 */
4536 if (ctxt->record_info) {
4537 node_info.end_pos = ctxt->input->consumed +
4538 (CUR_PTR - ctxt->input->base);
4539 node_info.end_line = ctxt->input->line;
4540 node_info.node = ctxt->node;
4541 xmlParserAddNodeInfo(ctxt, &node_info);
4542 }
4543 return;
4544 }
4545
4546 /*
4547 * Check for an Empty Element from DTD definition
4548 */
4549 if ((info != NULL) && (info->empty)) {
4550 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4551 ctxt->sax->endElement(ctxt->userData, name);
4552 htmlnamePop(ctxt);
4553 return;
4554 }
4555
4556 /*
4557 * Parse the content of the element:
4558 */
4559 currentNode = xmlStrdup(ctxt->name);
4560 depth = ctxt->nameNr;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004561 while (CUR != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004562 oldptr = ctxt->input->cur;
4563 htmlParseContent(ctxt);
4564 if (oldptr==ctxt->input->cur) break;
4565 if (ctxt->nameNr < depth) break;
4566 }
4567
4568 /*
4569 * Capture end position and add node
4570 */
4571 if ( currentNode != NULL && ctxt->record_info ) {
4572 node_info.end_pos = ctxt->input->consumed +
4573 (CUR_PTR - ctxt->input->base);
4574 node_info.end_line = ctxt->input->line;
4575 node_info.node = ctxt->node;
4576 xmlParserAddNodeInfo(ctxt, &node_info);
4577 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004578 if (CUR == 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004579 htmlAutoCloseOnEnd(ctxt);
4580 }
4581
4582 if (currentNode != NULL)
4583 xmlFree(currentNode);
4584}
4585
4586static void
4587htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4588 /*
4589 * Capture end position and add node
4590 */
4591 if ( ctxt->node != NULL && ctxt->record_info ) {
4592 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4593 (CUR_PTR - ctxt->input->base);
4594 ctxt->nodeInfo->end_line = ctxt->input->line;
4595 ctxt->nodeInfo->node = ctxt->node;
4596 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4597 htmlNodeInfoPop(ctxt);
4598 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004599 if (CUR == 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004600 htmlAutoCloseOnEnd(ctxt);
4601 }
4602}
4603
4604/**
4605 * htmlParseElementInternal:
4606 * @ctxt: an HTML parser context
4607 *
4608 * parse an HTML element, new version, non recursive
4609 *
4610 * [39] element ::= EmptyElemTag | STag content ETag
4611 *
4612 * [41] Attribute ::= Name Eq AttValue
4613 */
4614
4615static void
4616htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4617 const xmlChar *name;
4618 const htmlElemDesc * info;
4619 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4620 int failed;
4621
4622 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4623 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4624 "htmlParseElementInternal: context error\n", NULL, NULL);
4625 return;
4626 }
4627
4628 if (ctxt->instate == XML_PARSER_EOF)
4629 return;
4630
4631 /* Capture start position */
4632 if (ctxt->record_info) {
4633 node_info.begin_pos = ctxt->input->consumed +
4634 (CUR_PTR - ctxt->input->base);
4635 node_info.begin_line = ctxt->input->line;
4636 }
4637
4638 failed = htmlParseStartTag(ctxt);
4639 name = ctxt->name;
4640 if ((failed == -1) || (name == NULL)) {
4641 if (CUR == '>')
4642 NEXT;
4643 return;
4644 }
4645
4646 /*
4647 * Lookup the info for that element.
4648 */
4649 info = htmlTagLookup(name);
4650 if (info == NULL) {
4651 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4652 "Tag %s invalid\n", name, NULL);
4653 }
4654
4655 /*
4656 * Check for an Empty Element labeled the XML/SGML way
4657 */
4658 if ((CUR == '/') && (NXT(1) == '>')) {
4659 SKIP(2);
4660 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4661 ctxt->sax->endElement(ctxt->userData, name);
4662 htmlnamePop(ctxt);
4663 return;
4664 }
4665
4666 if (CUR == '>') {
4667 NEXT;
4668 } else {
4669 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4670 "Couldn't find end of Start Tag %s\n", name, NULL);
4671
4672 /*
4673 * end of parsing of this node.
4674 */
4675 if (xmlStrEqual(name, ctxt->name)) {
4676 nodePop(ctxt);
4677 htmlnamePop(ctxt);
4678 }
4679
4680 if (ctxt->record_info)
4681 htmlNodeInfoPush(ctxt, &node_info);
4682 htmlParserFinishElementParsing(ctxt);
4683 return;
4684 }
4685
4686 /*
4687 * Check for an Empty Element from DTD definition
4688 */
4689 if ((info != NULL) && (info->empty)) {
4690 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4691 ctxt->sax->endElement(ctxt->userData, name);
4692 htmlnamePop(ctxt);
4693 return;
4694 }
4695
4696 if (ctxt->record_info)
4697 htmlNodeInfoPush(ctxt, &node_info);
4698}
4699
4700/**
4701 * htmlParseContentInternal:
4702 * @ctxt: an HTML parser context
4703 *
4704 * Parse a content: comment, sub-element, reference or text.
4705 * New version for non recursive htmlParseElementInternal
4706 */
4707
4708static void
4709htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4710 xmlChar *currentNode;
4711 int depth;
4712 const xmlChar *name;
4713
4714 currentNode = xmlStrdup(ctxt->name);
4715 depth = ctxt->nameNr;
4716 while (1) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004717 GROW;
4718
4719 if (ctxt->instate == XML_PARSER_EOF)
4720 break;
4721
4722 /*
4723 * Our tag or one of it's parent or children is ending.
4724 */
4725 if ((CUR == '<') && (NXT(1) == '/')) {
4726 if (htmlParseEndTag(ctxt) &&
4727 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4728 if (currentNode != NULL)
4729 xmlFree(currentNode);
4730
4731 currentNode = xmlStrdup(ctxt->name);
4732 depth = ctxt->nameNr;
4733 }
4734 continue; /* while */
4735 }
4736
4737 else if ((CUR == '<') &&
4738 ((IS_ASCII_LETTER(NXT(1))) ||
4739 (NXT(1) == '_') || (NXT(1) == ':'))) {
4740 name = htmlParseHTMLName_nonInvasive(ctxt);
4741 if (name == NULL) {
4742 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4743 "htmlParseStartTag: invalid element name\n",
4744 NULL, NULL);
4745 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004746 while ((CUR == 0) && (CUR != '>'))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004747 NEXT;
4748
4749 htmlParserFinishElementParsing(ctxt);
4750 if (currentNode != NULL)
4751 xmlFree(currentNode);
4752
4753 currentNode = xmlStrdup(ctxt->name);
4754 depth = ctxt->nameNr;
4755 continue;
4756 }
4757
4758 if (ctxt->name != NULL) {
4759 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4760 htmlAutoClose(ctxt, name);
4761 continue;
4762 }
4763 }
4764 }
4765
4766 /*
4767 * Has this node been popped out during parsing of
4768 * the next element
4769 */
4770 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4771 (!xmlStrEqual(currentNode, ctxt->name)))
4772 {
4773 htmlParserFinishElementParsing(ctxt);
4774 if (currentNode != NULL) xmlFree(currentNode);
4775
4776 currentNode = xmlStrdup(ctxt->name);
4777 depth = ctxt->nameNr;
4778 continue;
4779 }
4780
4781 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4782 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4783 /*
4784 * Handle SCRIPT/STYLE separately
4785 */
4786 htmlParseScript(ctxt);
4787 } else {
4788 /*
4789 * Sometimes DOCTYPE arrives in the middle of the document
4790 */
4791 if ((CUR == '<') && (NXT(1) == '!') &&
4792 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4793 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4794 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4795 (UPP(8) == 'E')) {
4796 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4797 "Misplaced DOCTYPE declaration\n",
4798 BAD_CAST "DOCTYPE" , NULL);
4799 htmlParseDocTypeDecl(ctxt);
4800 }
4801
4802 /*
4803 * First case : a comment
4804 */
4805 if ((CUR == '<') && (NXT(1) == '!') &&
4806 (NXT(2) == '-') && (NXT(3) == '-')) {
4807 htmlParseComment(ctxt);
4808 }
4809
4810 /*
4811 * Second case : a Processing Instruction.
4812 */
4813 else if ((CUR == '<') && (NXT(1) == '?')) {
4814 htmlParsePI(ctxt);
4815 }
4816
4817 /*
4818 * Third case : a sub-element.
4819 */
Elliott Hughesecdab2a2022-02-23 14:33:50 -08004820 else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004821 htmlParseElementInternal(ctxt);
4822 if (currentNode != NULL) xmlFree(currentNode);
4823
4824 currentNode = xmlStrdup(ctxt->name);
4825 depth = ctxt->nameNr;
4826 }
Elliott Hughesecdab2a2022-02-23 14:33:50 -08004827 else if (CUR == '<') {
4828 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4829 (ctxt->sax->characters != NULL))
4830 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4831 NEXT;
4832 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004833
4834 /*
4835 * Fourth case : a reference. If if has not been resolved,
4836 * parsing returns it's Name, create the node
4837 */
4838 else if (CUR == '&') {
4839 htmlParseReference(ctxt);
4840 }
4841
4842 /*
4843 * Fifth case : end of the resource
4844 */
4845 else if (CUR == 0) {
4846 htmlAutoCloseOnEnd(ctxt);
4847 break;
4848 }
4849
4850 /*
4851 * Last case, text. Note that References are handled directly.
4852 */
4853 else {
4854 htmlParseCharData(ctxt);
4855 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004856 }
4857 GROW;
4858 }
4859 if (currentNode != NULL) xmlFree(currentNode);
4860}
4861
4862/**
4863 * htmlParseContent:
4864 * @ctxt: an HTML parser context
4865 *
4866 * Parse a content: comment, sub-element, reference or text.
4867 * This is the entry point when called from parser.c
4868 */
4869
4870void
4871__htmlParseContent(void *ctxt) {
4872 if (ctxt != NULL)
4873 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4874}
4875
4876/**
4877 * htmlParseDocument:
4878 * @ctxt: an HTML parser context
4879 *
4880 * parse an HTML document (and build a tree if using the standard SAX
4881 * interface).
4882 *
4883 * Returns 0, -1 in case of error. the parser context is augmented
4884 * as a result of the parsing.
4885 */
4886
4887int
4888htmlParseDocument(htmlParserCtxtPtr ctxt) {
4889 xmlChar start[4];
4890 xmlCharEncoding enc;
4891 xmlDtdPtr dtd;
4892
4893 xmlInitParser();
4894
4895 htmlDefaultSAXHandlerInit();
4896
4897 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4898 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4899 "htmlParseDocument: context error\n", NULL, NULL);
4900 return(XML_ERR_INTERNAL_ERROR);
4901 }
4902 ctxt->html = 1;
4903 ctxt->linenumbers = 1;
4904 GROW;
4905 /*
4906 * SAX: beginning of the document processing.
4907 */
4908 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4909 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4910
4911 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4912 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4913 /*
4914 * Get the 4 first bytes and decode the charset
4915 * if enc != XML_CHAR_ENCODING_NONE
4916 * plug some encoding conversion routines.
4917 */
4918 start[0] = RAW;
4919 start[1] = NXT(1);
4920 start[2] = NXT(2);
4921 start[3] = NXT(3);
4922 enc = xmlDetectCharEncoding(&start[0], 4);
4923 if (enc != XML_CHAR_ENCODING_NONE) {
4924 xmlSwitchEncoding(ctxt, enc);
4925 }
4926 }
4927
4928 /*
4929 * Wipe out everything which is before the first '<'
4930 */
4931 SKIP_BLANKS;
4932 if (CUR == 0) {
4933 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4934 "Document is empty\n", NULL, NULL);
4935 }
4936
4937 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4938 ctxt->sax->startDocument(ctxt->userData);
4939
4940
4941 /*
4942 * Parse possible comments and PIs before any content
4943 */
4944 while (((CUR == '<') && (NXT(1) == '!') &&
4945 (NXT(2) == '-') && (NXT(3) == '-')) ||
4946 ((CUR == '<') && (NXT(1) == '?'))) {
4947 htmlParseComment(ctxt);
4948 htmlParsePI(ctxt);
4949 SKIP_BLANKS;
4950 }
4951
4952
4953 /*
4954 * Then possibly doc type declaration(s) and more Misc
4955 * (doctypedecl Misc*)?
4956 */
4957 if ((CUR == '<') && (NXT(1) == '!') &&
4958 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4959 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4960 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4961 (UPP(8) == 'E')) {
4962 htmlParseDocTypeDecl(ctxt);
4963 }
4964 SKIP_BLANKS;
4965
4966 /*
4967 * Parse possible comments and PIs before any content
4968 */
4969 while (((CUR == '<') && (NXT(1) == '!') &&
4970 (NXT(2) == '-') && (NXT(3) == '-')) ||
4971 ((CUR == '<') && (NXT(1) == '?'))) {
4972 htmlParseComment(ctxt);
4973 htmlParsePI(ctxt);
4974 SKIP_BLANKS;
4975 }
4976
4977 /*
4978 * Time to start parsing the tree itself
4979 */
4980 htmlParseContentInternal(ctxt);
4981
4982 /*
4983 * autoclose
4984 */
4985 if (CUR == 0)
4986 htmlAutoCloseOnEnd(ctxt);
4987
4988
4989 /*
4990 * SAX: end of the document processing.
4991 */
4992 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4993 ctxt->sax->endDocument(ctxt->userData);
4994
4995 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4996 dtd = xmlGetIntSubset(ctxt->myDoc);
4997 if (dtd == NULL)
4998 ctxt->myDoc->intSubset =
4999 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5000 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5001 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5002 }
5003 if (! ctxt->wellFormed) return(-1);
5004 return(0);
5005}
5006
5007
5008/************************************************************************
5009 * *
5010 * Parser contexts handling *
5011 * *
5012 ************************************************************************/
5013
5014/**
5015 * htmlInitParserCtxt:
5016 * @ctxt: an HTML parser context
5017 *
5018 * Initialize a parser context
5019 *
5020 * Returns 0 in case of success and -1 in case of error
5021 */
5022
5023static int
5024htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5025{
5026 htmlSAXHandler *sax;
5027
5028 if (ctxt == NULL) return(-1);
5029 memset(ctxt, 0, sizeof(htmlParserCtxt));
5030
5031 ctxt->dict = xmlDictCreate();
5032 if (ctxt->dict == NULL) {
5033 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5034 return(-1);
5035 }
5036 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5037 if (sax == NULL) {
5038 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5039 return(-1);
5040 }
5041 else
5042 memset(sax, 0, sizeof(htmlSAXHandler));
5043
5044 /* Allocate the Input stack */
5045 ctxt->inputTab = (htmlParserInputPtr *)
5046 xmlMalloc(5 * sizeof(htmlParserInputPtr));
5047 if (ctxt->inputTab == NULL) {
5048 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5049 ctxt->inputNr = 0;
5050 ctxt->inputMax = 0;
5051 ctxt->input = NULL;
5052 return(-1);
5053 }
5054 ctxt->inputNr = 0;
5055 ctxt->inputMax = 5;
5056 ctxt->input = NULL;
5057 ctxt->version = NULL;
5058 ctxt->encoding = NULL;
5059 ctxt->standalone = -1;
5060 ctxt->instate = XML_PARSER_START;
5061
5062 /* Allocate the Node stack */
5063 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5064 if (ctxt->nodeTab == NULL) {
5065 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5066 ctxt->nodeNr = 0;
5067 ctxt->nodeMax = 0;
5068 ctxt->node = NULL;
5069 ctxt->inputNr = 0;
5070 ctxt->inputMax = 0;
5071 ctxt->input = NULL;
5072 return(-1);
5073 }
5074 ctxt->nodeNr = 0;
5075 ctxt->nodeMax = 10;
5076 ctxt->node = NULL;
5077
5078 /* Allocate the Name stack */
5079 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5080 if (ctxt->nameTab == NULL) {
5081 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5082 ctxt->nameNr = 0;
5083 ctxt->nameMax = 0;
5084 ctxt->name = NULL;
5085 ctxt->nodeNr = 0;
5086 ctxt->nodeMax = 0;
5087 ctxt->node = NULL;
5088 ctxt->inputNr = 0;
5089 ctxt->inputMax = 0;
5090 ctxt->input = NULL;
5091 return(-1);
5092 }
5093 ctxt->nameNr = 0;
5094 ctxt->nameMax = 10;
5095 ctxt->name = NULL;
5096
5097 ctxt->nodeInfoTab = NULL;
5098 ctxt->nodeInfoNr = 0;
5099 ctxt->nodeInfoMax = 0;
5100
5101 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
5102 else {
5103 ctxt->sax = sax;
5104 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5105 }
5106 ctxt->userData = ctxt;
5107 ctxt->myDoc = NULL;
5108 ctxt->wellFormed = 1;
5109 ctxt->replaceEntities = 0;
5110 ctxt->linenumbers = xmlLineNumbersDefaultValue;
5111 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5112 ctxt->html = 1;
Elliott Hughesecdab2a2022-02-23 14:33:50 -08005113 ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005114 ctxt->vctxt.userData = ctxt;
5115 ctxt->vctxt.error = xmlParserValidityError;
5116 ctxt->vctxt.warning = xmlParserValidityWarning;
5117 ctxt->record_info = 0;
5118 ctxt->validate = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005119 ctxt->checkIndex = 0;
5120 ctxt->catalogs = NULL;
5121 xmlInitNodeInfoSeq(&ctxt->node_seq);
5122 return(0);
5123}
5124
/**
 * htmlFreeParserCtxt:
 * @ctxt:  an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * This is a thin wrapper: the work is delegated to xmlFreeParserCtxt().
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    xmlFreeParserCtxt(ctxt);
}
5138
5139/**
5140 * htmlNewParserCtxt:
5141 *
5142 * Allocate and initialize a new parser context.
5143 *
5144 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5145 */
5146
5147htmlParserCtxtPtr
5148htmlNewParserCtxt(void)
5149{
5150 xmlParserCtxtPtr ctxt;
5151
5152 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5153 if (ctxt == NULL) {
5154 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5155 return(NULL);
5156 }
5157 memset(ctxt, 0, sizeof(xmlParserCtxt));
5158 if (htmlInitParserCtxt(ctxt) < 0) {
5159 htmlFreeParserCtxt(ctxt);
5160 return(NULL);
5161 }
5162 return(ctxt);
5163}
5164
5165/**
5166 * htmlCreateMemoryParserCtxt:
5167 * @buffer: a pointer to a char array
5168 * @size: the size of the array
5169 *
5170 * Create a parser context for an HTML in-memory document.
5171 *
5172 * Returns the new parser context or NULL
5173 */
5174htmlParserCtxtPtr
5175htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5176 xmlParserCtxtPtr ctxt;
5177 xmlParserInputPtr input;
5178 xmlParserInputBufferPtr buf;
5179
5180 if (buffer == NULL)
5181 return(NULL);
5182 if (size <= 0)
5183 return(NULL);
5184
5185 ctxt = htmlNewParserCtxt();
5186 if (ctxt == NULL)
5187 return(NULL);
5188
5189 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5190 if (buf == NULL) return(NULL);
5191
5192 input = xmlNewInputStream(ctxt);
5193 if (input == NULL) {
Elliott Hughesecdab2a2022-02-23 14:33:50 -08005194 xmlFreeParserInputBuffer(buf);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005195 xmlFreeParserCtxt(ctxt);
5196 return(NULL);
5197 }
5198
5199 input->filename = NULL;
5200 input->buf = buf;
5201 xmlBufResetInput(buf->buffer, input);
5202
5203 inputPush(ctxt, input);
5204 return(ctxt);
5205}
5206
5207/**
5208 * htmlCreateDocParserCtxt:
5209 * @cur: a pointer to an array of xmlChar
5210 * @encoding: a free form C string describing the HTML document encoding, or NULL
5211 *
5212 * Create a parser context for an HTML document.
5213 *
5214 * TODO: check the need to add encoding handling there
5215 *
5216 * Returns the new parser context or NULL
5217 */
5218static htmlParserCtxtPtr
5219htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5220 int len;
5221 htmlParserCtxtPtr ctxt;
5222
5223 if (cur == NULL)
5224 return(NULL);
5225 len = xmlStrlen(cur);
5226 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5227 if (ctxt == NULL)
5228 return(NULL);
5229
5230 if (encoding != NULL) {
5231 xmlCharEncoding enc;
5232 xmlCharEncodingHandlerPtr handler;
5233
5234 if (ctxt->input->encoding != NULL)
5235 xmlFree((xmlChar *) ctxt->input->encoding);
5236 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5237
5238 enc = xmlParseCharEncoding(encoding);
5239 /*
5240 * registered set of known encodings
5241 */
5242 if (enc != XML_CHAR_ENCODING_ERROR) {
5243 xmlSwitchEncoding(ctxt, enc);
5244 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5245 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5246 "Unsupported encoding %s\n",
5247 (const xmlChar *) encoding, NULL);
5248 }
5249 } else {
5250 /*
5251 * fallback for unknown encodings
5252 */
5253 handler = xmlFindCharEncodingHandler((const char *) encoding);
5254 if (handler != NULL) {
5255 xmlSwitchToEncoding(ctxt, handler);
5256 } else {
5257 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5258 "Unsupported encoding %s\n",
5259 (const xmlChar *) encoding, NULL);
5260 }
5261 }
5262 }
5263 return(ctxt);
5264}
5265
5266#ifdef LIBXML_PUSH_ENABLED
5267/************************************************************************
5268 * *
5269 * Progressive parsing interfaces *
5270 * *
5271 ************************************************************************/
5272
5273/**
5274 * htmlParseLookupSequence:
5275 * @ctxt: an HTML parser context
5276 * @first: the first char to lookup
5277 * @next: the next char to lookup or zero
5278 * @third: the next char to lookup or zero
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005279 * @ignoreattrval: skip over attribute values
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005280 *
5281 * Try to find if a sequence (first, next, third) or just (first next) or
5282 * (first) is available in the input stream.
5283 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5284 * to avoid rescanning sequences of bytes, it DOES change the state of the
5285 * parser, do not use liberally.
5286 * This is basically similar to xmlParseLookupSequence()
5287 *
5288 * Returns the index to the current parsing point if the full sequence
5289 * is available, -1 otherwise.
5290 */
5291static int
5292htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005293 xmlChar next, xmlChar third, int ignoreattrval)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005294{
5295 int base, len;
5296 htmlParserInputPtr in;
5297 const xmlChar *buf;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005298 int invalue = 0;
5299 char valdellim = 0x0;
5300
5301 in = ctxt->input;
5302 if (in == NULL)
5303 return (-1);
5304
5305 base = in->cur - in->base;
5306 if (base < 0)
5307 return (-1);
5308
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005309 if (ctxt->checkIndex > base) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005310 base = ctxt->checkIndex;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005311 /* Abuse hasPErefs member to restore current state. */
5312 invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5313 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005314
5315 if (in->buf == NULL) {
5316 buf = in->base;
5317 len = in->length;
5318 } else {
5319 buf = xmlBufContent(in->buf->buffer);
5320 len = xmlBufUse(in->buf->buffer);
5321 }
5322
5323 /* take into account the sequence length */
5324 if (third)
5325 len -= 2;
5326 else if (next)
5327 len--;
5328 for (; base < len; base++) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005329 if (ignoreattrval) {
5330 if (buf[base] == '"' || buf[base] == '\'') {
5331 if (invalue) {
5332 if (buf[base] == valdellim) {
5333 invalue = 0;
5334 continue;
5335 }
5336 } else {
5337 valdellim = buf[base];
5338 invalue = 1;
5339 continue;
5340 }
5341 } else if (invalue) {
5342 continue;
5343 }
5344 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005345 if (buf[base] == first) {
5346 if (third != 0) {
5347 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5348 continue;
5349 } else if (next != 0) {
5350 if (buf[base + 1] != next)
5351 continue;
5352 }
5353 ctxt->checkIndex = 0;
5354#ifdef DEBUG_PUSH
5355 if (next == 0)
5356 xmlGenericError(xmlGenericErrorContext,
5357 "HPP: lookup '%c' found at %d\n",
5358 first, base);
5359 else if (third == 0)
5360 xmlGenericError(xmlGenericErrorContext,
5361 "HPP: lookup '%c%c' found at %d\n",
5362 first, next, base);
5363 else
5364 xmlGenericError(xmlGenericErrorContext,
5365 "HPP: lookup '%c%c%c' found at %d\n",
5366 first, next, third, base);
5367#endif
5368 return (base - (in->cur - in->base));
5369 }
5370 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005371 ctxt->checkIndex = base;
5372 /* Abuse hasPErefs member to track current state. */
5373 if (invalue)
5374 ctxt->hasPErefs |= 1;
5375 else
5376 ctxt->hasPErefs &= ~1;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005377#ifdef DEBUG_PUSH
5378 if (next == 0)
5379 xmlGenericError(xmlGenericErrorContext,
5380 "HPP: lookup '%c' failed\n", first);
5381 else if (third == 0)
5382 xmlGenericError(xmlGenericErrorContext,
5383 "HPP: lookup '%c%c' failed\n", first, next);
5384 else
5385 xmlGenericError(xmlGenericErrorContext,
5386 "HPP: lookup '%c%c%c' failed\n", first, next,
5387 third);
5388#endif
5389 return (-1);
5390}
5391
5392/**
Haibo Huangd75f3892021-01-05 21:34:50 -08005393 * htmlParseLookupCommentEnd:
5394 * @ctxt: an HTML parser context
5395 *
5396 * Try to find a comment end tag in the input stream
5397 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5398 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5399 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5400 * to avoid rescanning sequences of bytes, it DOES change the state of the
5401 * parser, do not use liberally.
5402 * This wraps to htmlParseLookupSequence()
5403 *
5404 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5405 */
5406static int
5407htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5408{
5409 int mark = 0;
5410 int cur = CUR_PTR - BASE_PTR;
5411
5412 while (mark >= 0) {
5413 mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5414 if ((mark < 0) ||
5415 (NXT(mark+2) == '>') ||
5416 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5417 return mark;
5418 }
5419 ctxt->checkIndex = cur + mark + 1;
5420 }
5421 return mark;
5422}
5423
5424
5425/**
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005426 * htmlParseTryOrFinish:
5427 * @ctxt: an HTML parser context
5428 * @terminate: last chunk indicator
5429 *
5430 * Try to progress on parsing
5431 *
5432 * Returns zero if no parsing was possible
5433 */
5434static int
5435htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5436 int ret = 0;
5437 htmlParserInputPtr in;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005438 ptrdiff_t avail = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005439 xmlChar cur, next;
5440
5441 htmlParserNodeInfo node_info;
5442
5443#ifdef DEBUG_PUSH
5444 switch (ctxt->instate) {
5445 case XML_PARSER_EOF:
5446 xmlGenericError(xmlGenericErrorContext,
5447 "HPP: try EOF\n"); break;
5448 case XML_PARSER_START:
5449 xmlGenericError(xmlGenericErrorContext,
5450 "HPP: try START\n"); break;
5451 case XML_PARSER_MISC:
5452 xmlGenericError(xmlGenericErrorContext,
5453 "HPP: try MISC\n");break;
5454 case XML_PARSER_COMMENT:
5455 xmlGenericError(xmlGenericErrorContext,
5456 "HPP: try COMMENT\n");break;
5457 case XML_PARSER_PROLOG:
5458 xmlGenericError(xmlGenericErrorContext,
5459 "HPP: try PROLOG\n");break;
5460 case XML_PARSER_START_TAG:
5461 xmlGenericError(xmlGenericErrorContext,
5462 "HPP: try START_TAG\n");break;
5463 case XML_PARSER_CONTENT:
5464 xmlGenericError(xmlGenericErrorContext,
5465 "HPP: try CONTENT\n");break;
5466 case XML_PARSER_CDATA_SECTION:
5467 xmlGenericError(xmlGenericErrorContext,
5468 "HPP: try CDATA_SECTION\n");break;
5469 case XML_PARSER_END_TAG:
5470 xmlGenericError(xmlGenericErrorContext,
5471 "HPP: try END_TAG\n");break;
5472 case XML_PARSER_ENTITY_DECL:
5473 xmlGenericError(xmlGenericErrorContext,
5474 "HPP: try ENTITY_DECL\n");break;
5475 case XML_PARSER_ENTITY_VALUE:
5476 xmlGenericError(xmlGenericErrorContext,
5477 "HPP: try ENTITY_VALUE\n");break;
5478 case XML_PARSER_ATTRIBUTE_VALUE:
5479 xmlGenericError(xmlGenericErrorContext,
5480 "HPP: try ATTRIBUTE_VALUE\n");break;
5481 case XML_PARSER_DTD:
5482 xmlGenericError(xmlGenericErrorContext,
5483 "HPP: try DTD\n");break;
5484 case XML_PARSER_EPILOG:
5485 xmlGenericError(xmlGenericErrorContext,
5486 "HPP: try EPILOG\n");break;
5487 case XML_PARSER_PI:
5488 xmlGenericError(xmlGenericErrorContext,
5489 "HPP: try PI\n");break;
5490 case XML_PARSER_SYSTEM_LITERAL:
5491 xmlGenericError(xmlGenericErrorContext,
5492 "HPP: try SYSTEM_LITERAL\n");break;
5493 }
5494#endif
5495
5496 while (1) {
5497
5498 in = ctxt->input;
5499 if (in == NULL) break;
5500 if (in->buf == NULL)
5501 avail = in->length - (in->cur - in->base);
5502 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005503 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5504 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005505 if ((avail == 0) && (terminate)) {
5506 htmlAutoCloseOnEnd(ctxt);
5507 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5508 /*
5509 * SAX: end of the document processing.
5510 */
5511 ctxt->instate = XML_PARSER_EOF;
5512 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5513 ctxt->sax->endDocument(ctxt->userData);
5514 }
5515 }
5516 if (avail < 1)
5517 goto done;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005518 /*
5519 * This is done to make progress and avoid an infinite loop
5520 * if a parsing attempt was aborted by hitting a NUL byte. After
5521 * changing htmlCurrentChar, this probably isn't necessary anymore.
5522 * We should consider removing this check.
5523 */
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005524 cur = in->cur[0];
5525 if (cur == 0) {
5526 SKIP(1);
5527 continue;
5528 }
5529
5530 switch (ctxt->instate) {
5531 case XML_PARSER_EOF:
5532 /*
5533 * Document parsing is done !
5534 */
5535 goto done;
5536 case XML_PARSER_START:
5537 /*
5538 * Very first chars read from the document flow.
5539 */
5540 cur = in->cur[0];
5541 if (IS_BLANK_CH(cur)) {
5542 SKIP_BLANKS;
5543 if (in->buf == NULL)
5544 avail = in->length - (in->cur - in->base);
5545 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005546 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5547 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005548 }
5549 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5550 ctxt->sax->setDocumentLocator(ctxt->userData,
5551 &xmlDefaultSAXLocator);
5552 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5553 (!ctxt->disableSAX))
5554 ctxt->sax->startDocument(ctxt->userData);
5555
5556 cur = in->cur[0];
5557 next = in->cur[1];
5558 if ((cur == '<') && (next == '!') &&
5559 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5560 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5561 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5562 (UPP(8) == 'E')) {
5563 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005564 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005565 goto done;
5566#ifdef DEBUG_PUSH
5567 xmlGenericError(xmlGenericErrorContext,
5568 "HPP: Parsing internal subset\n");
5569#endif
5570 htmlParseDocTypeDecl(ctxt);
5571 ctxt->instate = XML_PARSER_PROLOG;
5572#ifdef DEBUG_PUSH
5573 xmlGenericError(xmlGenericErrorContext,
5574 "HPP: entering PROLOG\n");
5575#endif
5576 } else {
5577 ctxt->instate = XML_PARSER_MISC;
5578#ifdef DEBUG_PUSH
5579 xmlGenericError(xmlGenericErrorContext,
5580 "HPP: entering MISC\n");
5581#endif
5582 }
5583 break;
5584 case XML_PARSER_MISC:
5585 SKIP_BLANKS;
5586 if (in->buf == NULL)
5587 avail = in->length - (in->cur - in->base);
5588 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005589 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5590 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005591 /*
5592 * no chars in buffer
5593 */
5594 if (avail < 1)
5595 goto done;
5596 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005597 * not enough chars in buffer
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005598 */
5599 if (avail < 2) {
5600 if (!terminate)
5601 goto done;
5602 else
5603 next = ' ';
5604 } else {
5605 next = in->cur[1];
5606 }
5607 cur = in->cur[0];
5608 if ((cur == '<') && (next == '!') &&
5609 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Haibo Huangd75f3892021-01-05 21:34:50 -08005610 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005611 goto done;
5612#ifdef DEBUG_PUSH
5613 xmlGenericError(xmlGenericErrorContext,
5614 "HPP: Parsing Comment\n");
5615#endif
5616 htmlParseComment(ctxt);
5617 ctxt->instate = XML_PARSER_MISC;
5618 } else if ((cur == '<') && (next == '?')) {
5619 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005620 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005621 goto done;
5622#ifdef DEBUG_PUSH
5623 xmlGenericError(xmlGenericErrorContext,
5624 "HPP: Parsing PI\n");
5625#endif
5626 htmlParsePI(ctxt);
5627 ctxt->instate = XML_PARSER_MISC;
5628 } else if ((cur == '<') && (next == '!') &&
5629 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5630 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5631 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5632 (UPP(8) == 'E')) {
5633 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005634 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005635 goto done;
5636#ifdef DEBUG_PUSH
5637 xmlGenericError(xmlGenericErrorContext,
5638 "HPP: Parsing internal subset\n");
5639#endif
5640 htmlParseDocTypeDecl(ctxt);
5641 ctxt->instate = XML_PARSER_PROLOG;
5642#ifdef DEBUG_PUSH
5643 xmlGenericError(xmlGenericErrorContext,
5644 "HPP: entering PROLOG\n");
5645#endif
5646 } else if ((cur == '<') && (next == '!') &&
5647 (avail < 9)) {
5648 goto done;
5649 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005650 ctxt->instate = XML_PARSER_CONTENT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005651#ifdef DEBUG_PUSH
5652 xmlGenericError(xmlGenericErrorContext,
5653 "HPP: entering START_TAG\n");
5654#endif
5655 }
5656 break;
5657 case XML_PARSER_PROLOG:
5658 SKIP_BLANKS;
5659 if (in->buf == NULL)
5660 avail = in->length - (in->cur - in->base);
5661 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005662 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5663 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005664 if (avail < 2)
5665 goto done;
5666 cur = in->cur[0];
5667 next = in->cur[1];
5668 if ((cur == '<') && (next == '!') &&
5669 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Haibo Huangd75f3892021-01-05 21:34:50 -08005670 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005671 goto done;
5672#ifdef DEBUG_PUSH
5673 xmlGenericError(xmlGenericErrorContext,
5674 "HPP: Parsing Comment\n");
5675#endif
5676 htmlParseComment(ctxt);
5677 ctxt->instate = XML_PARSER_PROLOG;
5678 } else if ((cur == '<') && (next == '?')) {
5679 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005680 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005681 goto done;
5682#ifdef DEBUG_PUSH
5683 xmlGenericError(xmlGenericErrorContext,
5684 "HPP: Parsing PI\n");
5685#endif
5686 htmlParsePI(ctxt);
5687 ctxt->instate = XML_PARSER_PROLOG;
5688 } else if ((cur == '<') && (next == '!') &&
5689 (avail < 4)) {
5690 goto done;
5691 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005692 ctxt->instate = XML_PARSER_CONTENT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005693#ifdef DEBUG_PUSH
5694 xmlGenericError(xmlGenericErrorContext,
5695 "HPP: entering START_TAG\n");
5696#endif
5697 }
5698 break;
5699 case XML_PARSER_EPILOG:
5700 if (in->buf == NULL)
5701 avail = in->length - (in->cur - in->base);
5702 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005703 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5704 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005705 if (avail < 1)
5706 goto done;
5707 cur = in->cur[0];
5708 if (IS_BLANK_CH(cur)) {
5709 htmlParseCharData(ctxt);
5710 goto done;
5711 }
5712 if (avail < 2)
5713 goto done;
5714 next = in->cur[1];
5715 if ((cur == '<') && (next == '!') &&
5716 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Haibo Huangd75f3892021-01-05 21:34:50 -08005717 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005718 goto done;
5719#ifdef DEBUG_PUSH
5720 xmlGenericError(xmlGenericErrorContext,
5721 "HPP: Parsing Comment\n");
5722#endif
5723 htmlParseComment(ctxt);
5724 ctxt->instate = XML_PARSER_EPILOG;
5725 } else if ((cur == '<') && (next == '?')) {
5726 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005727 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005728 goto done;
5729#ifdef DEBUG_PUSH
5730 xmlGenericError(xmlGenericErrorContext,
5731 "HPP: Parsing PI\n");
5732#endif
5733 htmlParsePI(ctxt);
5734 ctxt->instate = XML_PARSER_EPILOG;
5735 } else if ((cur == '<') && (next == '!') &&
5736 (avail < 4)) {
5737 goto done;
5738 } else {
5739 ctxt->errNo = XML_ERR_DOCUMENT_END;
5740 ctxt->wellFormed = 0;
5741 ctxt->instate = XML_PARSER_EOF;
5742#ifdef DEBUG_PUSH
5743 xmlGenericError(xmlGenericErrorContext,
5744 "HPP: entering EOF\n");
5745#endif
5746 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5747 ctxt->sax->endDocument(ctxt->userData);
5748 goto done;
5749 }
5750 break;
5751 case XML_PARSER_START_TAG: {
5752 const xmlChar *name;
5753 int failed;
5754 const htmlElemDesc * info;
5755
5756 /*
5757 * no chars in buffer
5758 */
5759 if (avail < 1)
5760 goto done;
5761 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005762 * not enough chars in buffer
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005763 */
5764 if (avail < 2) {
5765 if (!terminate)
5766 goto done;
5767 else
5768 next = ' ';
5769 } else {
5770 next = in->cur[1];
5771 }
5772 cur = in->cur[0];
5773 if (cur != '<') {
5774 ctxt->instate = XML_PARSER_CONTENT;
5775#ifdef DEBUG_PUSH
5776 xmlGenericError(xmlGenericErrorContext,
5777 "HPP: entering CONTENT\n");
5778#endif
5779 break;
5780 }
5781 if (next == '/') {
5782 ctxt->instate = XML_PARSER_END_TAG;
5783 ctxt->checkIndex = 0;
5784#ifdef DEBUG_PUSH
5785 xmlGenericError(xmlGenericErrorContext,
5786 "HPP: entering END_TAG\n");
5787#endif
5788 break;
5789 }
5790 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005791 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005792 goto done;
5793
5794 /* Capture start position */
5795 if (ctxt->record_info) {
5796 node_info.begin_pos = ctxt->input->consumed +
5797 (CUR_PTR - ctxt->input->base);
5798 node_info.begin_line = ctxt->input->line;
5799 }
5800
5801
5802 failed = htmlParseStartTag(ctxt);
5803 name = ctxt->name;
5804 if ((failed == -1) ||
5805 (name == NULL)) {
5806 if (CUR == '>')
5807 NEXT;
5808 break;
5809 }
5810
5811 /*
5812 * Lookup the info for that element.
5813 */
5814 info = htmlTagLookup(name);
5815 if (info == NULL) {
5816 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5817 "Tag %s invalid\n", name, NULL);
5818 }
5819
5820 /*
5821 * Check for an Empty Element labeled the XML/SGML way
5822 */
5823 if ((CUR == '/') && (NXT(1) == '>')) {
5824 SKIP(2);
5825 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5826 ctxt->sax->endElement(ctxt->userData, name);
5827 htmlnamePop(ctxt);
5828 ctxt->instate = XML_PARSER_CONTENT;
5829#ifdef DEBUG_PUSH
5830 xmlGenericError(xmlGenericErrorContext,
5831 "HPP: entering CONTENT\n");
5832#endif
5833 break;
5834 }
5835
5836 if (CUR == '>') {
5837 NEXT;
5838 } else {
5839 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5840 "Couldn't find end of Start Tag %s\n",
5841 name, NULL);
5842
5843 /*
5844 * end of parsing of this node.
5845 */
5846 if (xmlStrEqual(name, ctxt->name)) {
5847 nodePop(ctxt);
5848 htmlnamePop(ctxt);
5849 }
5850
5851 if (ctxt->record_info)
5852 htmlNodeInfoPush(ctxt, &node_info);
5853
5854 ctxt->instate = XML_PARSER_CONTENT;
5855#ifdef DEBUG_PUSH
5856 xmlGenericError(xmlGenericErrorContext,
5857 "HPP: entering CONTENT\n");
5858#endif
5859 break;
5860 }
5861
5862 /*
5863 * Check for an Empty Element from DTD definition
5864 */
5865 if ((info != NULL) && (info->empty)) {
5866 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5867 ctxt->sax->endElement(ctxt->userData, name);
5868 htmlnamePop(ctxt);
5869 }
5870
5871 if (ctxt->record_info)
5872 htmlNodeInfoPush(ctxt, &node_info);
5873
5874 ctxt->instate = XML_PARSER_CONTENT;
5875#ifdef DEBUG_PUSH
5876 xmlGenericError(xmlGenericErrorContext,
5877 "HPP: entering CONTENT\n");
5878#endif
5879 break;
5880 }
5881 case XML_PARSER_CONTENT: {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005882 xmlChar chr[2] = { 0, 0 };
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005883
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005884 /*
5885 * Handle preparsed entities and charRef
5886 */
5887 if (ctxt->token != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005888 chr[0] = (xmlChar) ctxt->token;
5889 htmlCheckParagraph(ctxt);
5890 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5891 ctxt->sax->characters(ctxt->userData, chr, 1);
5892 ctxt->token = 0;
5893 ctxt->checkIndex = 0;
5894 }
5895 if ((avail == 1) && (terminate)) {
5896 cur = in->cur[0];
5897 if ((cur != '<') && (cur != '&')) {
5898 if (ctxt->sax != NULL) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005899 chr[0] = cur;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005900 if (IS_BLANK_CH(cur)) {
5901 if (ctxt->keepBlanks) {
5902 if (ctxt->sax->characters != NULL)
5903 ctxt->sax->characters(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005904 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005905 } else {
5906 if (ctxt->sax->ignorableWhitespace != NULL)
5907 ctxt->sax->ignorableWhitespace(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005908 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005909 }
5910 } else {
5911 htmlCheckParagraph(ctxt);
5912 if (ctxt->sax->characters != NULL)
5913 ctxt->sax->characters(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005914 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005915 }
5916 }
5917 ctxt->token = 0;
5918 ctxt->checkIndex = 0;
5919 in->cur++;
5920 break;
5921 }
5922 }
5923 if (avail < 2)
5924 goto done;
5925 cur = in->cur[0];
5926 next = in->cur[1];
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005927 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5928 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5929 /*
5930 * Handle SCRIPT/STYLE separately
5931 */
5932 if (!terminate) {
5933 int idx;
5934 xmlChar val;
5935
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005936 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005937 if (idx < 0)
5938 goto done;
5939 val = in->cur[idx + 2];
5940 if (val == 0) /* bad cut of input */
5941 goto done;
5942 }
5943 htmlParseScript(ctxt);
5944 if ((cur == '<') && (next == '/')) {
5945 ctxt->instate = XML_PARSER_END_TAG;
5946 ctxt->checkIndex = 0;
5947#ifdef DEBUG_PUSH
5948 xmlGenericError(xmlGenericErrorContext,
5949 "HPP: entering END_TAG\n");
5950#endif
5951 break;
5952 }
5953 } else {
5954 /*
5955 * Sometimes DOCTYPE arrives in the middle of the document
5956 */
5957 if ((cur == '<') && (next == '!') &&
5958 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5959 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5960 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5961 (UPP(8) == 'E')) {
5962 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005963 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005964 goto done;
5965 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5966 "Misplaced DOCTYPE declaration\n",
5967 BAD_CAST "DOCTYPE" , NULL);
5968 htmlParseDocTypeDecl(ctxt);
5969 } else if ((cur == '<') && (next == '!') &&
5970 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Haibo Huangd75f3892021-01-05 21:34:50 -08005971 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005972 goto done;
5973#ifdef DEBUG_PUSH
5974 xmlGenericError(xmlGenericErrorContext,
5975 "HPP: Parsing Comment\n");
5976#endif
5977 htmlParseComment(ctxt);
5978 ctxt->instate = XML_PARSER_CONTENT;
5979 } else if ((cur == '<') && (next == '?')) {
5980 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005981 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005982 goto done;
5983#ifdef DEBUG_PUSH
5984 xmlGenericError(xmlGenericErrorContext,
5985 "HPP: Parsing PI\n");
5986#endif
5987 htmlParsePI(ctxt);
5988 ctxt->instate = XML_PARSER_CONTENT;
5989 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5990 goto done;
5991 } else if ((cur == '<') && (next == '/')) {
5992 ctxt->instate = XML_PARSER_END_TAG;
5993 ctxt->checkIndex = 0;
5994#ifdef DEBUG_PUSH
5995 xmlGenericError(xmlGenericErrorContext,
5996 "HPP: entering END_TAG\n");
5997#endif
5998 break;
Elliott Hughesecdab2a2022-02-23 14:33:50 -08005999 } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006000 if ((!terminate) && (next == 0))
6001 goto done;
Elliott Hughesecdab2a2022-02-23 14:33:50 -08006002 ctxt->instate = XML_PARSER_START_TAG;
6003 ctxt->checkIndex = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006004#ifdef DEBUG_PUSH
Elliott Hughesecdab2a2022-02-23 14:33:50 -08006005 xmlGenericError(xmlGenericErrorContext,
6006 "HPP: entering START_TAG\n");
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006007#endif
6008 break;
Elliott Hughesecdab2a2022-02-23 14:33:50 -08006009 } else if (cur == '<') {
6010 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
6011 (ctxt->sax->characters != NULL))
6012 ctxt->sax->characters(ctxt->userData,
6013 BAD_CAST "<", 1);
6014 NEXT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006015 } else {
6016 /*
6017 * check that the text sequence is complete
6018 * before handing out the data to the parser
6019 * to avoid problems with erroneous end of
6020 * data detection.
6021 */
6022 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006023 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006024 goto done;
6025 ctxt->checkIndex = 0;
6026#ifdef DEBUG_PUSH
6027 xmlGenericError(xmlGenericErrorContext,
6028 "HPP: Parsing char data\n");
6029#endif
Haibo Huangca689272021-02-09 16:43:43 -08006030 while ((ctxt->instate != XML_PARSER_EOF) &&
6031 (cur != '<') && (in->cur < in->end)) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006032 if (cur == '&') {
6033 htmlParseReference(ctxt);
6034 } else {
6035 htmlParseCharData(ctxt);
6036 }
6037 cur = in->cur[0];
6038 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006039 }
6040 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006041
6042 break;
6043 }
6044 case XML_PARSER_END_TAG:
6045 if (avail < 2)
6046 goto done;
6047 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006048 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006049 goto done;
6050 htmlParseEndTag(ctxt);
6051 if (ctxt->nameNr == 0) {
6052 ctxt->instate = XML_PARSER_EPILOG;
6053 } else {
6054 ctxt->instate = XML_PARSER_CONTENT;
6055 }
6056 ctxt->checkIndex = 0;
6057#ifdef DEBUG_PUSH
6058 xmlGenericError(xmlGenericErrorContext,
6059 "HPP: entering CONTENT\n");
6060#endif
6061 break;
6062 case XML_PARSER_CDATA_SECTION:
6063 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6064 "HPP: internal error, state == CDATA\n",
6065 NULL, NULL);
6066 ctxt->instate = XML_PARSER_CONTENT;
6067 ctxt->checkIndex = 0;
6068#ifdef DEBUG_PUSH
6069 xmlGenericError(xmlGenericErrorContext,
6070 "HPP: entering CONTENT\n");
6071#endif
6072 break;
6073 case XML_PARSER_DTD:
6074 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6075 "HPP: internal error, state == DTD\n",
6076 NULL, NULL);
6077 ctxt->instate = XML_PARSER_CONTENT;
6078 ctxt->checkIndex = 0;
6079#ifdef DEBUG_PUSH
6080 xmlGenericError(xmlGenericErrorContext,
6081 "HPP: entering CONTENT\n");
6082#endif
6083 break;
6084 case XML_PARSER_COMMENT:
6085 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6086 "HPP: internal error, state == COMMENT\n",
6087 NULL, NULL);
6088 ctxt->instate = XML_PARSER_CONTENT;
6089 ctxt->checkIndex = 0;
6090#ifdef DEBUG_PUSH
6091 xmlGenericError(xmlGenericErrorContext,
6092 "HPP: entering CONTENT\n");
6093#endif
6094 break;
6095 case XML_PARSER_PI:
6096 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6097 "HPP: internal error, state == PI\n",
6098 NULL, NULL);
6099 ctxt->instate = XML_PARSER_CONTENT;
6100 ctxt->checkIndex = 0;
6101#ifdef DEBUG_PUSH
6102 xmlGenericError(xmlGenericErrorContext,
6103 "HPP: entering CONTENT\n");
6104#endif
6105 break;
6106 case XML_PARSER_ENTITY_DECL:
6107 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6108 "HPP: internal error, state == ENTITY_DECL\n",
6109 NULL, NULL);
6110 ctxt->instate = XML_PARSER_CONTENT;
6111 ctxt->checkIndex = 0;
6112#ifdef DEBUG_PUSH
6113 xmlGenericError(xmlGenericErrorContext,
6114 "HPP: entering CONTENT\n");
6115#endif
6116 break;
6117 case XML_PARSER_ENTITY_VALUE:
6118 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6119 "HPP: internal error, state == ENTITY_VALUE\n",
6120 NULL, NULL);
6121 ctxt->instate = XML_PARSER_CONTENT;
6122 ctxt->checkIndex = 0;
6123#ifdef DEBUG_PUSH
6124 xmlGenericError(xmlGenericErrorContext,
6125 "HPP: entering DTD\n");
6126#endif
6127 break;
6128 case XML_PARSER_ATTRIBUTE_VALUE:
6129 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6130 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6131 NULL, NULL);
6132 ctxt->instate = XML_PARSER_START_TAG;
6133 ctxt->checkIndex = 0;
6134#ifdef DEBUG_PUSH
6135 xmlGenericError(xmlGenericErrorContext,
6136 "HPP: entering START_TAG\n");
6137#endif
6138 break;
6139 case XML_PARSER_SYSTEM_LITERAL:
6140 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6141 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6142 NULL, NULL);
6143 ctxt->instate = XML_PARSER_CONTENT;
6144 ctxt->checkIndex = 0;
6145#ifdef DEBUG_PUSH
6146 xmlGenericError(xmlGenericErrorContext,
6147 "HPP: entering CONTENT\n");
6148#endif
6149 break;
6150 case XML_PARSER_IGNORE:
6151 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6152 "HPP: internal error, state == XML_PARSER_IGNORE\n",
6153 NULL, NULL);
6154 ctxt->instate = XML_PARSER_CONTENT;
6155 ctxt->checkIndex = 0;
6156#ifdef DEBUG_PUSH
6157 xmlGenericError(xmlGenericErrorContext,
6158 "HPP: entering CONTENT\n");
6159#endif
6160 break;
6161 case XML_PARSER_PUBLIC_LITERAL:
6162 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6163 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6164 NULL, NULL);
6165 ctxt->instate = XML_PARSER_CONTENT;
6166 ctxt->checkIndex = 0;
6167#ifdef DEBUG_PUSH
6168 xmlGenericError(xmlGenericErrorContext,
6169 "HPP: entering CONTENT\n");
6170#endif
6171 break;
6172
6173 }
6174 }
6175done:
6176 if ((avail == 0) && (terminate)) {
6177 htmlAutoCloseOnEnd(ctxt);
6178 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6179 /*
6180 * SAX: end of the document processing.
6181 */
6182 ctxt->instate = XML_PARSER_EOF;
6183 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6184 ctxt->sax->endDocument(ctxt->userData);
6185 }
6186 }
6187 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6188 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6189 (ctxt->instate == XML_PARSER_EPILOG))) {
6190 xmlDtdPtr dtd;
6191 dtd = xmlGetIntSubset(ctxt->myDoc);
6192 if (dtd == NULL)
6193 ctxt->myDoc->intSubset =
6194 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6195 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6196 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6197 }
6198#ifdef DEBUG_PUSH
6199 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6200#endif
6201 return(ret);
6202}
6203
6204/**
6205 * htmlParseChunk:
6206 * @ctxt: an HTML parser context
6207 * @chunk: an char array
6208 * @size: the size in byte of the chunk
6209 * @terminate: last chunk indicator
6210 *
6211 * Parse a Chunk of memory
6212 *
6213 * Returns zero if no error, the xmlParserErrors otherwise.
6214 */
6215int
6216htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6217 int terminate) {
6218 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6219 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6220 "htmlParseChunk: context error\n", NULL, NULL);
6221 return(XML_ERR_INTERNAL_ERROR);
6222 }
6223 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6224 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6225 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6226 size_t cur = ctxt->input->cur - ctxt->input->base;
6227 int res;
6228
6229 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006230 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006231 if (res < 0) {
6232 ctxt->errNo = XML_PARSER_EOF;
6233 ctxt->disableSAX = 1;
6234 return (XML_PARSER_EOF);
6235 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006236#ifdef DEBUG_PUSH
6237 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6238#endif
6239
6240#if 0
6241 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6242 htmlParseTryOrFinish(ctxt, terminate);
6243#endif
6244 } else if (ctxt->instate != XML_PARSER_EOF) {
6245 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6246 xmlParserInputBufferPtr in = ctxt->input->buf;
6247 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6248 (in->raw != NULL)) {
6249 int nbchars;
6250 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6251 size_t current = ctxt->input->cur - ctxt->input->base;
6252
6253 nbchars = xmlCharEncInput(in, terminate);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006254 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006255 if (nbchars < 0) {
6256 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6257 "encoder error\n", NULL, NULL);
6258 return(XML_ERR_INVALID_ENCODING);
6259 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006260 }
6261 }
6262 }
6263 htmlParseTryOrFinish(ctxt, terminate);
6264 if (terminate) {
6265 if ((ctxt->instate != XML_PARSER_EOF) &&
6266 (ctxt->instate != XML_PARSER_EPILOG) &&
6267 (ctxt->instate != XML_PARSER_MISC)) {
6268 ctxt->errNo = XML_ERR_DOCUMENT_END;
6269 ctxt->wellFormed = 0;
6270 }
6271 if (ctxt->instate != XML_PARSER_EOF) {
6272 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6273 ctxt->sax->endDocument(ctxt->userData);
6274 }
6275 ctxt->instate = XML_PARSER_EOF;
6276 }
6277 return((xmlParserErrors) ctxt->errNo);
6278}
6279
6280/************************************************************************
6281 * *
6282 * User entry points *
6283 * *
6284 ************************************************************************/
6285
6286/**
6287 * htmlCreatePushParserCtxt:
6288 * @sax: a SAX handler
6289 * @user_data: The user data returned on SAX callbacks
6290 * @chunk: a pointer to an array of chars
6291 * @size: number of chars in the array
6292 * @filename: an optional file name or URI
6293 * @enc: an optional encoding
6294 *
6295 * Create a parser context for using the HTML parser in push mode
6296 * The value of @filename is used for fetching external entities
6297 * and error/warning reports.
6298 *
6299 * Returns the new parser context or NULL
6300 */
6301htmlParserCtxtPtr
6302htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6303 const char *chunk, int size, const char *filename,
6304 xmlCharEncoding enc) {
6305 htmlParserCtxtPtr ctxt;
6306 htmlParserInputPtr inputStream;
6307 xmlParserInputBufferPtr buf;
6308
6309 xmlInitParser();
6310
6311 buf = xmlAllocParserInputBuffer(enc);
6312 if (buf == NULL) return(NULL);
6313
6314 ctxt = htmlNewParserCtxt();
6315 if (ctxt == NULL) {
6316 xmlFreeParserInputBuffer(buf);
6317 return(NULL);
6318 }
6319 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6320 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6321 if (sax != NULL) {
6322 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6323 xmlFree(ctxt->sax);
6324 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6325 if (ctxt->sax == NULL) {
6326 xmlFree(buf);
6327 xmlFree(ctxt);
6328 return(NULL);
6329 }
6330 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6331 if (user_data != NULL)
6332 ctxt->userData = user_data;
6333 }
6334 if (filename == NULL) {
6335 ctxt->directory = NULL;
6336 } else {
6337 ctxt->directory = xmlParserGetDirectory(filename);
6338 }
6339
6340 inputStream = htmlNewInputStream(ctxt);
6341 if (inputStream == NULL) {
6342 xmlFreeParserCtxt(ctxt);
6343 xmlFree(buf);
6344 return(NULL);
6345 }
6346
6347 if (filename == NULL)
6348 inputStream->filename = NULL;
6349 else
6350 inputStream->filename = (char *)
6351 xmlCanonicPath((const xmlChar *) filename);
6352 inputStream->buf = buf;
6353 xmlBufResetInput(buf->buffer, inputStream);
6354
6355 inputPush(ctxt, inputStream);
6356
6357 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6358 (ctxt->input->buf != NULL)) {
6359 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6360 size_t cur = ctxt->input->cur - ctxt->input->base;
6361
6362 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6363
6364 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6365#ifdef DEBUG_PUSH
6366 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6367#endif
6368 }
6369 ctxt->progressive = 1;
6370
6371 return(ctxt);
6372}
6373#endif /* LIBXML_PUSH_ENABLED */
6374
6375/**
6376 * htmlSAXParseDoc:
6377 * @cur: a pointer to an array of xmlChar
6378 * @encoding: a free form C string describing the HTML document encoding, or NULL
6379 * @sax: the SAX handler block
6380 * @userData: if using SAX, this pointer will be provided on callbacks.
6381 *
6382 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6383 * to handle parse events. If sax is NULL, fallback to the default DOM
6384 * behavior and return a tree.
6385 *
6386 * Returns the resulting document tree unless SAX is NULL or the document is
6387 * not well formed.
6388 */
6389
6390htmlDocPtr
6391htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6392 htmlSAXHandlerPtr sax, void *userData) {
6393 htmlDocPtr ret;
6394 htmlParserCtxtPtr ctxt;
6395
6396 xmlInitParser();
6397
6398 if (cur == NULL) return(NULL);
6399
6400
6401 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6402 if (ctxt == NULL) return(NULL);
6403 if (sax != NULL) {
6404 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6405 ctxt->sax = sax;
6406 ctxt->userData = userData;
6407 }
6408
6409 htmlParseDocument(ctxt);
6410 ret = ctxt->myDoc;
6411 if (sax != NULL) {
6412 ctxt->sax = NULL;
6413 ctxt->userData = NULL;
6414 }
6415 htmlFreeParserCtxt(ctxt);
6416
6417 return(ret);
6418}
6419
6420/**
6421 * htmlParseDoc:
6422 * @cur: a pointer to an array of xmlChar
6423 * @encoding: a free form C string describing the HTML document encoding, or NULL
6424 *
6425 * parse an HTML in-memory document and build a tree.
6426 *
6427 * Returns the resulting document tree
6428 */
6429
6430htmlDocPtr
6431htmlParseDoc(const xmlChar *cur, const char *encoding) {
6432 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6433}
6434
6435
6436/**
6437 * htmlCreateFileParserCtxt:
6438 * @filename: the filename
6439 * @encoding: a free form C string describing the HTML document encoding, or NULL
6440 *
6441 * Create a parser context for a file content.
6442 * Automatic support for ZLIB/Compress compressed document is provided
6443 * by default if found at compile-time.
6444 *
6445 * Returns the new parser context or NULL
6446 */
6447htmlParserCtxtPtr
6448htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6449{
6450 htmlParserCtxtPtr ctxt;
6451 htmlParserInputPtr inputStream;
6452 char *canonicFilename;
6453 /* htmlCharEncoding enc; */
6454 xmlChar *content, *content_line = (xmlChar *) "charset=";
6455
6456 if (filename == NULL)
6457 return(NULL);
6458
6459 ctxt = htmlNewParserCtxt();
6460 if (ctxt == NULL) {
6461 return(NULL);
6462 }
6463 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6464 if (canonicFilename == NULL) {
6465#ifdef LIBXML_SAX1_ENABLED
6466 if (xmlDefaultSAXHandler.error != NULL) {
6467 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6468 }
6469#endif
6470 xmlFreeParserCtxt(ctxt);
6471 return(NULL);
6472 }
6473
6474 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6475 xmlFree(canonicFilename);
6476 if (inputStream == NULL) {
6477 xmlFreeParserCtxt(ctxt);
6478 return(NULL);
6479 }
6480
6481 inputPush(ctxt, inputStream);
6482
6483 /* set encoding */
6484 if (encoding) {
6485 size_t l = strlen(encoding);
6486
6487 if (l < 1000) {
6488 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6489 if (content) {
6490 strcpy ((char *)content, (char *)content_line);
6491 strcat ((char *)content, (char *)encoding);
6492 htmlCheckEncoding (ctxt, content);
6493 xmlFree (content);
6494 }
6495 }
6496 }
6497
6498 return(ctxt);
6499}
6500
6501/**
6502 * htmlSAXParseFile:
6503 * @filename: the filename
6504 * @encoding: a free form C string describing the HTML document encoding, or NULL
6505 * @sax: the SAX handler block
6506 * @userData: if using SAX, this pointer will be provided on callbacks.
6507 *
6508 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6509 * compressed document is provided by default if found at compile-time.
6510 * It use the given SAX function block to handle the parsing callback.
6511 * If sax is NULL, fallback to the default DOM tree building routines.
6512 *
6513 * Returns the resulting document tree unless SAX is NULL or the document is
6514 * not well formed.
6515 */
6516
6517htmlDocPtr
6518htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6519 void *userData) {
6520 htmlDocPtr ret;
6521 htmlParserCtxtPtr ctxt;
6522 htmlSAXHandlerPtr oldsax = NULL;
6523
6524 xmlInitParser();
6525
6526 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6527 if (ctxt == NULL) return(NULL);
6528 if (sax != NULL) {
6529 oldsax = ctxt->sax;
6530 ctxt->sax = sax;
6531 ctxt->userData = userData;
6532 }
6533
6534 htmlParseDocument(ctxt);
6535
6536 ret = ctxt->myDoc;
6537 if (sax != NULL) {
6538 ctxt->sax = oldsax;
6539 ctxt->userData = NULL;
6540 }
6541 htmlFreeParserCtxt(ctxt);
6542
6543 return(ret);
6544}
6545
6546/**
6547 * htmlParseFile:
6548 * @filename: the filename
6549 * @encoding: a free form C string describing the HTML document encoding, or NULL
6550 *
6551 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6552 * compressed document is provided by default if found at compile-time.
6553 *
6554 * Returns the resulting document tree
6555 */
6556
6557htmlDocPtr
6558htmlParseFile(const char *filename, const char *encoding) {
6559 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6560}
6561
6562/**
6563 * htmlHandleOmittedElem:
6564 * @val: int 0 or 1
6565 *
6566 * Set and return the previous value for handling HTML omitted tags.
6567 *
6568 * Returns the last value for 0 for no handling, 1 for auto insertion.
6569 */
6570
6571int
6572htmlHandleOmittedElem(int val) {
6573 int old = htmlOmittedDefaultValue;
6574
6575 htmlOmittedDefaultValue = val;
6576 return(old);
6577}
6578
6579/**
6580 * htmlElementAllowedHere:
6581 * @parent: HTML parent element
6582 * @elt: HTML element
6583 *
6584 * Checks whether an HTML element may be a direct child of a parent element.
6585 * Note - doesn't check for deprecated elements
6586 *
6587 * Returns 1 if allowed; 0 otherwise.
6588 */
6589int
6590htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6591 const char** p ;
6592
6593 if ( ! elt || ! parent || ! parent->subelts )
6594 return 0 ;
6595
6596 for ( p = parent->subelts; *p; ++p )
6597 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6598 return 1 ;
6599
6600 return 0 ;
6601}
6602/**
6603 * htmlElementStatusHere:
6604 * @parent: HTML parent element
6605 * @elt: HTML element
6606 *
6607 * Checks whether an HTML element may be a direct child of a parent element.
6608 * and if so whether it is valid or deprecated.
6609 *
6610 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6611 */
6612htmlStatus
6613htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6614 if ( ! parent || ! elt )
6615 return HTML_INVALID ;
6616 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6617 return HTML_INVALID ;
6618
6619 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6620}
6621/**
6622 * htmlAttrAllowed:
6623 * @elt: HTML element
6624 * @attr: HTML attribute
6625 * @legacy: whether to allow deprecated attributes
6626 *
6627 * Checks whether an attribute is valid for an element
6628 * Has full knowledge of Required and Deprecated attributes
6629 *
6630 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6631 */
6632htmlStatus
6633htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6634 const char** p ;
6635
6636 if ( !elt || ! attr )
6637 return HTML_INVALID ;
6638
6639 if ( elt->attrs_req )
6640 for ( p = elt->attrs_req; *p; ++p)
6641 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6642 return HTML_REQUIRED ;
6643
6644 if ( elt->attrs_opt )
6645 for ( p = elt->attrs_opt; *p; ++p)
6646 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6647 return HTML_VALID ;
6648
6649 if ( legacy && elt->attrs_depr )
6650 for ( p = elt->attrs_depr; *p; ++p)
6651 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6652 return HTML_DEPRECATED ;
6653
6654 return HTML_INVALID ;
6655}
6656/**
6657 * htmlNodeStatus:
6658 * @node: an htmlNodePtr in a tree
6659 * @legacy: whether to allow deprecated elements (YES is faster here
6660 * for Element nodes)
6661 *
6662 * Checks whether the tree node is valid. Experimental (the author
6663 * only uses the HTML enhancements in a SAX parser)
6664 *
6665 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6666 * legacy allowed) or htmlElementStatusHere (otherwise).
6667 * for Attribute nodes, a return from htmlAttrAllowed
6668 * for other nodes, HTML_NA (no checks performed)
6669 */
6670htmlStatus
6671htmlNodeStatus(const htmlNodePtr node, int legacy) {
6672 if ( ! node )
6673 return HTML_INVALID ;
6674
6675 switch ( node->type ) {
6676 case XML_ELEMENT_NODE:
6677 return legacy
6678 ? ( htmlElementAllowedHere (
6679 htmlTagLookup(node->parent->name) , node->name
6680 ) ? HTML_VALID : HTML_INVALID )
6681 : htmlElementStatusHere(
6682 htmlTagLookup(node->parent->name) ,
6683 htmlTagLookup(node->name) )
6684 ;
6685 case XML_ATTRIBUTE_NODE:
6686 return htmlAttrAllowed(
6687 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6688 default: return HTML_NA ;
6689 }
6690}
6691/************************************************************************
6692 * *
6693 * New set (2.6.0) of simpler and more flexible APIs *
6694 * *
6695 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A NULL @str is ignored, and a NULL "dict" means the
 * string is always freed.
 *
 * The expansion is a single statement, so the macro can be used as
 * "DICT_FREE(x);" anywhere a statement is allowed, including as the
 * body of an if/else. @str is evaluated more than once.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6707
6708/**
6709 * htmlCtxtReset:
6710 * @ctxt: an HTML parser context
6711 *
6712 * Reset a parser context
6713 */
6714void
6715htmlCtxtReset(htmlParserCtxtPtr ctxt)
6716{
6717 xmlParserInputPtr input;
6718 xmlDictPtr dict;
6719
6720 if (ctxt == NULL)
6721 return;
6722
6723 xmlInitParser();
6724 dict = ctxt->dict;
6725
6726 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6727 xmlFreeInputStream(input);
6728 }
6729 ctxt->inputNr = 0;
6730 ctxt->input = NULL;
6731
6732 ctxt->spaceNr = 0;
6733 if (ctxt->spaceTab != NULL) {
6734 ctxt->spaceTab[0] = -1;
6735 ctxt->space = &ctxt->spaceTab[0];
6736 } else {
6737 ctxt->space = NULL;
6738 }
6739
6740
6741 ctxt->nodeNr = 0;
6742 ctxt->node = NULL;
6743
6744 ctxt->nameNr = 0;
6745 ctxt->name = NULL;
6746
6747 DICT_FREE(ctxt->version);
6748 ctxt->version = NULL;
6749 DICT_FREE(ctxt->encoding);
6750 ctxt->encoding = NULL;
6751 DICT_FREE(ctxt->directory);
6752 ctxt->directory = NULL;
6753 DICT_FREE(ctxt->extSubURI);
6754 ctxt->extSubURI = NULL;
6755 DICT_FREE(ctxt->extSubSystem);
6756 ctxt->extSubSystem = NULL;
6757 if (ctxt->myDoc != NULL)
6758 xmlFreeDoc(ctxt->myDoc);
6759 ctxt->myDoc = NULL;
6760
6761 ctxt->standalone = -1;
6762 ctxt->hasExternalSubset = 0;
6763 ctxt->hasPErefs = 0;
6764 ctxt->html = 1;
6765 ctxt->external = 0;
6766 ctxt->instate = XML_PARSER_START;
6767 ctxt->token = 0;
6768
6769 ctxt->wellFormed = 1;
6770 ctxt->nsWellFormed = 1;
6771 ctxt->disableSAX = 0;
6772 ctxt->valid = 1;
6773 ctxt->vctxt.userData = ctxt;
6774 ctxt->vctxt.error = xmlParserValidityError;
6775 ctxt->vctxt.warning = xmlParserValidityWarning;
6776 ctxt->record_info = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006777 ctxt->checkIndex = 0;
6778 ctxt->inSubset = 0;
6779 ctxt->errNo = XML_ERR_OK;
6780 ctxt->depth = 0;
6781 ctxt->charset = XML_CHAR_ENCODING_NONE;
6782 ctxt->catalogs = NULL;
6783 xmlInitNodeInfoSeq(&ctxt->node_seq);
6784
6785 if (ctxt->attsDefault != NULL) {
6786 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6787 ctxt->attsDefault = NULL;
6788 }
6789 if (ctxt->attsSpecial != NULL) {
6790 xmlHashFree(ctxt->attsSpecial, NULL);
6791 ctxt->attsSpecial = NULL;
6792 }
6793}
6794
6795/**
6796 * htmlCtxtUseOptions:
6797 * @ctxt: an HTML parser context
6798 * @options: a combination of htmlParserOption(s)
6799 *
6800 * Applies the options to the parser context
6801 *
6802 * Returns 0 in case of success, the set of unknown or unimplemented options
6803 * in case of error.
6804 */
6805int
6806htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6807{
6808 if (ctxt == NULL)
6809 return(-1);
6810
6811 if (options & HTML_PARSE_NOWARNING) {
6812 ctxt->sax->warning = NULL;
6813 ctxt->vctxt.warning = NULL;
6814 options -= XML_PARSE_NOWARNING;
6815 ctxt->options |= XML_PARSE_NOWARNING;
6816 }
6817 if (options & HTML_PARSE_NOERROR) {
6818 ctxt->sax->error = NULL;
6819 ctxt->vctxt.error = NULL;
6820 ctxt->sax->fatalError = NULL;
6821 options -= XML_PARSE_NOERROR;
6822 ctxt->options |= XML_PARSE_NOERROR;
6823 }
6824 if (options & HTML_PARSE_PEDANTIC) {
6825 ctxt->pedantic = 1;
6826 options -= XML_PARSE_PEDANTIC;
6827 ctxt->options |= XML_PARSE_PEDANTIC;
6828 } else
6829 ctxt->pedantic = 0;
6830 if (options & XML_PARSE_NOBLANKS) {
6831 ctxt->keepBlanks = 0;
6832 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6833 options -= XML_PARSE_NOBLANKS;
6834 ctxt->options |= XML_PARSE_NOBLANKS;
6835 } else
6836 ctxt->keepBlanks = 1;
6837 if (options & HTML_PARSE_RECOVER) {
6838 ctxt->recovery = 1;
6839 options -= HTML_PARSE_RECOVER;
6840 } else
6841 ctxt->recovery = 0;
6842 if (options & HTML_PARSE_COMPACT) {
6843 ctxt->options |= HTML_PARSE_COMPACT;
6844 options -= HTML_PARSE_COMPACT;
6845 }
6846 if (options & XML_PARSE_HUGE) {
6847 ctxt->options |= XML_PARSE_HUGE;
6848 options -= XML_PARSE_HUGE;
6849 }
6850 if (options & HTML_PARSE_NODEFDTD) {
6851 ctxt->options |= HTML_PARSE_NODEFDTD;
6852 options -= HTML_PARSE_NODEFDTD;
6853 }
6854 if (options & HTML_PARSE_IGNORE_ENC) {
6855 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6856 options -= HTML_PARSE_IGNORE_ENC;
6857 }
6858 if (options & HTML_PARSE_NOIMPLIED) {
6859 ctxt->options |= HTML_PARSE_NOIMPLIED;
6860 options -= HTML_PARSE_NOIMPLIED;
6861 }
6862 ctxt->dictNames = 0;
6863 return (options);
6864}
6865
6866/**
6867 * htmlDoRead:
6868 * @ctxt: an HTML parser context
6869 * @URL: the base URL to use for the document
6870 * @encoding: the document encoding, or NULL
6871 * @options: a combination of htmlParserOption(s)
6872 * @reuse: keep the context for reuse
6873 *
6874 * Common front-end for the htmlRead functions
6875 *
6876 * Returns the resulting document tree or NULL
6877 */
6878static htmlDocPtr
6879htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6880 int options, int reuse)
6881{
6882 htmlDocPtr ret;
6883
6884 htmlCtxtUseOptions(ctxt, options);
6885 ctxt->html = 1;
6886 if (encoding != NULL) {
6887 xmlCharEncodingHandlerPtr hdlr;
6888
6889 hdlr = xmlFindCharEncodingHandler(encoding);
6890 if (hdlr != NULL) {
6891 xmlSwitchToEncoding(ctxt, hdlr);
6892 if (ctxt->input->encoding != NULL)
6893 xmlFree((xmlChar *) ctxt->input->encoding);
6894 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6895 }
6896 }
6897 if ((URL != NULL) && (ctxt->input != NULL) &&
6898 (ctxt->input->filename == NULL))
6899 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6900 htmlParseDocument(ctxt);
6901 ret = ctxt->myDoc;
6902 ctxt->myDoc = NULL;
6903 if (!reuse) {
6904 if ((ctxt->dictNames) &&
6905 (ret != NULL) &&
6906 (ret->dict == ctxt->dict))
6907 ctxt->dict = NULL;
6908 xmlFreeParserCtxt(ctxt);
6909 }
6910 return (ret);
6911}
6912
6913/**
6914 * htmlReadDoc:
6915 * @cur: a pointer to a zero terminated string
6916 * @URL: the base URL to use for the document
6917 * @encoding: the document encoding, or NULL
6918 * @options: a combination of htmlParserOption(s)
6919 *
6920 * parse an XML in-memory document and build a tree.
6921 *
6922 * Returns the resulting document tree
6923 */
6924htmlDocPtr
6925htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6926{
6927 htmlParserCtxtPtr ctxt;
6928
6929 if (cur == NULL)
6930 return (NULL);
6931
6932 xmlInitParser();
6933 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6934 if (ctxt == NULL)
6935 return (NULL);
6936 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6937}
6938
6939/**
6940 * htmlReadFile:
6941 * @filename: a file or URL
6942 * @encoding: the document encoding, or NULL
6943 * @options: a combination of htmlParserOption(s)
6944 *
6945 * parse an XML file from the filesystem or the network.
6946 *
6947 * Returns the resulting document tree
6948 */
6949htmlDocPtr
6950htmlReadFile(const char *filename, const char *encoding, int options)
6951{
6952 htmlParserCtxtPtr ctxt;
6953
6954 xmlInitParser();
6955 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6956 if (ctxt == NULL)
6957 return (NULL);
6958 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6959}
6960
6961/**
6962 * htmlReadMemory:
6963 * @buffer: a pointer to a char array
6964 * @size: the size of the array
6965 * @URL: the base URL to use for the document
6966 * @encoding: the document encoding, or NULL
6967 * @options: a combination of htmlParserOption(s)
6968 *
6969 * parse an XML in-memory document and build a tree.
6970 *
6971 * Returns the resulting document tree
6972 */
6973htmlDocPtr
6974htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6975{
6976 htmlParserCtxtPtr ctxt;
6977
6978 xmlInitParser();
6979 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6980 if (ctxt == NULL)
6981 return (NULL);
6982 htmlDefaultSAXHandlerInit();
6983 if (ctxt->sax != NULL)
6984 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6985 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6986}
6987
6988/**
6989 * htmlReadFd:
6990 * @fd: an open file descriptor
6991 * @URL: the base URL to use for the document
6992 * @encoding: the document encoding, or NULL
6993 * @options: a combination of htmlParserOption(s)
6994 *
Elliott Hughesecdab2a2022-02-23 14:33:50 -08006995 * parse an HTML from a file descriptor and build a tree.
6996 * NOTE that the file descriptor will not be closed when the
6997 * reader is closed or reset.
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006998 *
6999 * Returns the resulting document tree
7000 */
7001htmlDocPtr
7002htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7003{
7004 htmlParserCtxtPtr ctxt;
7005 xmlParserInputBufferPtr input;
Elliott Hughesecdab2a2022-02-23 14:33:50 -08007006 htmlParserInputPtr stream;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08007007
7008 if (fd < 0)
7009 return (NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08007010
7011 xmlInitParser();
7012 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7013 if (input == NULL)
7014 return (NULL);
Elliott Hughesecdab2a2022-02-23 14:33:50 -08007015 input->closecallback = NULL;
7016 ctxt = htmlNewParserCtxt();
Elliott Hughes7fbecab2019-01-10 16:42:03 -08007017 if (ctxt == NULL) {
7018 xmlFreeParserInputBuffer(input);
7019 return (NULL);
7020 }
7021 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7022 if (stream == NULL) {
7023 xmlFreeParserInputBuffer(input);
Elliott Hughesecdab2a2022-02-23 14:33:50 -08007024 htmlFreeParserCtxt(ctxt);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08007025 return (NULL);
7026 }
7027 inputPush(ctxt, stream);
7028 return (htmlDoRead(ctxt, URL, encoding, options, 0));
7029}
7030
7031/**
7032 * htmlReadIO:
7033 * @ioread: an I/O read function
7034 * @ioclose: an I/O close function
7035 * @ioctx: an I/O handler
7036 * @URL: the base URL to use for the document
7037 * @encoding: the document encoding, or NULL
7038 * @options: a combination of htmlParserOption(s)
7039 *
7040 * parse an HTML document from I/O functions and source and build a tree.
7041 *
7042 * Returns the resulting document tree
7043 */
7044htmlDocPtr
7045htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7046 void *ioctx, const char *URL, const char *encoding, int options)
7047{
7048 htmlParserCtxtPtr ctxt;
7049 xmlParserInputBufferPtr input;
7050 xmlParserInputPtr stream;
7051
7052 if (ioread == NULL)
7053 return (NULL);
7054 xmlInitParser();
7055
7056 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7057 XML_CHAR_ENCODING_NONE);
7058 if (input == NULL) {
7059 if (ioclose != NULL)
7060 ioclose(ioctx);
7061 return (NULL);
7062 }
7063 ctxt = htmlNewParserCtxt();
7064 if (ctxt == NULL) {
7065 xmlFreeParserInputBuffer(input);
7066 return (NULL);
7067 }
7068 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7069 if (stream == NULL) {
7070 xmlFreeParserInputBuffer(input);
7071 xmlFreeParserCtxt(ctxt);
7072 return (NULL);
7073 }
7074 inputPush(ctxt, stream);
7075 return (htmlDoRead(ctxt, URL, encoding, options, 0));
7076}
7077
7078/**
7079 * htmlCtxtReadDoc:
7080 * @ctxt: an HTML parser context
7081 * @cur: a pointer to a zero terminated string
7082 * @URL: the base URL to use for the document
7083 * @encoding: the document encoding, or NULL
7084 * @options: a combination of htmlParserOption(s)
7085 *
7086 * parse an XML in-memory document and build a tree.
7087 * This reuses the existing @ctxt parser context
7088 *
7089 * Returns the resulting document tree
7090 */
7091htmlDocPtr
7092htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7093 const char *URL, const char *encoding, int options)
7094{
7095 xmlParserInputPtr stream;
7096
7097 if (cur == NULL)
7098 return (NULL);
7099 if (ctxt == NULL)
7100 return (NULL);
7101 xmlInitParser();
7102
7103 htmlCtxtReset(ctxt);
7104
7105 stream = xmlNewStringInputStream(ctxt, cur);
7106 if (stream == NULL) {
7107 return (NULL);
7108 }
7109 inputPush(ctxt, stream);
7110 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7111}
7112
7113/**
7114 * htmlCtxtReadFile:
7115 * @ctxt: an HTML parser context
7116 * @filename: a file or URL
7117 * @encoding: the document encoding, or NULL
7118 * @options: a combination of htmlParserOption(s)
7119 *
7120 * parse an XML file from the filesystem or the network.
7121 * This reuses the existing @ctxt parser context
7122 *
7123 * Returns the resulting document tree
7124 */
7125htmlDocPtr
7126htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7127 const char *encoding, int options)
7128{
7129 xmlParserInputPtr stream;
7130
7131 if (filename == NULL)
7132 return (NULL);
7133 if (ctxt == NULL)
7134 return (NULL);
7135 xmlInitParser();
7136
7137 htmlCtxtReset(ctxt);
7138
7139 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7140 if (stream == NULL) {
7141 return (NULL);
7142 }
7143 inputPush(ctxt, stream);
7144 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7145}
7146
7147/**
7148 * htmlCtxtReadMemory:
7149 * @ctxt: an HTML parser context
7150 * @buffer: a pointer to a char array
7151 * @size: the size of the array
7152 * @URL: the base URL to use for the document
7153 * @encoding: the document encoding, or NULL
7154 * @options: a combination of htmlParserOption(s)
7155 *
7156 * parse an XML in-memory document and build a tree.
7157 * This reuses the existing @ctxt parser context
7158 *
7159 * Returns the resulting document tree
7160 */
7161htmlDocPtr
7162htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7163 const char *URL, const char *encoding, int options)
7164{
7165 xmlParserInputBufferPtr input;
7166 xmlParserInputPtr stream;
7167
7168 if (ctxt == NULL)
7169 return (NULL);
7170 if (buffer == NULL)
7171 return (NULL);
7172 xmlInitParser();
7173
7174 htmlCtxtReset(ctxt);
7175
7176 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7177 if (input == NULL) {
7178 return(NULL);
7179 }
7180
7181 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7182 if (stream == NULL) {
7183 xmlFreeParserInputBuffer(input);
7184 return(NULL);
7185 }
7186
7187 inputPush(ctxt, stream);
7188 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7189}
7190
7191/**
7192 * htmlCtxtReadFd:
7193 * @ctxt: an HTML parser context
7194 * @fd: an open file descriptor
7195 * @URL: the base URL to use for the document
7196 * @encoding: the document encoding, or NULL
7197 * @options: a combination of htmlParserOption(s)
7198 *
7199 * parse an XML from a file descriptor and build a tree.
7200 * This reuses the existing @ctxt parser context
7201 *
7202 * Returns the resulting document tree
7203 */
7204htmlDocPtr
7205htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7206 const char *URL, const char *encoding, int options)
7207{
7208 xmlParserInputBufferPtr input;
7209 xmlParserInputPtr stream;
7210
7211 if (fd < 0)
7212 return (NULL);
7213 if (ctxt == NULL)
7214 return (NULL);
7215 xmlInitParser();
7216
7217 htmlCtxtReset(ctxt);
7218
7219
7220 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7221 if (input == NULL)
7222 return (NULL);
7223 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7224 if (stream == NULL) {
7225 xmlFreeParserInputBuffer(input);
7226 return (NULL);
7227 }
7228 inputPush(ctxt, stream);
7229 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7230}
7231
7232/**
7233 * htmlCtxtReadIO:
7234 * @ctxt: an HTML parser context
7235 * @ioread: an I/O read function
7236 * @ioclose: an I/O close function
7237 * @ioctx: an I/O handler
7238 * @URL: the base URL to use for the document
7239 * @encoding: the document encoding, or NULL
7240 * @options: a combination of htmlParserOption(s)
7241 *
7242 * parse an HTML document from I/O functions and source and build a tree.
7243 * This reuses the existing @ctxt parser context
7244 *
7245 * Returns the resulting document tree
7246 */
7247htmlDocPtr
7248htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7249 xmlInputCloseCallback ioclose, void *ioctx,
7250 const char *URL,
7251 const char *encoding, int options)
7252{
7253 xmlParserInputBufferPtr input;
7254 xmlParserInputPtr stream;
7255
7256 if (ioread == NULL)
7257 return (NULL);
7258 if (ctxt == NULL)
7259 return (NULL);
7260 xmlInitParser();
7261
7262 htmlCtxtReset(ctxt);
7263
7264 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7265 XML_CHAR_ENCODING_NONE);
7266 if (input == NULL) {
7267 if (ioclose != NULL)
7268 ioclose(ioctx);
7269 return (NULL);
7270 }
7271 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7272 if (stream == NULL) {
7273 xmlFreeParserInputBuffer(input);
7274 return (NULL);
7275 }
7276 inputPush(ctxt, stream);
7277 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7278}
7279
Elliott Hughes7fbecab2019-01-10 16:42:03 -08007280#endif /* LIBXML_HTML_ENABLED */