/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef LIBXML_ZLIB_ENABLED
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
39#include <libxml/HTMLtree.h>
40#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
44#include <libxml/globals.h>
45#include <libxml/uri.h>
46
47#include "buf.h"
48#include "enc.h"
49
50#define HTML_MAX_NAMELEN 1000
51#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52#define HTML_PARSER_BUFFER_SIZE 100
53
54/* #define DEBUG */
55/* #define DEBUG_PUSH */
56
57static int htmlOmittedDefaultValue = 1;
58
59xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
61static void htmlParseComment(htmlParserCtxtPtr ctxt);
62
63/************************************************************************
64 * *
65 * Some factorized error routines *
66 * *
67 ************************************************************************/
68
69/**
70 * htmlErrMemory:
71 * @ctxt: an HTML parser context
Haibo Huangcfd91dc2020-07-30 23:01:33 -070072 * @extra: extra information
Elliott Hughes7fbecab2019-01-10 16:42:03 -080073 *
74 * Handle a redefinition of attribute error
75 */
76static void
77htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78{
79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
82 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96}
97
98/**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
108static void LIBXML_ATTR_FORMAT(3,0)
109htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111{
112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
115 if (ctxt != NULL)
116 ctxt->errNo = error;
117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
124}
125
126/**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
135static void LIBXML_ATTR_FORMAT(3,0)
136htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138{
139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
142 if (ctxt != NULL)
143 ctxt->errNo = error;
144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
149}
150
151/************************************************************************
152 * *
153 * Parser stacks related functions and macros *
154 * *
155 ************************************************************************/
156
157/**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
165 */
166static int
167htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168{
169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
175 ctxt->nameTab = (const xmlChar * *)
176 xmlRealloc((xmlChar * *)ctxt->nameTab,
177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
180 htmlErrMemory(ctxt, NULL);
181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187}
188/**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
196static const xmlChar *
197htmlnamePop(htmlParserCtxtPtr ctxt)
198{
199 const xmlChar *ret;
200
201 if (ctxt->nameNr <= 0)
202 return (NULL);
203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
205 return (NULL);
206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
211 ctxt->nameTab[ctxt->nameNr] = NULL;
212 return (ret);
213}
214
215/**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224static int
225htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226{
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243}
244
245/**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253static htmlParserNodeInfo *
254htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255{
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266}
267
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. Every macro expects a parser context variable named
 * `ctxt` to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *
 * SHRINK, GROW and COPY_BUF expand to a bare `if` statement: only use them
 * as a full statement inside braces, never as the body of an unbraced
 * if/else.
 */

#define UPPER (toupper(*ctxt->input->cur))

/* Parenthesized so the comma expression is a single operand everywhere. */
#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Advance by l bytes, updating line/col from the byte being left and
 * clearing any pending token.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l);				\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append char v, encoded on l bytes, to buffer b at index i and advance i.
 * Arguments are evaluated more than once: pass plain variables only.
 */
#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
348
349/**
350 * htmlFindEncoding:
351 * @the HTML parser context
352 *
353 * Ty to find and encoding in the current data available in the input
354 * buffer this is needed to try to switch to the proper encoding when
355 * one face a character error.
356 * That's an heuristic, since it's operating outside of parsing it could
357 * try to use a meta which had been commented out, that's the reason it
358 * should only be used in case of error, not as a default.
359 *
360 * Returns an encoding string or NULL if not found, the string need to
361 * be freed
362 */
363static xmlChar *
364htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365 const xmlChar *start, *cur, *end;
366
367 if ((ctxt == NULL) || (ctxt->input == NULL) ||
368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369 (ctxt->input->buf->encoder != NULL))
370 return(NULL);
371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372 return(NULL);
373
374 start = ctxt->input->cur;
375 end = ctxt->input->end;
376 /* we also expect the input buffer to be zero terminated */
377 if (*end != 0)
378 return(NULL);
379
380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381 if (cur == NULL)
382 return(NULL);
383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384 if (cur == NULL)
385 return(NULL);
386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387 if (cur == NULL)
388 return(NULL);
389 cur += 8;
390 start = cur;
391 while (((*cur >= 'A') && (*cur <= 'Z')) ||
392 ((*cur >= 'a') && (*cur <= 'z')) ||
393 ((*cur >= '0') && (*cur <= '9')) ||
394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395 cur++;
396 if (cur == start)
397 return(NULL);
398 return(xmlStrndup(start, cur - start));
399}
400
401/**
402 * htmlCurrentChar:
403 * @ctxt: the HTML parser context
404 * @len: pointer to the length of the char read
405 *
406 * The current char value, if using UTF-8 this may actually span multiple
407 * bytes in the input buffer. Implement the end of line normalization:
408 * 2.11 End-of-Line Handling
409 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410 * char, then the encoding converter is plugged in automatically.
411 *
412 * Returns the current char value and its length
413 */
414
415static int
416htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700417 const unsigned char *cur;
418 unsigned char c;
419 unsigned int val;
420
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800421 if (ctxt->instate == XML_PARSER_EOF)
422 return(0);
423
424 if (ctxt->token != 0) {
425 *len = 0;
426 return(ctxt->token);
427 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700428 if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800429 xmlChar * guess;
430 xmlCharEncodingHandlerPtr handler;
431
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700432 /*
433 * Assume it's a fixed length encoding (1) with
434 * a compatible encoding for the ASCII set, since
435 * HTML constructs only use < 128 chars
436 */
437 if ((int) *ctxt->input->cur < 0x80) {
438 *len = 1;
439 if ((*ctxt->input->cur == 0) &&
440 (ctxt->input->cur < ctxt->input->end)) {
441 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442 "Char 0x%X out of allowed range\n", 0);
443 return(' ');
444 }
445 return((int) *ctxt->input->cur);
446 }
447
448 /*
449 * Humm this is bad, do an automatic flow conversion
450 */
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800451 guess = htmlFindEncoding(ctxt);
452 if (guess == NULL) {
453 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
454 } else {
455 if (ctxt->input->encoding != NULL)
456 xmlFree((xmlChar *) ctxt->input->encoding);
457 ctxt->input->encoding = guess;
458 handler = xmlFindCharEncodingHandler((const char *) guess);
459 if (handler != NULL) {
Haibo Huang735158e2021-02-23 17:48:08 -0800460 /*
461 * Don't use UTF-8 encoder which isn't required and
462 * can produce invalid UTF-8.
463 */
464 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
465 xmlSwitchToEncoding(ctxt, handler);
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800466 } else {
467 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
468 "Unsupported encoding %s", guess, NULL);
469 }
470 }
471 ctxt->charset = XML_CHAR_ENCODING_UTF8;
472 }
473
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700474 /*
475 * We are supposed to handle UTF8, check it's valid
476 * From rfc2044: encoding of the Unicode values on UTF-8:
477 *
478 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
479 * 0000 0000-0000 007F 0xxxxxxx
480 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
481 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
482 *
483 * Check for the 0x110000 limit too
484 */
485 cur = ctxt->input->cur;
486 c = *cur;
487 if (c & 0x80) {
488 if ((c & 0x40) == 0)
489 goto encoding_error;
490 if (cur[1] == 0) {
491 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
492 cur = ctxt->input->cur;
493 }
494 if ((cur[1] & 0xc0) != 0x80)
495 goto encoding_error;
496 if ((c & 0xe0) == 0xe0) {
497
498 if (cur[2] == 0) {
499 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
500 cur = ctxt->input->cur;
501 }
502 if ((cur[2] & 0xc0) != 0x80)
503 goto encoding_error;
504 if ((c & 0xf0) == 0xf0) {
505 if (cur[3] == 0) {
506 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
507 cur = ctxt->input->cur;
508 }
509 if (((c & 0xf8) != 0xf0) ||
510 ((cur[3] & 0xc0) != 0x80))
511 goto encoding_error;
512 /* 4-byte code */
513 *len = 4;
514 val = (cur[0] & 0x7) << 18;
515 val |= (cur[1] & 0x3f) << 12;
516 val |= (cur[2] & 0x3f) << 6;
517 val |= cur[3] & 0x3f;
518 if (val < 0x10000)
519 goto encoding_error;
520 } else {
521 /* 3-byte code */
522 *len = 3;
523 val = (cur[0] & 0xf) << 12;
524 val |= (cur[1] & 0x3f) << 6;
525 val |= cur[2] & 0x3f;
526 if (val < 0x800)
527 goto encoding_error;
528 }
529 } else {
530 /* 2-byte code */
531 *len = 2;
532 val = (cur[0] & 0x1f) << 6;
533 val |= cur[1] & 0x3f;
534 if (val < 0x80)
535 goto encoding_error;
536 }
537 if (!IS_CHAR(val)) {
538 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
539 "Char 0x%X out of allowed range\n", val);
540 }
541 return(val);
542 } else {
543 if ((*ctxt->input->cur == 0) &&
544 (ctxt->input->cur < ctxt->input->end)) {
545 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
546 "Char 0x%X out of allowed range\n", 0);
547 *len = 1;
548 return(' ');
549 }
550 /* 1-byte code */
551 *len = 1;
552 return((int) *ctxt->input->cur);
553 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800554
555encoding_error:
556 /*
557 * If we detect an UTF8 error that probably mean that the
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700558 * input encoding didn't get properly advertised in the
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800559 * declaration header. Report the error and switch the encoding
560 * to ISO-Latin-1 (if you don't like this policy, just declare the
561 * encoding !)
562 */
563 {
564 char buffer[150];
565
566 if (ctxt->input->end - ctxt->input->cur >= 4) {
567 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
568 ctxt->input->cur[0], ctxt->input->cur[1],
569 ctxt->input->cur[2], ctxt->input->cur[3]);
570 } else {
571 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
572 }
573 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
574 "Input is not proper UTF-8, indicate encoding !\n",
575 BAD_CAST buffer, NULL);
576 }
577
Haibo Huang735158e2021-02-23 17:48:08 -0800578 /*
579 * Don't switch encodings twice. Note that if there's an encoder, we
580 * shouldn't receive invalid UTF-8 anyway.
581 *
582 * Note that if ctxt->input->buf == NULL, switching encodings is
583 * impossible, see Gitlab issue #34.
584 */
585 if ((ctxt->input->buf != NULL) &&
586 (ctxt->input->buf->encoder == NULL))
587 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800588 *len = 1;
589 return((int) *ctxt->input->cur);
590}
591
592/**
593 * htmlSkipBlankChars:
594 * @ctxt: the HTML parser context
595 *
596 * skip all blanks character found at that point in the input streams.
597 *
598 * Returns the number of space chars skipped
599 */
600
601static int
602htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
603 int res = 0;
604
605 while (IS_BLANK_CH(*(ctxt->input->cur))) {
606 if ((*ctxt->input->cur == 0) &&
607 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
608 xmlPopInput(ctxt);
609 } else {
610 if (*(ctxt->input->cur) == '\n') {
611 ctxt->input->line++; ctxt->input->col = 1;
612 } else ctxt->input->col++;
613 ctxt->input->cur++;
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800614 if (*ctxt->input->cur == 0)
615 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
616 }
617 res++;
618 }
619 return(res);
620}
621
622
623
624/************************************************************************
625 * *
626 * The list of HTML elements and their properties *
627 * *
628 ************************************************************************/
629
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
642
/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each group macro expands to a comma-separated list of element names, to
 * be spliced into the NULL-terminated name arrays below. The matching NB_*
 * macro is the number of names in the group; the computed ones are
 * parenthesized so they can be used safely inside larger expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
675#define html_cdata html_pcdata
676
677
/*
 * ... and for HTML Attributes
 *
 * Each group macro expands to a comma-separated list of attribute names;
 * the matching NB_* macro is the number of names in the group.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute name lists built from the groups above. */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
697
698
699/* Other declarations that should go inline ... */
700static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
701 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
702 "tabindex", "onfocus", "onblur", NULL } ;
703static const char* const target_attr[] = { "target", NULL } ;
704static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
705static const char* const alt_attr[] = { "alt", NULL } ;
706static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
707static const char* const href_attrs[] = { "href", NULL } ;
708static const char* const clear_attrs[] = { "clear", NULL } ;
709static const char* const inline_p[] = { INLINE, "p", NULL } ;
710
711static const char* const flow_param[] = { FLOW, "param", NULL } ;
712static const char* const applet_attrs[] = { COREATTRS , "codebase",
713 "archive", "alt", "name", "height", "width", "align",
714 "hspace", "vspace", NULL } ;
715static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
716 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
717static const char* const basefont_attrs[] =
718 { "id", "size", "color", "face", NULL } ;
719static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
720static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
721static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
722static const char* const body_depr[] = { "background", "bgcolor", "text",
723 "link", "vlink", "alink", NULL } ;
724static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
725 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
726
727
728static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
729static const char* const col_elt[] = { "col", NULL } ;
730static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
731static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
732static const char* const dl_contents[] = { "dt", "dd", NULL } ;
733static const char* const compact_attr[] = { "compact", NULL } ;
734static const char* const label_attr[] = { "label", NULL } ;
735static const char* const fieldset_contents[] = { FLOW, "legend" } ;
736static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
737static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
738static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
739static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
740static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
741static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
742static const char* const head_attrs[] = { I18N, "profile", NULL } ;
743static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
744static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
745static const char* const version_attr[] = { "version", NULL } ;
746static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
747static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
748static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
749static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
750static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
751static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
752static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
753static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
754static const char* const align_attr[] = { "align", NULL } ;
755static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
756static const char* const map_contents[] = { BLOCK, "area", NULL } ;
757static const char* const name_attr[] = { "name", NULL } ;
758static const char* const action_attr[] = { "action", NULL } ;
759static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
760static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
761static const char* const content_attr[] = { "content", NULL } ;
762static const char* const type_attr[] = { "type", NULL } ;
763static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
764static const char* const object_contents[] = { FLOW, "param", NULL } ;
765static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
766static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
767static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
768static const char* const option_elt[] = { "option", NULL } ;
769static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
770static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
771static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
772static const char* const width_attr[] = { "width", NULL } ;
773static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
774static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
775static const char* const language_attr[] = { "language", NULL } ;
776static const char* const select_content[] = { "optgroup", "option", NULL } ;
777static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
778static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
779static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
780static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
781static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
782static const char* const tr_elt[] = { "tr", NULL } ;
783static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
784static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
785static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
786static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
787static const char* const tr_contents[] = { "th", "td", NULL } ;
788static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
789static const char* const li_elt[] = { "li", NULL } ;
790static const char* const ul_depr[] = { "type", "compact", NULL} ;
791static const char* const dir_attr[] = { "dir", NULL} ;
792
793#define DECL (const char**)
794
/*
 * Description of every HTML 4.0 element known to the parser, one
 * initializer per element, kept in alphabetical order of the tag name.
 * htmlTagLookup() scans this table linearly and compares names without
 * regard to case; htmlAutoCloseOnClose() reports a tag mismatch when it
 * has to close an element whose endTag field is 3.
 *
 * NOTE(review): htmlElemDesc is declared outside this file. The column
 * order is presumed to be: name, startTag, endTag, saveEndTag, empty,
 * depr, dtd, isinline, desc, then on the second line subelts,
 * defaultsubelt, attrs_opt, attrs_depr, attrs_req - confirm against
 * <libxml/HTMLparser.h> before relying on it.
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
        DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
        DECL inline_p , NULL , DECL html_attrs, NULL, NULL
},
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
        DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
        EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
        EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
        EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
        DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
        DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
        DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
        EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
        DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
        DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
        EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
        DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
        DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
        DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
        DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
        DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
        DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
        EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
        DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
        DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
        DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
        EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
        DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
        DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
        EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
        DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
        DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
        EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
        EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
        DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
        EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
        DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
        DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
        DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
        EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
        DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
        DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
        EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
        DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
        DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
        DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
        DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
        DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
        DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
        EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
        DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
        DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
        DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
        DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
        DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
},
{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "table", 0, 0, 0, 0, 0, 0, 0, "",
        DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
},
{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
        DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
        DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
},
{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
        DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
        DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
},
{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
        DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
        DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};
1074
/*
 * start tags that imply the end of current element
 *
 * The array is a sequence of NULL-terminated groups. The first name of
 * a group is the tag being opened; the names after it are the currently
 * open elements which that start tag closes (see htmlCheckAutoClose()).
 * An extra NULL after the last group ends the whole list.
 * htmlInitAutoClose() records the start of each group in
 * htmlStartCloseIndex, which has room for at most 99 groups.
 */
static const char * const htmlStartClose[] = {
"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head", NULL,
"head", "p", NULL,
"title", "p", NULL,
"body", "head", "style", "link", "title", "p", NULL,
"frameset", "head", "style", "link", "title", "p", NULL,
"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li", NULL,
"hr", "p", "head", NULL,
"h1", "p", "head", NULL,
"h2", "p", "head", NULL,
"h3", "p", "head", NULL,
"h4", "p", "head", NULL,
"h5", "p", "head", NULL,
"h6", "p", "head", NULL,
"dir", "p", "head", NULL,
"address", "p", "head", "ul", NULL,
"pre", "p", "head", "ul", NULL,
"listing", "p", "head", NULL,
"xmp", "p", "head", NULL,
"blockquote", "p", "head", NULL,
"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head", NULL,
"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd", NULL,
"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt", NULL,
"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", NULL,
"ol", "p", "head", "ul", NULL,
"menu", "p", "head", "ul", NULL,
"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div", "p", "head", NULL,
"noscript", "script", NULL,
"center", "font", "b", "i", "p", "head", NULL,
"a", "a", "head", NULL,
"caption", "p", NULL,
"colgroup", "caption", "colgroup", "col", "p", NULL,
"col", "caption", "col", "p", NULL,
"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead", "caption", "col", "colgroup", NULL,
"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
"optgroup", "option", NULL,
"option", "option", NULL,
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
"tt", "head", NULL,
"i", "head", NULL,
"b", "head", NULL,
"u", "head", NULL,
"s", "head", NULL,
"strike", "head", NULL,
"big", "head", NULL,
"small", "head", NULL,

"em", "head", NULL,
"strong", "head", NULL,
"dfn", "head", NULL,
"code", "head", NULL,
"samp", "head", NULL,
"kbd", "head", NULL,
"var", "head", NULL,
"cite", "head", NULL,
"abbr", "head", NULL,
"acronym", "head", NULL,

/* "a" */
"img", "head", NULL,
/* "applet" */
/* "embed" */
/* "object" */
"font", "head", NULL,
/* "basefont" */
"br", "head", NULL,
/* "script" */
"map", "head", NULL,
"q", "head", NULL,
"sub", "head", NULL,
"sup", "head", NULL,
"span", "head", NULL,
"bdo", "head", NULL,
"iframe", "head", NULL,
NULL
};
1172
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * The list is NULL-terminated; htmlCheckParagraph() compares the name of
 * the current element against each entry.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
1185
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 * The array has no NULL terminator: htmlIsScriptAttribute() derives the
 * number of entries with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
1211
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end-tag priority) pair. */
typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* priority used when matching end tags */
} elementPriority;

/*
 * Scanned in order by htmlGetEndPriority(). The last entry has a NULL
 * name: it stops the scan and its priority is the value returned for
 * every element that is not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
1239
/*
 * Index over htmlStartClose filled by htmlInitAutoClose(): each used slot
 * points at the first name of one NULL-terminated group, unused slots
 * are NULL.
 */
static const char** htmlStartCloseIndex[100];
/* Set to 1 once htmlInitAutoClose() has built htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized = 0;
1242
1243/************************************************************************
1244 * *
1245 * functions to handle HTML specific data *
1246 * *
1247 ************************************************************************/
1248
1249/**
1250 * htmlInitAutoClose:
1251 *
1252 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1253 * This is not reentrant. Call xmlInitParser() once before processing in
1254 * case of use in multithreaded programs.
1255 */
1256void
1257htmlInitAutoClose(void) {
1258 int indx, i = 0;
1259
1260 if (htmlStartCloseIndexinitialized) return;
1261
1262 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1263 indx = 0;
1264 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1265 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1266 while (htmlStartClose[i] != NULL) i++;
1267 i++;
1268 }
1269 htmlStartCloseIndexinitialized = 1;
1270}
1271
1272/**
1273 * htmlTagLookup:
1274 * @tag: The tag name in lowercase
1275 *
1276 * Lookup the HTML tag in the ElementTable
1277 *
1278 * Returns the related htmlElemDescPtr or NULL if not found.
1279 */
1280const htmlElemDesc *
1281htmlTagLookup(const xmlChar *tag) {
1282 unsigned int i;
1283
1284 for (i = 0; i < (sizeof(html40ElementTable) /
1285 sizeof(html40ElementTable[0]));i++) {
1286 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1287 return((htmlElemDescPtr) &html40ElementTable[i]);
1288 }
1289 return(NULL);
1290}
1291
1292/**
1293 * htmlGetEndPriority:
1294 * @name: The name of the element to look up the priority for.
1295 *
1296 * Return value: The "endtag" priority.
1297 **/
1298static int
1299htmlGetEndPriority (const xmlChar *name) {
1300 int i = 0;
1301
1302 while ((htmlEndPriority[i].name != NULL) &&
1303 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1304 i++;
1305
1306 return(htmlEndPriority[i].priority);
1307}
1308
1309
1310/**
1311 * htmlCheckAutoClose:
1312 * @newtag: The new tag name
1313 * @oldtag: The old tag name
1314 *
1315 * Checks whether the new tag is one of the registered valid tags for
1316 * closing old.
1317 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1318 *
1319 * Returns 0 if no, 1 if yes.
1320 */
1321static int
1322htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1323{
1324 int i, indx;
1325 const char **closed = NULL;
1326
1327 if (htmlStartCloseIndexinitialized == 0)
1328 htmlInitAutoClose();
1329
1330 /* inefficient, but not a big deal */
1331 for (indx = 0; indx < 100; indx++) {
1332 closed = htmlStartCloseIndex[indx];
1333 if (closed == NULL)
1334 return (0);
1335 if (xmlStrEqual(BAD_CAST * closed, newtag))
1336 break;
1337 }
1338
1339 i = closed - htmlStartClose;
1340 i++;
1341 while (htmlStartClose[i] != NULL) {
1342 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1343 return (1);
1344 }
1345 i++;
1346 }
1347 return (0);
1348}
1349
1350/**
1351 * htmlAutoCloseOnClose:
1352 * @ctxt: an HTML parser context
1353 * @newtag: The new tag name
1354 * @force: force the tag closure
1355 *
1356 * The HTML DTD allows an ending tag to implicitly close other tags.
1357 */
1358static void
1359htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1360{
1361 const htmlElemDesc *info;
1362 int i, priority;
1363
1364 priority = htmlGetEndPriority(newtag);
1365
1366 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1367
1368 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1369 break;
1370 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07001371 * A misplaced endtag can only close elements with lower
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001372 * or equal priority, so if we find an element with higher
1373 * priority before we find an element with
1374 * matching name, we just ignore this endtag
1375 */
1376 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1377 return;
1378 }
1379 if (i < 0)
1380 return;
1381
1382 while (!xmlStrEqual(newtag, ctxt->name)) {
1383 info = htmlTagLookup(ctxt->name);
1384 if ((info != NULL) && (info->endTag == 3)) {
1385 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1386 "Opening and ending tag mismatch: %s and %s\n",
1387 newtag, ctxt->name);
1388 }
1389 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1390 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1391 htmlnamePop(ctxt);
1392 }
1393}
1394
1395/**
1396 * htmlAutoCloseOnEnd:
1397 * @ctxt: an HTML parser context
1398 *
1399 * Close all remaining tags at the end of the stream
1400 */
1401static void
1402htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1403{
1404 int i;
1405
1406 if (ctxt->nameNr == 0)
1407 return;
1408 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1409 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1410 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1411 htmlnamePop(ctxt);
1412 }
1413}
1414
1415/**
1416 * htmlAutoClose:
1417 * @ctxt: an HTML parser context
1418 * @newtag: The new tag name or NULL
1419 *
1420 * The HTML DTD allows a tag to implicitly close other tags.
1421 * The list is kept in htmlStartClose array. This function is
1422 * called when a new tag has been detected and generates the
1423 * appropriates closes if possible/needed.
1424 * If newtag is NULL this mean we are at the end of the resource
1425 * and we should check
1426 */
1427static void
1428htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1429{
1430 while ((newtag != NULL) && (ctxt->name != NULL) &&
1431 (htmlCheckAutoClose(newtag, ctxt->name))) {
1432 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1433 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1434 htmlnamePop(ctxt);
1435 }
1436 if (newtag == NULL) {
1437 htmlAutoCloseOnEnd(ctxt);
1438 return;
1439 }
1440 while ((newtag == NULL) && (ctxt->name != NULL) &&
1441 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1442 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1443 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1444 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1445 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1446 htmlnamePop(ctxt);
1447 }
1448}
1449
1450/**
1451 * htmlAutoCloseTag:
1452 * @doc: the HTML document
1453 * @name: The tag name
1454 * @elem: the HTML element
1455 *
1456 * The HTML DTD allows a tag to implicitly close other tags.
1457 * The list is kept in htmlStartClose array. This function checks
1458 * if the element or one of it's children would autoclose the
1459 * given tag.
1460 *
1461 * Returns 1 if autoclose, 0 otherwise
1462 */
1463int
1464htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1465 htmlNodePtr child;
1466
1467 if (elem == NULL) return(1);
1468 if (xmlStrEqual(name, elem->name)) return(0);
1469 if (htmlCheckAutoClose(elem->name, name)) return(1);
1470 child = elem->children;
1471 while (child != NULL) {
1472 if (htmlAutoCloseTag(doc, name, child)) return(1);
1473 child = child->next;
1474 }
1475 return(0);
1476}
1477
1478/**
1479 * htmlIsAutoClosed:
1480 * @doc: the HTML document
1481 * @elem: the HTML element
1482 *
1483 * The HTML DTD allows a tag to implicitly close other tags.
1484 * The list is kept in htmlStartClose array. This function checks
1485 * if a tag is autoclosed by one of it's child
1486 *
1487 * Returns 1 if autoclosed, 0 otherwise
1488 */
1489int
1490htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1491 htmlNodePtr child;
1492
1493 if (elem == NULL) return(1);
1494 child = elem->children;
1495 while (child != NULL) {
1496 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1497 child = child->next;
1498 }
1499 return(0);
1500}
1501
1502/**
1503 * htmlCheckImplied:
1504 * @ctxt: an HTML parser context
1505 * @newtag: The new tag name
1506 *
1507 * The HTML DTD allows a tag to exists only implicitly
1508 * called when a new tag has been detected and generates the
1509 * appropriates implicit tags if missing
1510 */
1511static void
1512htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1513 int i;
1514
1515 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1516 return;
1517 if (!htmlOmittedDefaultValue)
1518 return;
1519 if (xmlStrEqual(newtag, BAD_CAST"html"))
1520 return;
1521 if (ctxt->nameNr <= 0) {
1522 htmlnamePush(ctxt, BAD_CAST"html");
1523 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1524 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1525 }
1526 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1527 return;
1528 if ((ctxt->nameNr <= 1) &&
1529 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1530 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1531 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1532 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1533 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1534 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1535 if (ctxt->html >= 3) {
1536 /* we already saw or generated an <head> before */
1537 return;
1538 }
1539 /*
1540 * dropped OBJECT ... i you put it first BODY will be
1541 * assumed !
1542 */
1543 htmlnamePush(ctxt, BAD_CAST"head");
1544 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1545 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1546 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1547 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1548 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1549 if (ctxt->html >= 10) {
1550 /* we already saw or generated a <body> before */
1551 return;
1552 }
1553 for (i = 0;i < ctxt->nameNr;i++) {
1554 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1555 return;
1556 }
1557 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1558 return;
1559 }
1560 }
1561
1562 htmlnamePush(ctxt, BAD_CAST"body");
1563 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1564 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1565 }
1566}
1567
1568/**
1569 * htmlCheckParagraph
1570 * @ctxt: an HTML parser context
1571 *
1572 * Check whether a p element need to be implied before inserting
1573 * characters in the current element.
1574 *
1575 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1576 * in case of error.
1577 */
1578
1579static int
1580htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1581 const xmlChar *tag;
1582 int i;
1583
1584 if (ctxt == NULL)
1585 return(-1);
1586 tag = ctxt->name;
1587 if (tag == NULL) {
1588 htmlAutoClose(ctxt, BAD_CAST"p");
1589 htmlCheckImplied(ctxt, BAD_CAST"p");
1590 htmlnamePush(ctxt, BAD_CAST"p");
1591 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1592 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1593 return(1);
1594 }
1595 if (!htmlOmittedDefaultValue)
1596 return(0);
1597 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1598 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1599 htmlAutoClose(ctxt, BAD_CAST"p");
1600 htmlCheckImplied(ctxt, BAD_CAST"p");
1601 htmlnamePush(ctxt, BAD_CAST"p");
1602 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1603 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1604 return(1);
1605 }
1606 }
1607 return(0);
1608}
1609
1610/**
1611 * htmlIsScriptAttribute:
1612 * @name: an attribute name
1613 *
1614 * Check if an attribute is of content type Script
1615 *
1616 * Returns 1 is the attribute is a script 0 otherwise
1617 */
1618int
1619htmlIsScriptAttribute(const xmlChar *name) {
1620 unsigned int i;
1621
1622 if (name == NULL)
1623 return(0);
1624 /*
1625 * all script attributes start with 'on'
1626 */
1627 if ((name[0] != 'o') || (name[1] != 'n'))
1628 return(0);
1629 for (i = 0;
1630 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1631 i++) {
1632 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1633 return(1);
1634 }
1635 return(0);
1636}
1637
1638/************************************************************************
1639 * *
1640 * The list of HTML predefined entities *
1641 * *
1642 ************************************************************************/
1643
1644
1645static const htmlEntityDesc html40EntitiesTable[] = {
1646/*
1647 * the 4 absolute ones, plus apostrophe.
1648 */
1649{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1650{ 38, "amp", "ampersand, U+0026 ISOnum" },
1651{ 39, "apos", "single quote" },
1652{ 60, "lt", "less-than sign, U+003C ISOnum" },
1653{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1654
1655/*
1656 * A bunch still in the 128-255 range
1657 * Replacing them depend really on the charset used.
1658 */
1659{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1660{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1661{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1662{ 163, "pound","pound sign, U+00A3 ISOnum" },
1663{ 164, "curren","currency sign, U+00A4 ISOnum" },
1664{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1665{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1666{ 167, "sect", "section sign, U+00A7 ISOnum" },
1667{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1668{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1669{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1670{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1671{ 172, "not", "not sign, U+00AC ISOnum" },
1672{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1673{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1674{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1675{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1676{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1677{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1678{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1679{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1680{ 181, "micro","micro sign, U+00B5 ISOnum" },
1681{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1682{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1683{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1684{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1685{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1686{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1687{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1688{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1689{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1690{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1691{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1692{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1693{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1694{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1695{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1696{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1697{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1698{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1699{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1700{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1701{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1702{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1703{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1704{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1705{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1706{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1707{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1708{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1709{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1710{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1711{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1712{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1713{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1714{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1715{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1716{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1717{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1718{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1719{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1720{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1721{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1722{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1723{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1724{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1725{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1726{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1727{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1728{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1729{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1730{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1731{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1732{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1733{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1734{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1735{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1736{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1737{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1738{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1739{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1740{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1741{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1742{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1743{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1744{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1745{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1746{ 247, "divide","division sign, U+00F7 ISOnum" },
1747{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1748{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1749{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1750{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1751{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1752{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1753{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1754{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1755
1756{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1757{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1758{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1759{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1760{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1761
1762/*
1763 * Anything below should really be kept as entities references
1764 */
1765{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1766
1767{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1768{ 732, "tilde","small tilde, U+02DC ISOdia" },
1769
1770{ 913, "Alpha","greek capital letter alpha, U+0391" },
1771{ 914, "Beta", "greek capital letter beta, U+0392" },
1772{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1773{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1774{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1775{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1776{ 919, "Eta", "greek capital letter eta, U+0397" },
1777{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1778{ 921, "Iota", "greek capital letter iota, U+0399" },
1779{ 922, "Kappa","greek capital letter kappa, U+039A" },
1780{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1781{ 924, "Mu", "greek capital letter mu, U+039C" },
1782{ 925, "Nu", "greek capital letter nu, U+039D" },
1783{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1784{ 927, "Omicron","greek capital letter omicron, U+039F" },
1785{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1786{ 929, "Rho", "greek capital letter rho, U+03A1" },
1787{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1788{ 932, "Tau", "greek capital letter tau, U+03A4" },
1789{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1790{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1791{ 935, "Chi", "greek capital letter chi, U+03A7" },
1792{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1793{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1794
1795{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1796{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1797{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1798{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1799{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1800{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1801{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1802{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1803{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1804{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1805{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1806{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1807{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1808{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1809{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1810{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1811{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1812{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1813{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1814{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1815{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1816{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1817{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1818{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1819{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1820{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1821{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1822{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1823
1824{ 8194, "ensp", "en space, U+2002 ISOpub" },
1825{ 8195, "emsp", "em space, U+2003 ISOpub" },
1826{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1827{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1828{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1829{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1830{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1831{ 8211, "ndash","en dash, U+2013 ISOpub" },
1832{ 8212, "mdash","em dash, U+2014 ISOpub" },
1833{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1834{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1835{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1836{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1837{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1838{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1839{ 8224, "dagger","dagger, U+2020 ISOpub" },
1840{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1841
1842{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1843{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1844
1845{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1846
1847{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1848{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1849
1850{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1851{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1852
1853{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1854{ 8260, "frasl","fraction slash, U+2044 NEW" },
1855
1856{ 8364, "euro", "euro sign, U+20AC NEW" },
1857
1858{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1859{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1860{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1861{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1862{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1863{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1864{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1865{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1866{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1867{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1868{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1869{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1870{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1871{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1872{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1873{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1874
1875{ 8704, "forall","for all, U+2200 ISOtech" },
1876{ 8706, "part", "partial differential, U+2202 ISOtech" },
1877{ 8707, "exist","there exists, U+2203 ISOtech" },
1878{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1879{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1880{ 8712, "isin", "element of, U+2208 ISOtech" },
1881{ 8713, "notin","not an element of, U+2209 ISOtech" },
1882{ 8715, "ni", "contains as member, U+220B ISOtech" },
1883{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1884{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1885{ 8722, "minus","minus sign, U+2212 ISOtech" },
1886{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1887{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1888{ 8733, "prop", "proportional to, U+221D ISOtech" },
1889{ 8734, "infin","infinity, U+221E ISOtech" },
1890{ 8736, "ang", "angle, U+2220 ISOamso" },
1891{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1892{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1893{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1894{ 8746, "cup", "union = cup, U+222A ISOtech" },
1895{ 8747, "int", "integral, U+222B ISOtech" },
1896{ 8756, "there4","therefore, U+2234 ISOtech" },
1897{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1898{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1899{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1900{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1901{ 8801, "equiv","identical to, U+2261 ISOtech" },
1902{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1903{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1904{ 8834, "sub", "subset of, U+2282 ISOtech" },
1905{ 8835, "sup", "superset of, U+2283 ISOtech" },
1906{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1907{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1908{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1909{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1910{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1911{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1912{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1913{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1914{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1915{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1916{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1917{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1918{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1919{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1920
1921{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1922{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1923{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1924{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1925
1926};
1927
1928/************************************************************************
1929 * *
1930 * Commodity functions to handle entities *
1931 * *
1932 ************************************************************************/
1933
/*
 * growBuffer:
 * @buffer:  name of the variable pointing to the heap buffer to enlarge;
 *           the macro also updates the companion variable <buffer>_size.
 *
 * Macro used to grow the current buffer: <buffer>_size is doubled and
 * the buffer is resized with xmlRealloc().
 *
 * On allocation failure the error is reported through htmlErrMemory(),
 * the old buffer is released and the *enclosing function* returns NULL.
 * The macro therefore requires a variable named `ctxt` in scope and can
 * only be used in functions returning a pointer.
 *
 * The body is wrapped in do { } while (0) so that `growBuffer(x);`
 * behaves as a single statement, including in unbraced if/else bodies.
 */
#define growBuffer(buffer) do {                                         \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {                                                  \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = tmp;                                                       \
} while (0)
1948
1949/**
1950 * htmlEntityLookup:
1951 * @name: the entity name
1952 *
1953 * Lookup the given entity in EntitiesTable
1954 *
1955 * TODO: the linear scan is really ugly, an hash table is really needed.
1956 *
1957 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1958 */
1959const htmlEntityDesc *
1960htmlEntityLookup(const xmlChar *name) {
1961 unsigned int i;
1962
1963 for (i = 0;i < (sizeof(html40EntitiesTable)/
1964 sizeof(html40EntitiesTable[0]));i++) {
1965 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1966 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1967 }
1968 }
1969 return(NULL);
1970}
1971
1972/**
1973 * htmlEntityValueLookup:
1974 * @value: the entity's unicode value
1975 *
1976 * Lookup the given entity in EntitiesTable
1977 *
1978 * TODO: the linear scan is really ugly, an hash table is really needed.
1979 *
1980 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1981 */
1982const htmlEntityDesc *
1983htmlEntityValueLookup(unsigned int value) {
1984 unsigned int i;
1985
1986 for (i = 0;i < (sizeof(html40EntitiesTable)/
1987 sizeof(html40EntitiesTable[0]));i++) {
1988 if (html40EntitiesTable[i].value >= value) {
1989 if (html40EntitiesTable[i].value > value)
1990 break;
1991 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1992 }
1993 }
1994 return(NULL);
1995}
1996
1997/**
1998 * UTF8ToHtml:
1999 * @out: a pointer to an array of bytes to store the result
2000 * @outlen: the length of @out
2001 * @in: a pointer to an array of UTF-8 chars
2002 * @inlen: the length of @in
2003 *
2004 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2005 * plus HTML entities block of chars out.
2006 *
2007 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2008 * The value of @inlen after return is the number of octets consumed
2009 * as the return value is positive, else unpredictable.
2010 * The value of @outlen after return is the number of octets consumed.
2011 */
2012int
2013UTF8ToHtml(unsigned char* out, int *outlen,
2014 const unsigned char* in, int *inlen) {
2015 const unsigned char* processed = in;
2016 const unsigned char* outend;
2017 const unsigned char* outstart = out;
2018 const unsigned char* instart = in;
2019 const unsigned char* inend;
2020 unsigned int c, d;
2021 int trailing;
2022
2023 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2024 if (in == NULL) {
2025 /*
2026 * initialization nothing to do
2027 */
2028 *outlen = 0;
2029 *inlen = 0;
2030 return(0);
2031 }
2032 inend = in + (*inlen);
2033 outend = out + (*outlen);
2034 while (in < inend) {
2035 d = *in++;
2036 if (d < 0x80) { c= d; trailing= 0; }
2037 else if (d < 0xC0) {
2038 /* trailing byte in leading position */
2039 *outlen = out - outstart;
2040 *inlen = processed - instart;
2041 return(-2);
2042 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2043 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2044 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2045 else {
2046 /* no chance for this in Ascii */
2047 *outlen = out - outstart;
2048 *inlen = processed - instart;
2049 return(-2);
2050 }
2051
2052 if (inend - in < trailing) {
2053 break;
2054 }
2055
2056 for ( ; trailing; trailing--) {
2057 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2058 break;
2059 c <<= 6;
2060 c |= d & 0x3F;
2061 }
2062
2063 /* assertion: c is a single UTF-4 value */
2064 if (c < 0x80) {
2065 if (out + 1 >= outend)
2066 break;
2067 *out++ = c;
2068 } else {
2069 int len;
2070 const htmlEntityDesc * ent;
2071 const char *cp;
2072 char nbuf[16];
2073
2074 /*
2075 * Try to lookup a predefined HTML entity for it
2076 */
2077
2078 ent = htmlEntityValueLookup(c);
2079 if (ent == NULL) {
2080 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2081 cp = nbuf;
2082 }
2083 else
2084 cp = ent->name;
2085 len = strlen(cp);
2086 if (out + 2 + len >= outend)
2087 break;
2088 *out++ = '&';
2089 memcpy(out, cp, len);
2090 out += len;
2091 *out++ = ';';
2092 }
2093 processed = in;
2094 }
2095 *outlen = out - outstart;
2096 *inlen = processed - instart;
2097 return(0);
2098}
2099
2100/**
2101 * htmlEncodeEntities:
2102 * @out: a pointer to an array of bytes to store the result
2103 * @outlen: the length of @out
2104 * @in: a pointer to an array of UTF-8 chars
2105 * @inlen: the length of @in
2106 * @quoteChar: the quote character to escape (' or ") or zero.
2107 *
2108 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2109 * plus HTML entities block of chars out.
2110 *
2111 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2112 * The value of @inlen after return is the number of octets consumed
2113 * as the return value is positive, else unpredictable.
2114 * The value of @outlen after return is the number of octets consumed.
2115 */
2116int
2117htmlEncodeEntities(unsigned char* out, int *outlen,
2118 const unsigned char* in, int *inlen, int quoteChar) {
2119 const unsigned char* processed = in;
2120 const unsigned char* outend;
2121 const unsigned char* outstart = out;
2122 const unsigned char* instart = in;
2123 const unsigned char* inend;
2124 unsigned int c, d;
2125 int trailing;
2126
2127 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2128 return(-1);
2129 outend = out + (*outlen);
2130 inend = in + (*inlen);
2131 while (in < inend) {
2132 d = *in++;
2133 if (d < 0x80) { c= d; trailing= 0; }
2134 else if (d < 0xC0) {
2135 /* trailing byte in leading position */
2136 *outlen = out - outstart;
2137 *inlen = processed - instart;
2138 return(-2);
2139 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2140 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2141 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2142 else {
2143 /* no chance for this in Ascii */
2144 *outlen = out - outstart;
2145 *inlen = processed - instart;
2146 return(-2);
2147 }
2148
2149 if (inend - in < trailing)
2150 break;
2151
2152 while (trailing--) {
2153 if (((d= *in++) & 0xC0) != 0x80) {
2154 *outlen = out - outstart;
2155 *inlen = processed - instart;
2156 return(-2);
2157 }
2158 c <<= 6;
2159 c |= d & 0x3F;
2160 }
2161
2162 /* assertion: c is a single UTF-4 value */
2163 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2164 (c != '&') && (c != '<') && (c != '>')) {
2165 if (out >= outend)
2166 break;
2167 *out++ = c;
2168 } else {
2169 const htmlEntityDesc * ent;
2170 const char *cp;
2171 char nbuf[16];
2172 int len;
2173
2174 /*
2175 * Try to lookup a predefined HTML entity for it
2176 */
2177 ent = htmlEntityValueLookup(c);
2178 if (ent == NULL) {
2179 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2180 cp = nbuf;
2181 }
2182 else
2183 cp = ent->name;
2184 len = strlen(cp);
2185 if (out + 2 + len > outend)
2186 break;
2187 *out++ = '&';
2188 memcpy(out, cp, len);
2189 out += len;
2190 *out++ = ';';
2191 }
2192 processed = in;
2193 }
2194 *outlen = out - outstart;
2195 *inlen = processed - instart;
2196 return(0);
2197}
2198
2199/************************************************************************
2200 * *
2201 * Commodity functions to handle streams *
2202 * *
2203 ************************************************************************/
2204
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002205#ifdef LIBXML_PUSH_ENABLED
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002206/**
2207 * htmlNewInputStream:
2208 * @ctxt: an HTML parser context
2209 *
2210 * Create a new input stream structure
2211 * Returns the new input stream or NULL
2212 */
2213static htmlParserInputPtr
2214htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2215 htmlParserInputPtr input;
2216
2217 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2218 if (input == NULL) {
2219 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2220 return(NULL);
2221 }
2222 memset(input, 0, sizeof(htmlParserInput));
2223 input->filename = NULL;
2224 input->directory = NULL;
2225 input->base = NULL;
2226 input->cur = NULL;
2227 input->buf = NULL;
2228 input->line = 1;
2229 input->col = 1;
2230 input->buf = NULL;
2231 input->free = NULL;
2232 input->version = NULL;
2233 input->consumed = 0;
2234 input->length = 0;
2235 return(input);
2236}
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002237#endif
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002238
2239
2240/************************************************************************
2241 * *
2242 * Commodity functions, cleanup needed ? *
2243 * *
2244 ************************************************************************/
/*
 * Names of all the tags allowing PCDATA in the HTML 4.01 loose DTD,
 * in alphabetical order.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but that would risk a binary
 * incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2259
2260/**
2261 * areBlanks:
2262 * @ctxt: an HTML parser context
2263 * @str: a xmlChar *
2264 * @len: the size of @str
2265 *
2266 * Is this a sequence of blank chars that one can ignore ?
2267 *
2268 * Returns 1 if ignorable 0 otherwise.
2269 */
2270
2271static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2272 unsigned int i;
2273 int j;
2274 xmlNodePtr lastChild;
2275 xmlDtdPtr dtd;
2276
2277 for (j = 0;j < len;j++)
2278 if (!(IS_BLANK_CH(str[j]))) return(0);
2279
2280 if (CUR == 0) return(1);
2281 if (CUR != '<') return(0);
2282 if (ctxt->name == NULL)
2283 return(1);
2284 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2285 return(1);
2286 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2287 return(1);
2288
2289 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2290 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2291 dtd = xmlGetIntSubset(ctxt->myDoc);
2292 if (dtd != NULL && dtd->ExternalID != NULL) {
2293 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2294 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2295 return(1);
2296 }
2297 }
2298
2299 if (ctxt->node == NULL) return(0);
2300 lastChild = xmlGetLastChild(ctxt->node);
2301 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2302 lastChild = lastChild->prev;
2303 if (lastChild == NULL) {
2304 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2305 (ctxt->node->content != NULL)) return(0);
2306 /* keep ws in constructs like ...<b> </b>...
2307 for all tags "b" allowing PCDATA */
2308 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2309 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2310 return(0);
2311 }
2312 }
2313 } else if (xmlNodeIsText(lastChild)) {
2314 return(0);
2315 } else {
2316 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2317 for all tags "p" allowing PCDATA */
2318 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2319 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2320 return(0);
2321 }
2322 }
2323 }
2324 return(1);
2325}
2326
2327/**
2328 * htmlNewDocNoDtD:
2329 * @URI: URI for the dtd, or NULL
2330 * @ExternalID: the external ID of the DTD, or NULL
2331 *
2332 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2333 * are NULL
2334 *
2335 * Returns a new document, do not initialize the DTD if not provided
2336 */
2337htmlDocPtr
2338htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2339 xmlDocPtr cur;
2340
2341 /*
2342 * Allocate a new document and fill the fields.
2343 */
2344 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2345 if (cur == NULL) {
2346 htmlErrMemory(NULL, "HTML document creation failed\n");
2347 return(NULL);
2348 }
2349 memset(cur, 0, sizeof(xmlDoc));
2350
2351 cur->type = XML_HTML_DOCUMENT_NODE;
2352 cur->version = NULL;
2353 cur->intSubset = NULL;
2354 cur->doc = cur;
2355 cur->name = NULL;
2356 cur->children = NULL;
2357 cur->extSubset = NULL;
2358 cur->oldNs = NULL;
2359 cur->encoding = NULL;
2360 cur->standalone = 1;
2361 cur->compression = 0;
2362 cur->ids = NULL;
2363 cur->refs = NULL;
2364 cur->_private = NULL;
2365 cur->charset = XML_CHAR_ENCODING_UTF8;
2366 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2367 if ((ExternalID != NULL) ||
2368 (URI != NULL))
2369 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2370 return(cur);
2371}
2372
2373/**
2374 * htmlNewDoc:
2375 * @URI: URI for the dtd, or NULL
2376 * @ExternalID: the external ID of the DTD, or NULL
2377 *
2378 * Creates a new HTML document
2379 *
2380 * Returns a new document
2381 */
2382htmlDocPtr
2383htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2384 if ((URI == NULL) && (ExternalID == NULL))
2385 return(htmlNewDocNoDtD(
2386 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2387 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2388
2389 return(htmlNewDocNoDtD(URI, ExternalID));
2390}
2391
2392
2393/************************************************************************
2394 * *
2395 * The parser itself *
2396 * Relates to http://www.w3.org/TR/html40 *
2397 * *
2398 ************************************************************************/
2399
2400/************************************************************************
2401 * *
2402 * The parser itself *
2403 * *
2404 ************************************************************************/
2405
/* Forward declaration: htmlParseName() calls this routine before its definition. */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2407
2408/**
2409 * htmlParseHTMLName:
2410 * @ctxt: an HTML parser context
2411 *
2412 * parse an HTML tag or attribute name, note that we convert it to lowercase
2413 * since HTML names are not case-sensitive.
2414 *
2415 * Returns the Tag Name parsed or NULL
2416 */
2417
2418static const xmlChar *
2419htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2420 int i = 0;
2421 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2422
2423 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2424 (CUR != ':') && (CUR != '.')) return(NULL);
2425
2426 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2427 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2428 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2429 (CUR == '.'))) {
2430 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2431 else loc[i] = CUR;
2432 i++;
2433
2434 NEXT;
2435 }
2436
2437 return(xmlDictLookup(ctxt->dict, loc, i));
2438}
2439
2440
2441/**
2442 * htmlParseHTMLName_nonInvasive:
2443 * @ctxt: an HTML parser context
2444 *
2445 * parse an HTML tag or attribute name, note that we convert it to lowercase
2446 * since HTML names are not case-sensitive, this doesn't consume the data
2447 * from the stream, it's a look-ahead
2448 *
2449 * Returns the Tag Name parsed or NULL
2450 */
2451
2452static const xmlChar *
2453htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2454 int i = 0;
2455 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2456
2457 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2458 (NXT(1) != ':')) return(NULL);
2459
2460 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2461 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2462 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2463 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2464 else loc[i] = NXT(1+i);
2465 i++;
2466 }
2467
2468 return(xmlDictLookup(ctxt->dict, loc, i));
2469}
2470
2471
2472/**
2473 * htmlParseName:
2474 * @ctxt: an HTML parser context
2475 *
2476 * parse an HTML name, this routine is case sensitive.
2477 *
2478 * Returns the Name parsed or NULL
2479 */
2480
2481static const xmlChar *
2482htmlParseName(htmlParserCtxtPtr ctxt) {
2483 const xmlChar *in;
2484 const xmlChar *ret;
2485 int count = 0;
2486
2487 GROW;
2488
2489 /*
2490 * Accelerator for simple ASCII names
2491 */
2492 in = ctxt->input->cur;
2493 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2494 ((*in >= 0x41) && (*in <= 0x5A)) ||
2495 (*in == '_') || (*in == ':')) {
2496 in++;
2497 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2498 ((*in >= 0x41) && (*in <= 0x5A)) ||
2499 ((*in >= 0x30) && (*in <= 0x39)) ||
2500 (*in == '_') || (*in == '-') ||
2501 (*in == ':') || (*in == '.'))
2502 in++;
2503
2504 if (in == ctxt->input->end)
2505 return(NULL);
2506
2507 if ((*in > 0) && (*in < 0x80)) {
2508 count = in - ctxt->input->cur;
2509 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2510 ctxt->input->cur = in;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002511 ctxt->input->col += count;
2512 return(ret);
2513 }
2514 }
2515 return(htmlParseNameComplex(ctxt));
2516}
2517
2518static const xmlChar *
2519htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2520 int len = 0, l;
2521 int c;
2522 int count = 0;
2523 const xmlChar *base = ctxt->input->base;
2524
2525 /*
2526 * Handler for more complex cases
2527 */
2528 GROW;
2529 c = CUR_CHAR(l);
2530 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2531 (!IS_LETTER(c) && (c != '_') &&
2532 (c != ':'))) {
2533 return(NULL);
2534 }
2535
2536 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2537 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2538 (c == '.') || (c == '-') ||
2539 (c == '_') || (c == ':') ||
2540 (IS_COMBINING(c)) ||
2541 (IS_EXTENDER(c)))) {
2542 if (count++ > 100) {
2543 count = 0;
2544 GROW;
2545 }
2546 len += l;
2547 NEXTL(l);
2548 c = CUR_CHAR(l);
2549 if (ctxt->input->base != base) {
2550 /*
2551 * We changed encoding from an unknown encoding
2552 * Input buffer changed location, so we better start again
2553 */
2554 return(htmlParseNameComplex(ctxt));
2555 }
2556 }
2557
2558 if (ctxt->input->cur - ctxt->input->base < len) {
2559 /* Sanity check */
2560 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2561 "unexpected change of input buffer", NULL, NULL);
2562 return (NULL);
2563 }
2564
2565 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2566}
2567
2568
2569/**
2570 * htmlParseHTMLAttribute:
2571 * @ctxt: an HTML parser context
2572 * @stop: a char stop value
2573 *
2574 * parse an HTML attribute value till the stop (quote), if
2575 * stop is 0 then it stops at the first space
2576 *
2577 * Returns the attribute parsed or NULL
2578 */
2579
2580static xmlChar *
2581htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2582 xmlChar *buffer = NULL;
2583 int buffer_size = 0;
2584 xmlChar *out = NULL;
2585 const xmlChar *name = NULL;
2586 const xmlChar *cur = NULL;
2587 const htmlEntityDesc * ent;
2588
2589 /*
2590 * allocate a translation buffer.
2591 */
2592 buffer_size = HTML_PARSER_BUFFER_SIZE;
2593 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2594 if (buffer == NULL) {
2595 htmlErrMemory(ctxt, "buffer allocation failed\n");
2596 return(NULL);
2597 }
2598 out = buffer;
2599
2600 /*
2601 * Ok loop until we reach one of the ending chars
2602 */
2603 while ((CUR != 0) && (CUR != stop)) {
2604 if ((stop == 0) && (CUR == '>')) break;
2605 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2606 if (CUR == '&') {
2607 if (NXT(1) == '#') {
2608 unsigned int c;
2609 int bits;
2610
2611 c = htmlParseCharRef(ctxt);
2612 if (c < 0x80)
2613 { *out++ = c; bits= -6; }
2614 else if (c < 0x800)
2615 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2616 else if (c < 0x10000)
2617 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2618 else
2619 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2620
2621 for ( ; bits >= 0; bits-= 6) {
2622 *out++ = ((c >> bits) & 0x3F) | 0x80;
2623 }
2624
2625 if (out - buffer > buffer_size - 100) {
2626 int indx = out - buffer;
2627
2628 growBuffer(buffer);
2629 out = &buffer[indx];
2630 }
2631 } else {
2632 ent = htmlParseEntityRef(ctxt, &name);
2633 if (name == NULL) {
2634 *out++ = '&';
2635 if (out - buffer > buffer_size - 100) {
2636 int indx = out - buffer;
2637
2638 growBuffer(buffer);
2639 out = &buffer[indx];
2640 }
2641 } else if (ent == NULL) {
2642 *out++ = '&';
2643 cur = name;
2644 while (*cur != 0) {
2645 if (out - buffer > buffer_size - 100) {
2646 int indx = out - buffer;
2647
2648 growBuffer(buffer);
2649 out = &buffer[indx];
2650 }
2651 *out++ = *cur++;
2652 }
2653 } else {
2654 unsigned int c;
2655 int bits;
2656
2657 if (out - buffer > buffer_size - 100) {
2658 int indx = out - buffer;
2659
2660 growBuffer(buffer);
2661 out = &buffer[indx];
2662 }
2663 c = ent->value;
2664 if (c < 0x80)
2665 { *out++ = c; bits= -6; }
2666 else if (c < 0x800)
2667 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2668 else if (c < 0x10000)
2669 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2670 else
2671 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2672
2673 for ( ; bits >= 0; bits-= 6) {
2674 *out++ = ((c >> bits) & 0x3F) | 0x80;
2675 }
2676 }
2677 }
2678 } else {
2679 unsigned int c;
2680 int bits, l;
2681
2682 if (out - buffer > buffer_size - 100) {
2683 int indx = out - buffer;
2684
2685 growBuffer(buffer);
2686 out = &buffer[indx];
2687 }
2688 c = CUR_CHAR(l);
2689 if (c < 0x80)
2690 { *out++ = c; bits= -6; }
2691 else if (c < 0x800)
2692 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2693 else if (c < 0x10000)
2694 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2695 else
2696 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2697
2698 for ( ; bits >= 0; bits-= 6) {
2699 *out++ = ((c >> bits) & 0x3F) | 0x80;
2700 }
2701 NEXT;
2702 }
2703 }
2704 *out = 0;
2705 return(buffer);
2706}
2707
2708/**
2709 * htmlParseEntityRef:
2710 * @ctxt: an HTML parser context
2711 * @str: location to store the entity name
2712 *
2713 * parse an HTML ENTITY references
2714 *
2715 * [68] EntityRef ::= '&' Name ';'
2716 *
2717 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2718 * if non-NULL *str will have to be freed by the caller.
2719 */
2720const htmlEntityDesc *
2721htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2722 const xmlChar *name;
2723 const htmlEntityDesc * ent = NULL;
2724
2725 if (str != NULL) *str = NULL;
2726 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2727
2728 if (CUR == '&') {
2729 NEXT;
2730 name = htmlParseName(ctxt);
2731 if (name == NULL) {
2732 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2733 "htmlParseEntityRef: no name\n", NULL, NULL);
2734 } else {
2735 GROW;
2736 if (CUR == ';') {
2737 if (str != NULL)
2738 *str = name;
2739
2740 /*
2741 * Lookup the entity in the table.
2742 */
2743 ent = htmlEntityLookup(name);
2744 if (ent != NULL) /* OK that's ugly !!! */
2745 NEXT;
2746 } else {
2747 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2748 "htmlParseEntityRef: expecting ';'\n",
2749 NULL, NULL);
2750 if (str != NULL)
2751 *str = name;
2752 }
2753 }
2754 }
2755 return(ent);
2756}
2757
2758/**
2759 * htmlParseAttValue:
2760 * @ctxt: an HTML parser context
2761 *
2762 * parse a value for an attribute
2763 * Note: the parser won't do substitution of entities here, this
2764 * will be handled later in xmlStringGetNodeList, unless it was
2765 * asked for ctxt->replaceEntities != 0
2766 *
2767 * Returns the AttValue parsed or NULL.
2768 */
2769
2770static xmlChar *
2771htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2772 xmlChar *ret = NULL;
2773
2774 if (CUR == '"') {
2775 NEXT;
2776 ret = htmlParseHTMLAttribute(ctxt, '"');
2777 if (CUR != '"') {
2778 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2779 "AttValue: \" expected\n", NULL, NULL);
2780 } else
2781 NEXT;
2782 } else if (CUR == '\'') {
2783 NEXT;
2784 ret = htmlParseHTMLAttribute(ctxt, '\'');
2785 if (CUR != '\'') {
2786 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2787 "AttValue: ' expected\n", NULL, NULL);
2788 } else
2789 NEXT;
2790 } else {
2791 /*
2792 * That's an HTMLism, the attribute value may not be quoted
2793 */
2794 ret = htmlParseHTMLAttribute(ctxt, 0);
2795 if (ret == NULL) {
2796 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2797 "AttValue: no value found\n", NULL, NULL);
2798 }
2799 }
2800 return(ret);
2801}
2802
2803/**
2804 * htmlParseSystemLiteral:
2805 * @ctxt: an HTML parser context
2806 *
2807 * parse an HTML Literal
2808 *
2809 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2810 *
2811 * Returns the SystemLiteral parsed or NULL
2812 */
2813
2814static xmlChar *
2815htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2816 size_t len = 0, startPosition = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002817 int err = 0;
2818 int quote;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002819 xmlChar *ret = NULL;
2820
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002821 if ((CUR != '"') && (CUR != '\'')) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002822 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002823 "SystemLiteral \" or ' expected\n", NULL, NULL);
2824 return(NULL);
2825 }
2826 quote = CUR;
2827 NEXT;
2828
2829 if (CUR_PTR < BASE_PTR)
2830 return(ret);
2831 startPosition = CUR_PTR - BASE_PTR;
2832
2833 while ((CUR != 0) && (CUR != quote)) {
2834 /* TODO: Handle UTF-8 */
2835 if (!IS_CHAR_CH(CUR)) {
2836 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2837 "Invalid char in SystemLiteral 0x%X\n", CUR);
2838 err = 1;
2839 }
2840 NEXT;
2841 len++;
2842 }
2843 if (CUR != quote) {
2844 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2845 "Unfinished SystemLiteral\n", NULL, NULL);
2846 } else {
2847 NEXT;
2848 if (err == 0)
2849 ret = xmlStrndup((BASE_PTR+startPosition), len);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002850 }
2851
2852 return(ret);
2853}
2854
2855/**
2856 * htmlParsePubidLiteral:
2857 * @ctxt: an HTML parser context
2858 *
2859 * parse an HTML public literal
2860 *
2861 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2862 *
2863 * Returns the PubidLiteral parsed or NULL.
2864 */
2865
2866static xmlChar *
2867htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2868 size_t len = 0, startPosition = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002869 int err = 0;
2870 int quote;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002871 xmlChar *ret = NULL;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002872
2873 if ((CUR != '"') && (CUR != '\'')) {
2874 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2875 "PubidLiteral \" or ' expected\n", NULL, NULL);
2876 return(NULL);
2877 }
2878 quote = CUR;
2879 NEXT;
2880
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002881 /*
2882 * Name ::= (Letter | '_') (NameChar)*
2883 */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002884 if (CUR_PTR < BASE_PTR)
2885 return(ret);
2886 startPosition = CUR_PTR - BASE_PTR;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002887
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002888 while ((CUR != 0) && (CUR != quote)) {
2889 if (!IS_PUBIDCHAR_CH(CUR)) {
2890 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2891 "Invalid char in PubidLiteral 0x%X\n", CUR);
2892 err = 1;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002893 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002894 len++;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002895 NEXT;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002896 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002897
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002898 if (CUR != '"') {
2899 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2900 "Unfinished PubidLiteral\n", NULL, NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002901 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002902 NEXT;
2903 if (err == 0)
2904 ret = xmlStrndup((BASE_PTR + startPosition), len);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002905 }
2906
2907 return(ret);
2908}
2909
2910/**
2911 * htmlParseScript:
2912 * @ctxt: an HTML parser context
2913 *
2914 * parse the content of an HTML SCRIPT or STYLE element
2915 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2916 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2917 * http://www.w3.org/TR/html4/types.html#type-script
2918 * http://www.w3.org/TR/html4/types.html#h-6.15
2919 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2920 *
2921 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2922 * element and the value of intrinsic event attributes. User agents must
2923 * not evaluate script data as HTML markup but instead must pass it on as
2924 * data to a script engine.
2925 * NOTES:
2926 * - The content is passed like CDATA
2927 * - the attributes for style and scripting "onXXX" are also described
2928 * as CDATA but SGML allows entities references in attributes so their
2929 * processing is identical as other attributes
2930 */
2931static void
2932htmlParseScript(htmlParserCtxtPtr ctxt) {
2933 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2934 int nbchar = 0;
2935 int cur,l;
2936
2937 SHRINK;
2938 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002939 while (cur != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002940 if ((cur == '<') && (NXT(1) == '/')) {
2941 /*
2942 * One should break here, the specification is clear:
2943 * Authors should therefore escape "</" within the content.
2944 * Escape mechanisms are specific to each scripting or
2945 * style sheet language.
2946 *
2947 * In recovery mode, only break if end tag match the
2948 * current tag, effectively ignoring all tags inside the
2949 * script/style block and treating the entire block as
2950 * CDATA.
2951 */
2952 if (ctxt->recovery) {
2953 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2954 xmlStrlen(ctxt->name)) == 0)
2955 {
2956 break; /* while */
2957 } else {
2958 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2959 "Element %s embeds close tag\n",
2960 ctxt->name, NULL);
2961 }
2962 } else {
2963 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2964 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2965 {
2966 break; /* while */
2967 }
2968 }
2969 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002970 if (IS_CHAR(cur)) {
2971 COPY_BUF(l,buf,nbchar,cur);
2972 } else {
2973 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2974 "Invalid char in CDATA 0x%X\n", cur);
2975 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002976 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002977 buf[nbchar] = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002978 if (ctxt->sax->cdataBlock!= NULL) {
2979 /*
2980 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2981 */
2982 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2983 } else if (ctxt->sax->characters != NULL) {
2984 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2985 }
2986 nbchar = 0;
2987 }
2988 GROW;
2989 NEXTL(l);
2990 cur = CUR_CHAR(l);
2991 }
2992
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002993 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002994 buf[nbchar] = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002995 if (ctxt->sax->cdataBlock!= NULL) {
2996 /*
2997 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2998 */
2999 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3000 } else if (ctxt->sax->characters != NULL) {
3001 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3002 }
3003 }
3004}
3005
3006
3007/**
3008 * htmlParseCharDataInternal:
3009 * @ctxt: an HTML parser context
3010 * @readahead: optional read ahead character in ascii range
3011 *
3012 * parse a CharData section.
3013 * if we are within a CDATA section ']]>' marks an end of section.
3014 *
3015 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3016 */
3017
3018static void
3019htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3020 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3021 int nbchar = 0;
3022 int cur, l;
3023 int chunk = 0;
3024
3025 if (readahead)
3026 buf[nbchar++] = readahead;
3027
3028 SHRINK;
3029 cur = CUR_CHAR(l);
3030 while (((cur != '<') || (ctxt->token == '<')) &&
3031 ((cur != '&') || (ctxt->token == '&')) &&
3032 (cur != 0)) {
3033 if (!(IS_CHAR(cur))) {
3034 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3035 "Invalid char in CDATA 0x%X\n", cur);
3036 } else {
3037 COPY_BUF(l,buf,nbchar,cur);
3038 }
3039 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003040 buf[nbchar] = 0;
3041
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003042 /*
3043 * Ok the segment is to be consumed as chars.
3044 */
3045 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3046 if (areBlanks(ctxt, buf, nbchar)) {
3047 if (ctxt->keepBlanks) {
3048 if (ctxt->sax->characters != NULL)
3049 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3050 } else {
3051 if (ctxt->sax->ignorableWhitespace != NULL)
3052 ctxt->sax->ignorableWhitespace(ctxt->userData,
3053 buf, nbchar);
3054 }
3055 } else {
3056 htmlCheckParagraph(ctxt);
3057 if (ctxt->sax->characters != NULL)
3058 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3059 }
3060 }
3061 nbchar = 0;
3062 }
3063 NEXTL(l);
3064 chunk++;
3065 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3066 chunk = 0;
3067 SHRINK;
3068 GROW;
3069 }
3070 cur = CUR_CHAR(l);
3071 if (cur == 0) {
3072 SHRINK;
3073 GROW;
3074 cur = CUR_CHAR(l);
3075 }
3076 }
3077 if (nbchar != 0) {
3078 buf[nbchar] = 0;
3079
3080 /*
3081 * Ok the segment is to be consumed as chars.
3082 */
3083 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3084 if (areBlanks(ctxt, buf, nbchar)) {
3085 if (ctxt->keepBlanks) {
3086 if (ctxt->sax->characters != NULL)
3087 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3088 } else {
3089 if (ctxt->sax->ignorableWhitespace != NULL)
3090 ctxt->sax->ignorableWhitespace(ctxt->userData,
3091 buf, nbchar);
3092 }
3093 } else {
3094 htmlCheckParagraph(ctxt);
3095 if (ctxt->sax->characters != NULL)
3096 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3097 }
3098 }
3099 } else {
3100 /*
3101 * Loop detection
3102 */
3103 if (cur == 0)
3104 ctxt->instate = XML_PARSER_EOF;
3105 }
3106}
3107
3108/**
3109 * htmlParseCharData:
3110 * @ctxt: an HTML parser context
3111 *
3112 * parse a CharData section.
3113 * if we are within a CDATA section ']]>' marks an end of section.
3114 *
3115 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3116 */
3117
3118static void
3119htmlParseCharData(htmlParserCtxtPtr ctxt) {
3120 htmlParseCharDataInternal(ctxt, 0);
3121}
3122
3123/**
3124 * htmlParseExternalID:
3125 * @ctxt: an HTML parser context
3126 * @publicID: a xmlChar** receiving PubidLiteral
3127 *
3128 * Parse an External ID or a Public ID
3129 *
3130 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3131 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3132 *
3133 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3134 *
3135 * Returns the function returns SystemLiteral and in the second
3136 * case publicID receives PubidLiteral, is strict is off
3137 * it is possible to return NULL and have publicID set.
3138 */
3139
3140static xmlChar *
3141htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3142 xmlChar *URI = NULL;
3143
3144 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3145 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3146 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3147 SKIP(6);
3148 if (!IS_BLANK_CH(CUR)) {
3149 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3150 "Space required after 'SYSTEM'\n", NULL, NULL);
3151 }
3152 SKIP_BLANKS;
3153 URI = htmlParseSystemLiteral(ctxt);
3154 if (URI == NULL) {
3155 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3156 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3157 }
3158 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3159 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3160 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3161 SKIP(6);
3162 if (!IS_BLANK_CH(CUR)) {
3163 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3164 "Space required after 'PUBLIC'\n", NULL, NULL);
3165 }
3166 SKIP_BLANKS;
3167 *publicID = htmlParsePubidLiteral(ctxt);
3168 if (*publicID == NULL) {
3169 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3170 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3171 NULL, NULL);
3172 }
3173 SKIP_BLANKS;
3174 if ((CUR == '"') || (CUR == '\'')) {
3175 URI = htmlParseSystemLiteral(ctxt);
3176 }
3177 }
3178 return(URI);
3179}
3180
3181/**
3182 * xmlParsePI:
3183 * @ctxt: an XML parser context
3184 *
3185 * parse an XML Processing Instruction.
3186 *
3187 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3188 */
3189static void
3190htmlParsePI(htmlParserCtxtPtr ctxt) {
3191 xmlChar *buf = NULL;
3192 int len = 0;
3193 int size = HTML_PARSER_BUFFER_SIZE;
3194 int cur, l;
3195 const xmlChar *target;
3196 xmlParserInputState state;
3197 int count = 0;
3198
3199 if ((RAW == '<') && (NXT(1) == '?')) {
3200 state = ctxt->instate;
3201 ctxt->instate = XML_PARSER_PI;
3202 /*
3203 * this is a Processing Instruction.
3204 */
3205 SKIP(2);
3206 SHRINK;
3207
3208 /*
3209 * Parse the target name and check for special support like
3210 * namespace.
3211 */
3212 target = htmlParseName(ctxt);
3213 if (target != NULL) {
3214 if (RAW == '>') {
3215 SKIP(1);
3216
3217 /*
3218 * SAX: PI detected.
3219 */
3220 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3221 (ctxt->sax->processingInstruction != NULL))
3222 ctxt->sax->processingInstruction(ctxt->userData,
3223 target, NULL);
3224 ctxt->instate = state;
3225 return;
3226 }
3227 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3228 if (buf == NULL) {
3229 htmlErrMemory(ctxt, NULL);
3230 ctxt->instate = state;
3231 return;
3232 }
3233 cur = CUR;
3234 if (!IS_BLANK(cur)) {
3235 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3236 "ParsePI: PI %s space expected\n", target, NULL);
3237 }
3238 SKIP_BLANKS;
3239 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003240 while ((cur != 0) && (cur != '>')) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003241 if (len + 5 >= size) {
3242 xmlChar *tmp;
3243
3244 size *= 2;
3245 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3246 if (tmp == NULL) {
3247 htmlErrMemory(ctxt, NULL);
3248 xmlFree(buf);
3249 ctxt->instate = state;
3250 return;
3251 }
3252 buf = tmp;
3253 }
3254 count++;
3255 if (count > 50) {
3256 GROW;
3257 count = 0;
3258 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003259 if (IS_CHAR(cur)) {
3260 COPY_BUF(l,buf,len,cur);
3261 } else {
3262 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3263 "Invalid char in processing instruction "
3264 "0x%X\n", cur);
3265 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003266 NEXTL(l);
3267 cur = CUR_CHAR(l);
3268 if (cur == 0) {
3269 SHRINK;
3270 GROW;
3271 cur = CUR_CHAR(l);
3272 }
3273 }
3274 buf[len] = 0;
3275 if (cur != '>') {
3276 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3277 "ParsePI: PI %s never end ...\n", target, NULL);
3278 } else {
3279 SKIP(1);
3280
3281 /*
3282 * SAX: PI detected.
3283 */
3284 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3285 (ctxt->sax->processingInstruction != NULL))
3286 ctxt->sax->processingInstruction(ctxt->userData,
3287 target, buf);
3288 }
3289 xmlFree(buf);
3290 } else {
3291 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3292 "PI is not started correctly", NULL, NULL);
3293 }
3294 ctxt->instate = state;
3295 }
3296}
3297
3298/**
3299 * htmlParseComment:
3300 * @ctxt: an HTML parser context
3301 *
3302 * Parse an XML (SGML) comment <!-- .... -->
3303 *
3304 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3305 */
3306static void
3307htmlParseComment(htmlParserCtxtPtr ctxt) {
3308 xmlChar *buf = NULL;
3309 int len;
3310 int size = HTML_PARSER_BUFFER_SIZE;
3311 int q, ql;
3312 int r, rl;
3313 int cur, l;
Haibo Huangd75f3892021-01-05 21:34:50 -08003314 int next, nl;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003315 xmlParserInputState state;
3316
3317 /*
3318 * Check that there is a comment right here.
3319 */
3320 if ((RAW != '<') || (NXT(1) != '!') ||
3321 (NXT(2) != '-') || (NXT(3) != '-')) return;
3322
3323 state = ctxt->instate;
3324 ctxt->instate = XML_PARSER_COMMENT;
3325 SHRINK;
3326 SKIP(4);
3327 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3328 if (buf == NULL) {
3329 htmlErrMemory(ctxt, "buffer allocation failed\n");
3330 ctxt->instate = state;
3331 return;
3332 }
3333 len = 0;
3334 buf[len] = 0;
3335 q = CUR_CHAR(ql);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003336 if (q == 0)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003337 goto unfinished;
3338 NEXTL(ql);
3339 r = CUR_CHAR(rl);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003340 if (r == 0)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003341 goto unfinished;
3342 NEXTL(rl);
3343 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003344 while ((cur != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003345 ((cur != '>') ||
3346 (r != '-') || (q != '-'))) {
Haibo Huangd75f3892021-01-05 21:34:50 -08003347 NEXTL(l);
3348 next = CUR_CHAR(nl);
3349 if (next == 0) {
3350 SHRINK;
3351 GROW;
3352 next = CUR_CHAR(nl);
3353 }
3354
3355 if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3356 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3357 "Comment incorrectly closed by '--!>'", NULL, NULL);
3358 cur = '>';
3359 break;
3360 }
3361
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003362 if (len + 5 >= size) {
3363 xmlChar *tmp;
3364
3365 size *= 2;
3366 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3367 if (tmp == NULL) {
3368 xmlFree(buf);
3369 htmlErrMemory(ctxt, "growing buffer failed\n");
3370 ctxt->instate = state;
3371 return;
3372 }
3373 buf = tmp;
3374 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003375 if (IS_CHAR(q)) {
3376 COPY_BUF(ql,buf,len,q);
3377 } else {
3378 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3379 "Invalid char in comment 0x%X\n", q);
3380 }
Haibo Huangd75f3892021-01-05 21:34:50 -08003381
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003382 q = r;
3383 ql = rl;
3384 r = cur;
3385 rl = l;
Haibo Huangd75f3892021-01-05 21:34:50 -08003386 cur = next;
3387 l = nl;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003388 }
3389 buf[len] = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003390 if (cur == '>') {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003391 NEXT;
3392 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3393 (!ctxt->disableSAX))
3394 ctxt->sax->comment(ctxt->userData, buf);
3395 xmlFree(buf);
3396 ctxt->instate = state;
3397 return;
3398 }
3399
3400unfinished:
3401 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3402 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3403 xmlFree(buf);
3404}
3405
3406/**
3407 * htmlParseCharRef:
3408 * @ctxt: an HTML parser context
3409 *
3410 * parse Reference declarations
3411 *
3412 * [66] CharRef ::= '&#' [0-9]+ ';' |
3413 * '&#x' [0-9a-fA-F]+ ';'
3414 *
3415 * Returns the value parsed (as an int)
3416 */
3417int
3418htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3419 int val = 0;
3420
3421 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3422 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3423 "htmlParseCharRef: context error\n",
3424 NULL, NULL);
3425 return(0);
3426 }
3427 if ((CUR == '&') && (NXT(1) == '#') &&
3428 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3429 SKIP(3);
3430 while (CUR != ';') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003431 if ((CUR >= '0') && (CUR <= '9')) {
3432 if (val < 0x110000)
3433 val = val * 16 + (CUR - '0');
3434 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3435 if (val < 0x110000)
3436 val = val * 16 + (CUR - 'a') + 10;
3437 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3438 if (val < 0x110000)
3439 val = val * 16 + (CUR - 'A') + 10;
3440 } else {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003441 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3442 "htmlParseCharRef: missing semicolon\n",
3443 NULL, NULL);
3444 break;
3445 }
3446 NEXT;
3447 }
3448 if (CUR == ';')
3449 NEXT;
3450 } else if ((CUR == '&') && (NXT(1) == '#')) {
3451 SKIP(2);
3452 while (CUR != ';') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003453 if ((CUR >= '0') && (CUR <= '9')) {
3454 if (val < 0x110000)
3455 val = val * 10 + (CUR - '0');
3456 } else {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003457 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3458 "htmlParseCharRef: missing semicolon\n",
3459 NULL, NULL);
3460 break;
3461 }
3462 NEXT;
3463 }
3464 if (CUR == ';')
3465 NEXT;
3466 } else {
3467 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3468 "htmlParseCharRef: invalid value\n", NULL, NULL);
3469 }
3470 /*
3471 * Check the value IS_CHAR ...
3472 */
3473 if (IS_CHAR(val)) {
3474 return(val);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003475 } else if (val >= 0x110000) {
3476 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3477 "htmlParseCharRef: value too large\n", NULL, NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003478 } else {
3479 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3480 "htmlParseCharRef: invalid xmlChar value %d\n",
3481 val);
3482 }
3483 return(0);
3484}
3485
3486
3487/**
3488 * htmlParseDocTypeDecl:
3489 * @ctxt: an HTML parser context
3490 *
3491 * parse a DOCTYPE declaration
3492 *
3493 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3494 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3495 */
3496
3497static void
3498htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3499 const xmlChar *name;
3500 xmlChar *ExternalID = NULL;
3501 xmlChar *URI = NULL;
3502
3503 /*
3504 * We know that '<!DOCTYPE' has been detected.
3505 */
3506 SKIP(9);
3507
3508 SKIP_BLANKS;
3509
3510 /*
3511 * Parse the DOCTYPE name.
3512 */
3513 name = htmlParseName(ctxt);
3514 if (name == NULL) {
3515 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3516 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3517 NULL, NULL);
3518 }
3519 /*
3520 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3521 */
3522
3523 SKIP_BLANKS;
3524
3525 /*
3526 * Check for SystemID and ExternalID
3527 */
3528 URI = htmlParseExternalID(ctxt, &ExternalID);
3529 SKIP_BLANKS;
3530
3531 /*
3532 * We should be at the end of the DOCTYPE declaration.
3533 */
3534 if (CUR != '>') {
3535 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3536 "DOCTYPE improperly terminated\n", NULL, NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003537 /* Ignore bogus content */
3538 while ((CUR != 0) && (CUR != '>'))
3539 NEXT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003540 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003541 if (CUR == '>')
3542 NEXT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003543
3544 /*
3545 * Create or update the document accordingly to the DOCTYPE
3546 */
3547 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3548 (!ctxt->disableSAX))
3549 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3550
3551 /*
3552 * Cleanup, since we don't use all those identifiers
3553 */
3554 if (URI != NULL) xmlFree(URI);
3555 if (ExternalID != NULL) xmlFree(ExternalID);
3556}
3557
3558/**
3559 * htmlParseAttribute:
3560 * @ctxt: an HTML parser context
3561 * @value: a xmlChar ** used to store the value of the attribute
3562 *
3563 * parse an attribute
3564 *
3565 * [41] Attribute ::= Name Eq AttValue
3566 *
3567 * [25] Eq ::= S? '=' S?
3568 *
3569 * With namespace:
3570 *
3571 * [NS 11] Attribute ::= QName Eq AttValue
3572 *
3573 * Also the case QName == xmlns:??? is handled independently as a namespace
3574 * definition.
3575 *
3576 * Returns the attribute name, and the value in *value.
3577 */
3578
3579static const xmlChar *
3580htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3581 const xmlChar *name;
3582 xmlChar *val = NULL;
3583
3584 *value = NULL;
3585 name = htmlParseHTMLName(ctxt);
3586 if (name == NULL) {
3587 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3588 "error parsing attribute name\n", NULL, NULL);
3589 return(NULL);
3590 }
3591
3592 /*
3593 * read the value
3594 */
3595 SKIP_BLANKS;
3596 if (CUR == '=') {
3597 NEXT;
3598 SKIP_BLANKS;
3599 val = htmlParseAttValue(ctxt);
3600 }
3601
3602 *value = val;
3603 return(name);
3604}
3605
3606/**
3607 * htmlCheckEncodingDirect:
3608 * @ctxt: an HTML parser context
3609 * @attvalue: the attribute value
3610 *
3611 * Checks an attribute value to detect
3612 * the encoding
3613 * If a new encoding is detected the parser is switched to decode
3614 * it and pass UTF8
3615 */
3616static void
3617htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3618
3619 if ((ctxt == NULL) || (encoding == NULL) ||
3620 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3621 return;
3622
3623 /* do not change encoding */
3624 if (ctxt->input->encoding != NULL)
3625 return;
3626
3627 if (encoding != NULL) {
3628 xmlCharEncoding enc;
3629 xmlCharEncodingHandlerPtr handler;
3630
3631 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3632
3633 if (ctxt->input->encoding != NULL)
3634 xmlFree((xmlChar *) ctxt->input->encoding);
3635 ctxt->input->encoding = xmlStrdup(encoding);
3636
3637 enc = xmlParseCharEncoding((const char *) encoding);
3638 /*
3639 * registered set of known encodings
3640 */
3641 if (enc != XML_CHAR_ENCODING_ERROR) {
3642 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3643 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3644 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3645 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3646 (ctxt->input->buf != NULL) &&
3647 (ctxt->input->buf->encoder == NULL)) {
3648 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3649 "htmlCheckEncoding: wrong encoding meta\n",
3650 NULL, NULL);
3651 } else {
3652 xmlSwitchEncoding(ctxt, enc);
3653 }
3654 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3655 } else {
3656 /*
3657 * fallback for unknown encodings
3658 */
3659 handler = xmlFindCharEncodingHandler((const char *) encoding);
3660 if (handler != NULL) {
3661 xmlSwitchToEncoding(ctxt, handler);
3662 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3663 } else {
3664 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3665 "htmlCheckEncoding: unknown encoding %s\n",
3666 encoding, NULL);
3667 }
3668 }
3669
3670 if ((ctxt->input->buf != NULL) &&
3671 (ctxt->input->buf->encoder != NULL) &&
3672 (ctxt->input->buf->raw != NULL) &&
3673 (ctxt->input->buf->buffer != NULL)) {
3674 int nbchars;
3675 int processed;
3676
3677 /*
3678 * convert as much as possible to the parser reading buffer.
3679 */
3680 processed = ctxt->input->cur - ctxt->input->base;
3681 xmlBufShrink(ctxt->input->buf->buffer, processed);
3682 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3683 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3684 if (nbchars < 0) {
3685 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3686 "htmlCheckEncoding: encoder error\n",
3687 NULL, NULL);
3688 }
3689 }
3690 }
3691}
3692
3693/**
3694 * htmlCheckEncoding:
3695 * @ctxt: an HTML parser context
3696 * @attvalue: the attribute value
3697 *
3698 * Checks an http-equiv attribute from a Meta tag to detect
3699 * the encoding
3700 * If a new encoding is detected the parser is switched to decode
3701 * it and pass UTF8
3702 */
3703static void
3704htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3705 const xmlChar *encoding;
3706
3707 if (!attvalue)
3708 return;
3709
3710 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3711 if (encoding != NULL) {
3712 encoding += 7;
3713 }
3714 /*
3715 * skip blank
3716 */
3717 if (encoding && IS_BLANK_CH(*encoding))
3718 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3719 if (encoding && *encoding == '=') {
3720 encoding ++;
3721 htmlCheckEncodingDirect(ctxt, encoding);
3722 }
3723}
3724
3725/**
3726 * htmlCheckMeta:
3727 * @ctxt: an HTML parser context
3728 * @atts: the attributes values
3729 *
3730 * Checks an attributes from a Meta tag
3731 */
3732static void
3733htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3734 int i;
3735 const xmlChar *att, *value;
3736 int http = 0;
3737 const xmlChar *content = NULL;
3738
3739 if ((ctxt == NULL) || (atts == NULL))
3740 return;
3741
3742 i = 0;
3743 att = atts[i++];
3744 while (att != NULL) {
3745 value = atts[i++];
3746 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3747 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3748 http = 1;
3749 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3750 htmlCheckEncodingDirect(ctxt, value);
3751 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3752 content = value;
3753 att = atts[i++];
3754 }
3755 if ((http) && (content != NULL))
3756 htmlCheckEncoding(ctxt, content);
3757
3758}
3759
3760/**
3761 * htmlParseStartTag:
3762 * @ctxt: an HTML parser context
3763 *
3764 * parse a start of tag either for rule element or
3765 * EmptyElement. In both case we don't parse the tag closing chars.
3766 *
3767 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3768 *
3769 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3770 *
3771 * With namespace:
3772 *
3773 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3774 *
3775 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3776 *
3777 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3778 */
3779
3780static int
3781htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3782 const xmlChar *name;
3783 const xmlChar *attname;
3784 xmlChar *attvalue;
3785 const xmlChar **atts;
3786 int nbatts = 0;
3787 int maxatts;
3788 int meta = 0;
3789 int i;
3790 int discardtag = 0;
3791
3792 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3793 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3794 "htmlParseStartTag: context error\n", NULL, NULL);
3795 return -1;
3796 }
3797 if (ctxt->instate == XML_PARSER_EOF)
3798 return(-1);
3799 if (CUR != '<') return -1;
3800 NEXT;
3801
3802 atts = ctxt->atts;
3803 maxatts = ctxt->maxatts;
3804
3805 GROW;
3806 name = htmlParseHTMLName(ctxt);
3807 if (name == NULL) {
3808 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3809 "htmlParseStartTag: invalid element name\n",
3810 NULL, NULL);
3811 /* if recover preserve text on classic misconstructs */
3812 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3813 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3814 htmlParseCharDataInternal(ctxt, '<');
3815 return(-1);
3816 }
3817
3818
3819 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003820 while ((CUR != 0) && (CUR != '>') &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003821 (ctxt->instate != XML_PARSER_EOF))
3822 NEXT;
3823 return -1;
3824 }
3825 if (xmlStrEqual(name, BAD_CAST"meta"))
3826 meta = 1;
3827
3828 /*
3829 * Check for auto-closure of HTML elements.
3830 */
3831 htmlAutoClose(ctxt, name);
3832
3833 /*
3834 * Check for implied HTML elements.
3835 */
3836 htmlCheckImplied(ctxt, name);
3837
3838 /*
3839 * Avoid html at any level > 0, head at any level != 1
3840 * or any attempt to recurse body
3841 */
3842 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3843 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3844 "htmlParseStartTag: misplaced <html> tag\n",
3845 name, NULL);
3846 discardtag = 1;
3847 ctxt->depth++;
3848 }
3849 if ((ctxt->nameNr != 1) &&
3850 (xmlStrEqual(name, BAD_CAST"head"))) {
3851 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3852 "htmlParseStartTag: misplaced <head> tag\n",
3853 name, NULL);
3854 discardtag = 1;
3855 ctxt->depth++;
3856 }
3857 if (xmlStrEqual(name, BAD_CAST"body")) {
3858 int indx;
3859 for (indx = 0;indx < ctxt->nameNr;indx++) {
3860 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3861 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3862 "htmlParseStartTag: misplaced <body> tag\n",
3863 name, NULL);
3864 discardtag = 1;
3865 ctxt->depth++;
3866 }
3867 }
3868 }
3869
3870 /*
3871 * Now parse the attributes, it ends up with the ending
3872 *
3873 * (S Attribute)* S?
3874 */
3875 SKIP_BLANKS;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003876 while ((CUR != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003877 (CUR != '>') &&
3878 ((CUR != '/') || (NXT(1) != '>'))) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003879 GROW;
3880 attname = htmlParseAttribute(ctxt, &attvalue);
3881 if (attname != NULL) {
3882
3883 /*
3884 * Well formedness requires at most one declaration of an attribute
3885 */
3886 for (i = 0; i < nbatts;i += 2) {
3887 if (xmlStrEqual(atts[i], attname)) {
3888 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3889 "Attribute %s redefined\n", attname, NULL);
3890 if (attvalue != NULL)
3891 xmlFree(attvalue);
3892 goto failed;
3893 }
3894 }
3895
3896 /*
3897 * Add the pair to atts
3898 */
3899 if (atts == NULL) {
3900 maxatts = 22; /* allow for 10 attrs by default */
3901 atts = (const xmlChar **)
3902 xmlMalloc(maxatts * sizeof(xmlChar *));
3903 if (atts == NULL) {
3904 htmlErrMemory(ctxt, NULL);
3905 if (attvalue != NULL)
3906 xmlFree(attvalue);
3907 goto failed;
3908 }
3909 ctxt->atts = atts;
3910 ctxt->maxatts = maxatts;
3911 } else if (nbatts + 4 > maxatts) {
3912 const xmlChar **n;
3913
3914 maxatts *= 2;
3915 n = (const xmlChar **) xmlRealloc((void *) atts,
3916 maxatts * sizeof(const xmlChar *));
3917 if (n == NULL) {
3918 htmlErrMemory(ctxt, NULL);
3919 if (attvalue != NULL)
3920 xmlFree(attvalue);
3921 goto failed;
3922 }
3923 atts = n;
3924 ctxt->atts = atts;
3925 ctxt->maxatts = maxatts;
3926 }
3927 atts[nbatts++] = attname;
3928 atts[nbatts++] = attvalue;
3929 atts[nbatts] = NULL;
3930 atts[nbatts + 1] = NULL;
3931 }
3932 else {
3933 if (attvalue != NULL)
3934 xmlFree(attvalue);
3935 /* Dump the bogus attribute string up to the next blank or
3936 * the end of the tag. */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003937 while ((CUR != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003938 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3939 ((CUR != '/') || (NXT(1) != '>')))
3940 NEXT;
3941 }
3942
3943failed:
3944 SKIP_BLANKS;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003945 }
3946
3947 /*
3948 * Handle specific association to the META tag
3949 */
3950 if (meta && (nbatts != 0))
3951 htmlCheckMeta(ctxt, atts);
3952
3953 /*
3954 * SAX: Start of Element !
3955 */
3956 if (!discardtag) {
3957 htmlnamePush(ctxt, name);
3958 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3959 if (nbatts != 0)
3960 ctxt->sax->startElement(ctxt->userData, name, atts);
3961 else
3962 ctxt->sax->startElement(ctxt->userData, name, NULL);
3963 }
3964 }
3965
3966 if (atts != NULL) {
3967 for (i = 1;i < nbatts;i += 2) {
3968 if (atts[i] != NULL)
3969 xmlFree((xmlChar *) atts[i]);
3970 }
3971 }
3972
3973 return(discardtag);
3974}
3975
3976/**
3977 * htmlParseEndTag:
3978 * @ctxt: an HTML parser context
3979 *
3980 * parse an end of tag
3981 *
3982 * [42] ETag ::= '</' Name S? '>'
3983 *
3984 * With namespace
3985 *
3986 * [NS 9] ETag ::= '</' QName S? '>'
3987 *
3988 * Returns 1 if the current level should be closed.
3989 */
3990
3991static int
3992htmlParseEndTag(htmlParserCtxtPtr ctxt)
3993{
3994 const xmlChar *name;
3995 const xmlChar *oldname;
3996 int i, ret;
3997
3998 if ((CUR != '<') || (NXT(1) != '/')) {
3999 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4000 "htmlParseEndTag: '</' not found\n", NULL, NULL);
4001 return (0);
4002 }
4003 SKIP(2);
4004
4005 name = htmlParseHTMLName(ctxt);
4006 if (name == NULL)
4007 return (0);
4008 /*
4009 * We should definitely be at the ending "S? '>'" part
4010 */
4011 SKIP_BLANKS;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004012 if (CUR != '>') {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004013 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4014 "End tag : expected '>'\n", NULL, NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004015 /* Skip to next '>' */
4016 while ((CUR != 0) && (CUR != '>'))
4017 NEXT;
4018 }
4019 if (CUR == '>')
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004020 NEXT;
4021
4022 /*
4023 * if we ignored misplaced tags in htmlParseStartTag don't pop them
4024 * out now.
4025 */
4026 if ((ctxt->depth > 0) &&
4027 (xmlStrEqual(name, BAD_CAST "html") ||
4028 xmlStrEqual(name, BAD_CAST "body") ||
4029 xmlStrEqual(name, BAD_CAST "head"))) {
4030 ctxt->depth--;
4031 return (0);
4032 }
4033
4034 /*
4035 * If the name read is not one of the element in the parsing stack
4036 * then return, it's just an error.
4037 */
4038 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4039 if (xmlStrEqual(name, ctxt->nameTab[i]))
4040 break;
4041 }
4042 if (i < 0) {
4043 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4044 "Unexpected end tag : %s\n", name, NULL);
4045 return (0);
4046 }
4047
4048
4049 /*
4050 * Check for auto-closure of HTML elements.
4051 */
4052
4053 htmlAutoCloseOnClose(ctxt, name);
4054
4055 /*
4056 * Well formedness constraints, opening and closing must match.
4057 * With the exception that the autoclose may have popped stuff out
4058 * of the stack.
4059 */
4060 if (!xmlStrEqual(name, ctxt->name)) {
4061 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4062 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4063 "Opening and ending tag mismatch: %s and %s\n",
4064 name, ctxt->name);
4065 }
4066 }
4067
4068 /*
4069 * SAX: End of Tag
4070 */
4071 oldname = ctxt->name;
4072 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4073 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4074 ctxt->sax->endElement(ctxt->userData, name);
4075 htmlNodeInfoPop(ctxt);
4076 htmlnamePop(ctxt);
4077 ret = 1;
4078 } else {
4079 ret = 0;
4080 }
4081
4082 return (ret);
4083}
4084
4085
4086/**
4087 * htmlParseReference:
4088 * @ctxt: an HTML parser context
4089 *
4090 * parse and handle entity references in content,
4091 * this will end-up in a call to character() since this is either a
4092 * CharRef, or a predefined entity.
4093 */
4094static void
4095htmlParseReference(htmlParserCtxtPtr ctxt) {
4096 const htmlEntityDesc * ent;
4097 xmlChar out[6];
4098 const xmlChar *name;
4099 if (CUR != '&') return;
4100
4101 if (NXT(1) == '#') {
4102 unsigned int c;
4103 int bits, i = 0;
4104
4105 c = htmlParseCharRef(ctxt);
4106 if (c == 0)
4107 return;
4108
4109 if (c < 0x80) { out[i++]= c; bits= -6; }
4110 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4111 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4112 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4113
4114 for ( ; bits >= 0; bits-= 6) {
4115 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4116 }
4117 out[i] = 0;
4118
4119 htmlCheckParagraph(ctxt);
4120 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4121 ctxt->sax->characters(ctxt->userData, out, i);
4122 } else {
4123 ent = htmlParseEntityRef(ctxt, &name);
4124 if (name == NULL) {
4125 htmlCheckParagraph(ctxt);
4126 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4127 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4128 return;
4129 }
4130 if ((ent == NULL) || !(ent->value > 0)) {
4131 htmlCheckParagraph(ctxt);
4132 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4133 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4134 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4135 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4136 }
4137 } else {
4138 unsigned int c;
4139 int bits, i = 0;
4140
4141 c = ent->value;
4142 if (c < 0x80)
4143 { out[i++]= c; bits= -6; }
4144 else if (c < 0x800)
4145 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4146 else if (c < 0x10000)
4147 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4148 else
4149 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4150
4151 for ( ; bits >= 0; bits-= 6) {
4152 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4153 }
4154 out[i] = 0;
4155
4156 htmlCheckParagraph(ctxt);
4157 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4158 ctxt->sax->characters(ctxt->userData, out, i);
4159 }
4160 }
4161}
4162
4163/**
4164 * htmlParseContent:
4165 * @ctxt: an HTML parser context
4166 *
4167 * Parse a content: comment, sub-element, reference or text.
4168 * Kept for compatibility with old code
4169 */
4170
4171static void
4172htmlParseContent(htmlParserCtxtPtr ctxt) {
4173 xmlChar *currentNode;
4174 int depth;
4175 const xmlChar *name;
4176
4177 currentNode = xmlStrdup(ctxt->name);
4178 depth = ctxt->nameNr;
4179 while (1) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004180 GROW;
4181
4182 if (ctxt->instate == XML_PARSER_EOF)
4183 break;
4184
4185 /*
4186 * Our tag or one of it's parent or children is ending.
4187 */
4188 if ((CUR == '<') && (NXT(1) == '/')) {
4189 if (htmlParseEndTag(ctxt) &&
4190 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4191 if (currentNode != NULL)
4192 xmlFree(currentNode);
4193 return;
4194 }
4195 continue; /* while */
4196 }
4197
4198 else if ((CUR == '<') &&
4199 ((IS_ASCII_LETTER(NXT(1))) ||
4200 (NXT(1) == '_') || (NXT(1) == ':'))) {
4201 name = htmlParseHTMLName_nonInvasive(ctxt);
4202 if (name == NULL) {
4203 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4204 "htmlParseStartTag: invalid element name\n",
4205 NULL, NULL);
4206 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004207 while ((CUR != 0) && (CUR != '>'))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004208 NEXT;
4209
4210 if (currentNode != NULL)
4211 xmlFree(currentNode);
4212 return;
4213 }
4214
4215 if (ctxt->name != NULL) {
4216 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4217 htmlAutoClose(ctxt, name);
4218 continue;
4219 }
4220 }
4221 }
4222
4223 /*
4224 * Has this node been popped out during parsing of
4225 * the next element
4226 */
4227 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4228 (!xmlStrEqual(currentNode, ctxt->name)))
4229 {
4230 if (currentNode != NULL) xmlFree(currentNode);
4231 return;
4232 }
4233
4234 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4235 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4236 /*
4237 * Handle SCRIPT/STYLE separately
4238 */
4239 htmlParseScript(ctxt);
4240 } else {
4241 /*
4242 * Sometimes DOCTYPE arrives in the middle of the document
4243 */
4244 if ((CUR == '<') && (NXT(1) == '!') &&
4245 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4246 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4247 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4248 (UPP(8) == 'E')) {
4249 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4250 "Misplaced DOCTYPE declaration\n",
4251 BAD_CAST "DOCTYPE" , NULL);
4252 htmlParseDocTypeDecl(ctxt);
4253 }
4254
4255 /*
4256 * First case : a comment
4257 */
4258 if ((CUR == '<') && (NXT(1) == '!') &&
4259 (NXT(2) == '-') && (NXT(3) == '-')) {
4260 htmlParseComment(ctxt);
4261 }
4262
4263 /*
4264 * Second case : a Processing Instruction.
4265 */
4266 else if ((CUR == '<') && (NXT(1) == '?')) {
4267 htmlParsePI(ctxt);
4268 }
4269
4270 /*
4271 * Third case : a sub-element.
4272 */
4273 else if (CUR == '<') {
4274 htmlParseElement(ctxt);
4275 }
4276
4277 /*
4278 * Fourth case : a reference. If if has not been resolved,
4279 * parsing returns it's Name, create the node
4280 */
4281 else if (CUR == '&') {
4282 htmlParseReference(ctxt);
4283 }
4284
4285 /*
4286 * Fifth case : end of the resource
4287 */
4288 else if (CUR == 0) {
4289 htmlAutoCloseOnEnd(ctxt);
4290 break;
4291 }
4292
4293 /*
4294 * Last case, text. Note that References are handled directly.
4295 */
4296 else {
4297 htmlParseCharData(ctxt);
4298 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004299 }
4300 GROW;
4301 }
4302 if (currentNode != NULL) xmlFree(currentNode);
4303}
4304
4305/**
4306 * htmlParseElement:
4307 * @ctxt: an HTML parser context
4308 *
4309 * parse an HTML element, this is highly recursive
4310 * this is kept for compatibility with previous code versions
4311 *
4312 * [39] element ::= EmptyElemTag | STag content ETag
4313 *
4314 * [41] Attribute ::= Name Eq AttValue
4315 */
4316
4317void
4318htmlParseElement(htmlParserCtxtPtr ctxt) {
4319 const xmlChar *name;
4320 xmlChar *currentNode = NULL;
4321 const htmlElemDesc * info;
4322 htmlParserNodeInfo node_info;
4323 int failed;
4324 int depth;
4325 const xmlChar *oldptr;
4326
4327 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4328 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4329 "htmlParseElement: context error\n", NULL, NULL);
4330 return;
4331 }
4332
4333 if (ctxt->instate == XML_PARSER_EOF)
4334 return;
4335
4336 /* Capture start position */
4337 if (ctxt->record_info) {
4338 node_info.begin_pos = ctxt->input->consumed +
4339 (CUR_PTR - ctxt->input->base);
4340 node_info.begin_line = ctxt->input->line;
4341 }
4342
4343 failed = htmlParseStartTag(ctxt);
4344 name = ctxt->name;
4345 if ((failed == -1) || (name == NULL)) {
4346 if (CUR == '>')
4347 NEXT;
4348 return;
4349 }
4350
4351 /*
4352 * Lookup the info for that element.
4353 */
4354 info = htmlTagLookup(name);
4355 if (info == NULL) {
4356 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4357 "Tag %s invalid\n", name, NULL);
4358 }
4359
4360 /*
4361 * Check for an Empty Element labeled the XML/SGML way
4362 */
4363 if ((CUR == '/') && (NXT(1) == '>')) {
4364 SKIP(2);
4365 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4366 ctxt->sax->endElement(ctxt->userData, name);
4367 htmlnamePop(ctxt);
4368 return;
4369 }
4370
4371 if (CUR == '>') {
4372 NEXT;
4373 } else {
4374 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4375 "Couldn't find end of Start Tag %s\n", name, NULL);
4376
4377 /*
4378 * end of parsing of this node.
4379 */
4380 if (xmlStrEqual(name, ctxt->name)) {
4381 nodePop(ctxt);
4382 htmlnamePop(ctxt);
4383 }
4384
4385 /*
4386 * Capture end position and add node
4387 */
4388 if (ctxt->record_info) {
4389 node_info.end_pos = ctxt->input->consumed +
4390 (CUR_PTR - ctxt->input->base);
4391 node_info.end_line = ctxt->input->line;
4392 node_info.node = ctxt->node;
4393 xmlParserAddNodeInfo(ctxt, &node_info);
4394 }
4395 return;
4396 }
4397
4398 /*
4399 * Check for an Empty Element from DTD definition
4400 */
4401 if ((info != NULL) && (info->empty)) {
4402 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4403 ctxt->sax->endElement(ctxt->userData, name);
4404 htmlnamePop(ctxt);
4405 return;
4406 }
4407
4408 /*
4409 * Parse the content of the element:
4410 */
4411 currentNode = xmlStrdup(ctxt->name);
4412 depth = ctxt->nameNr;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004413 while (CUR != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004414 oldptr = ctxt->input->cur;
4415 htmlParseContent(ctxt);
4416 if (oldptr==ctxt->input->cur) break;
4417 if (ctxt->nameNr < depth) break;
4418 }
4419
4420 /*
4421 * Capture end position and add node
4422 */
4423 if ( currentNode != NULL && ctxt->record_info ) {
4424 node_info.end_pos = ctxt->input->consumed +
4425 (CUR_PTR - ctxt->input->base);
4426 node_info.end_line = ctxt->input->line;
4427 node_info.node = ctxt->node;
4428 xmlParserAddNodeInfo(ctxt, &node_info);
4429 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004430 if (CUR == 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004431 htmlAutoCloseOnEnd(ctxt);
4432 }
4433
4434 if (currentNode != NULL)
4435 xmlFree(currentNode);
4436}
4437
4438static void
4439htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4440 /*
4441 * Capture end position and add node
4442 */
4443 if ( ctxt->node != NULL && ctxt->record_info ) {
4444 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4445 (CUR_PTR - ctxt->input->base);
4446 ctxt->nodeInfo->end_line = ctxt->input->line;
4447 ctxt->nodeInfo->node = ctxt->node;
4448 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4449 htmlNodeInfoPop(ctxt);
4450 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004451 if (CUR == 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004452 htmlAutoCloseOnEnd(ctxt);
4453 }
4454}
4455
4456/**
4457 * htmlParseElementInternal:
4458 * @ctxt: an HTML parser context
4459 *
4460 * parse an HTML element, new version, non recursive
4461 *
4462 * [39] element ::= EmptyElemTag | STag content ETag
4463 *
4464 * [41] Attribute ::= Name Eq AttValue
4465 */
4466
4467static void
4468htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4469 const xmlChar *name;
4470 const htmlElemDesc * info;
4471 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4472 int failed;
4473
4474 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4475 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4476 "htmlParseElementInternal: context error\n", NULL, NULL);
4477 return;
4478 }
4479
4480 if (ctxt->instate == XML_PARSER_EOF)
4481 return;
4482
4483 /* Capture start position */
4484 if (ctxt->record_info) {
4485 node_info.begin_pos = ctxt->input->consumed +
4486 (CUR_PTR - ctxt->input->base);
4487 node_info.begin_line = ctxt->input->line;
4488 }
4489
4490 failed = htmlParseStartTag(ctxt);
4491 name = ctxt->name;
4492 if ((failed == -1) || (name == NULL)) {
4493 if (CUR == '>')
4494 NEXT;
4495 return;
4496 }
4497
4498 /*
4499 * Lookup the info for that element.
4500 */
4501 info = htmlTagLookup(name);
4502 if (info == NULL) {
4503 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4504 "Tag %s invalid\n", name, NULL);
4505 }
4506
4507 /*
4508 * Check for an Empty Element labeled the XML/SGML way
4509 */
4510 if ((CUR == '/') && (NXT(1) == '>')) {
4511 SKIP(2);
4512 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4513 ctxt->sax->endElement(ctxt->userData, name);
4514 htmlnamePop(ctxt);
4515 return;
4516 }
4517
4518 if (CUR == '>') {
4519 NEXT;
4520 } else {
4521 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4522 "Couldn't find end of Start Tag %s\n", name, NULL);
4523
4524 /*
4525 * end of parsing of this node.
4526 */
4527 if (xmlStrEqual(name, ctxt->name)) {
4528 nodePop(ctxt);
4529 htmlnamePop(ctxt);
4530 }
4531
4532 if (ctxt->record_info)
4533 htmlNodeInfoPush(ctxt, &node_info);
4534 htmlParserFinishElementParsing(ctxt);
4535 return;
4536 }
4537
4538 /*
4539 * Check for an Empty Element from DTD definition
4540 */
4541 if ((info != NULL) && (info->empty)) {
4542 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4543 ctxt->sax->endElement(ctxt->userData, name);
4544 htmlnamePop(ctxt);
4545 return;
4546 }
4547
4548 if (ctxt->record_info)
4549 htmlNodeInfoPush(ctxt, &node_info);
4550}
4551
4552/**
4553 * htmlParseContentInternal:
4554 * @ctxt: an HTML parser context
4555 *
4556 * Parse a content: comment, sub-element, reference or text.
4557 * New version for non recursive htmlParseElementInternal
4558 */
4559
4560static void
4561htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4562 xmlChar *currentNode;
4563 int depth;
4564 const xmlChar *name;
4565
4566 currentNode = xmlStrdup(ctxt->name);
4567 depth = ctxt->nameNr;
4568 while (1) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004569 GROW;
4570
4571 if (ctxt->instate == XML_PARSER_EOF)
4572 break;
4573
4574 /*
4575 * Our tag or one of it's parent or children is ending.
4576 */
4577 if ((CUR == '<') && (NXT(1) == '/')) {
4578 if (htmlParseEndTag(ctxt) &&
4579 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4580 if (currentNode != NULL)
4581 xmlFree(currentNode);
4582
4583 currentNode = xmlStrdup(ctxt->name);
4584 depth = ctxt->nameNr;
4585 }
4586 continue; /* while */
4587 }
4588
4589 else if ((CUR == '<') &&
4590 ((IS_ASCII_LETTER(NXT(1))) ||
4591 (NXT(1) == '_') || (NXT(1) == ':'))) {
4592 name = htmlParseHTMLName_nonInvasive(ctxt);
4593 if (name == NULL) {
4594 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4595 "htmlParseStartTag: invalid element name\n",
4596 NULL, NULL);
4597 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004598 while ((CUR == 0) && (CUR != '>'))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004599 NEXT;
4600
4601 htmlParserFinishElementParsing(ctxt);
4602 if (currentNode != NULL)
4603 xmlFree(currentNode);
4604
4605 currentNode = xmlStrdup(ctxt->name);
4606 depth = ctxt->nameNr;
4607 continue;
4608 }
4609
4610 if (ctxt->name != NULL) {
4611 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4612 htmlAutoClose(ctxt, name);
4613 continue;
4614 }
4615 }
4616 }
4617
4618 /*
4619 * Has this node been popped out during parsing of
4620 * the next element
4621 */
4622 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4623 (!xmlStrEqual(currentNode, ctxt->name)))
4624 {
4625 htmlParserFinishElementParsing(ctxt);
4626 if (currentNode != NULL) xmlFree(currentNode);
4627
4628 currentNode = xmlStrdup(ctxt->name);
4629 depth = ctxt->nameNr;
4630 continue;
4631 }
4632
4633 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4634 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4635 /*
4636 * Handle SCRIPT/STYLE separately
4637 */
4638 htmlParseScript(ctxt);
4639 } else {
4640 /*
4641 * Sometimes DOCTYPE arrives in the middle of the document
4642 */
4643 if ((CUR == '<') && (NXT(1) == '!') &&
4644 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4645 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4646 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4647 (UPP(8) == 'E')) {
4648 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4649 "Misplaced DOCTYPE declaration\n",
4650 BAD_CAST "DOCTYPE" , NULL);
4651 htmlParseDocTypeDecl(ctxt);
4652 }
4653
4654 /*
4655 * First case : a comment
4656 */
4657 if ((CUR == '<') && (NXT(1) == '!') &&
4658 (NXT(2) == '-') && (NXT(3) == '-')) {
4659 htmlParseComment(ctxt);
4660 }
4661
4662 /*
4663 * Second case : a Processing Instruction.
4664 */
4665 else if ((CUR == '<') && (NXT(1) == '?')) {
4666 htmlParsePI(ctxt);
4667 }
4668
4669 /*
4670 * Third case : a sub-element.
4671 */
4672 else if (CUR == '<') {
4673 htmlParseElementInternal(ctxt);
4674 if (currentNode != NULL) xmlFree(currentNode);
4675
4676 currentNode = xmlStrdup(ctxt->name);
4677 depth = ctxt->nameNr;
4678 }
4679
4680 /*
4681 * Fourth case : a reference. If if has not been resolved,
4682 * parsing returns it's Name, create the node
4683 */
4684 else if (CUR == '&') {
4685 htmlParseReference(ctxt);
4686 }
4687
4688 /*
4689 * Fifth case : end of the resource
4690 */
4691 else if (CUR == 0) {
4692 htmlAutoCloseOnEnd(ctxt);
4693 break;
4694 }
4695
4696 /*
4697 * Last case, text. Note that References are handled directly.
4698 */
4699 else {
4700 htmlParseCharData(ctxt);
4701 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004702 }
4703 GROW;
4704 }
4705 if (currentNode != NULL) xmlFree(currentNode);
4706}
4707
4708/**
4709 * htmlParseContent:
4710 * @ctxt: an HTML parser context
4711 *
4712 * Parse a content: comment, sub-element, reference or text.
4713 * This is the entry point when called from parser.c
4714 */
4715
4716void
4717__htmlParseContent(void *ctxt) {
4718 if (ctxt != NULL)
4719 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4720}
4721
4722/**
4723 * htmlParseDocument:
4724 * @ctxt: an HTML parser context
4725 *
4726 * parse an HTML document (and build a tree if using the standard SAX
4727 * interface).
4728 *
4729 * Returns 0, -1 in case of error. the parser context is augmented
4730 * as a result of the parsing.
4731 */
4732
4733int
4734htmlParseDocument(htmlParserCtxtPtr ctxt) {
4735 xmlChar start[4];
4736 xmlCharEncoding enc;
4737 xmlDtdPtr dtd;
4738
4739 xmlInitParser();
4740
4741 htmlDefaultSAXHandlerInit();
4742
4743 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4744 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4745 "htmlParseDocument: context error\n", NULL, NULL);
4746 return(XML_ERR_INTERNAL_ERROR);
4747 }
4748 ctxt->html = 1;
4749 ctxt->linenumbers = 1;
4750 GROW;
4751 /*
4752 * SAX: beginning of the document processing.
4753 */
4754 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4755 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4756
4757 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4758 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4759 /*
4760 * Get the 4 first bytes and decode the charset
4761 * if enc != XML_CHAR_ENCODING_NONE
4762 * plug some encoding conversion routines.
4763 */
4764 start[0] = RAW;
4765 start[1] = NXT(1);
4766 start[2] = NXT(2);
4767 start[3] = NXT(3);
4768 enc = xmlDetectCharEncoding(&start[0], 4);
4769 if (enc != XML_CHAR_ENCODING_NONE) {
4770 xmlSwitchEncoding(ctxt, enc);
4771 }
4772 }
4773
4774 /*
4775 * Wipe out everything which is before the first '<'
4776 */
4777 SKIP_BLANKS;
4778 if (CUR == 0) {
4779 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4780 "Document is empty\n", NULL, NULL);
4781 }
4782
4783 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4784 ctxt->sax->startDocument(ctxt->userData);
4785
4786
4787 /*
4788 * Parse possible comments and PIs before any content
4789 */
4790 while (((CUR == '<') && (NXT(1) == '!') &&
4791 (NXT(2) == '-') && (NXT(3) == '-')) ||
4792 ((CUR == '<') && (NXT(1) == '?'))) {
4793 htmlParseComment(ctxt);
4794 htmlParsePI(ctxt);
4795 SKIP_BLANKS;
4796 }
4797
4798
4799 /*
4800 * Then possibly doc type declaration(s) and more Misc
4801 * (doctypedecl Misc*)?
4802 */
4803 if ((CUR == '<') && (NXT(1) == '!') &&
4804 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4805 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4806 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4807 (UPP(8) == 'E')) {
4808 htmlParseDocTypeDecl(ctxt);
4809 }
4810 SKIP_BLANKS;
4811
4812 /*
4813 * Parse possible comments and PIs before any content
4814 */
4815 while (((CUR == '<') && (NXT(1) == '!') &&
4816 (NXT(2) == '-') && (NXT(3) == '-')) ||
4817 ((CUR == '<') && (NXT(1) == '?'))) {
4818 htmlParseComment(ctxt);
4819 htmlParsePI(ctxt);
4820 SKIP_BLANKS;
4821 }
4822
4823 /*
4824 * Time to start parsing the tree itself
4825 */
4826 htmlParseContentInternal(ctxt);
4827
4828 /*
4829 * autoclose
4830 */
4831 if (CUR == 0)
4832 htmlAutoCloseOnEnd(ctxt);
4833
4834
4835 /*
4836 * SAX: end of the document processing.
4837 */
4838 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4839 ctxt->sax->endDocument(ctxt->userData);
4840
4841 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4842 dtd = xmlGetIntSubset(ctxt->myDoc);
4843 if (dtd == NULL)
4844 ctxt->myDoc->intSubset =
4845 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4846 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4847 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4848 }
4849 if (! ctxt->wellFormed) return(-1);
4850 return(0);
4851}
4852
4853
4854/************************************************************************
4855 * *
4856 * Parser contexts handling *
4857 * *
4858 ************************************************************************/
4859
4860/**
4861 * htmlInitParserCtxt:
4862 * @ctxt: an HTML parser context
4863 *
4864 * Initialize a parser context
4865 *
4866 * Returns 0 in case of success and -1 in case of error
4867 */
4868
4869static int
4870htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4871{
4872 htmlSAXHandler *sax;
4873
4874 if (ctxt == NULL) return(-1);
4875 memset(ctxt, 0, sizeof(htmlParserCtxt));
4876
4877 ctxt->dict = xmlDictCreate();
4878 if (ctxt->dict == NULL) {
4879 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4880 return(-1);
4881 }
4882 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4883 if (sax == NULL) {
4884 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4885 return(-1);
4886 }
4887 else
4888 memset(sax, 0, sizeof(htmlSAXHandler));
4889
4890 /* Allocate the Input stack */
4891 ctxt->inputTab = (htmlParserInputPtr *)
4892 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4893 if (ctxt->inputTab == NULL) {
4894 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4895 ctxt->inputNr = 0;
4896 ctxt->inputMax = 0;
4897 ctxt->input = NULL;
4898 return(-1);
4899 }
4900 ctxt->inputNr = 0;
4901 ctxt->inputMax = 5;
4902 ctxt->input = NULL;
4903 ctxt->version = NULL;
4904 ctxt->encoding = NULL;
4905 ctxt->standalone = -1;
4906 ctxt->instate = XML_PARSER_START;
4907
4908 /* Allocate the Node stack */
4909 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4910 if (ctxt->nodeTab == NULL) {
4911 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4912 ctxt->nodeNr = 0;
4913 ctxt->nodeMax = 0;
4914 ctxt->node = NULL;
4915 ctxt->inputNr = 0;
4916 ctxt->inputMax = 0;
4917 ctxt->input = NULL;
4918 return(-1);
4919 }
4920 ctxt->nodeNr = 0;
4921 ctxt->nodeMax = 10;
4922 ctxt->node = NULL;
4923
4924 /* Allocate the Name stack */
4925 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4926 if (ctxt->nameTab == NULL) {
4927 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4928 ctxt->nameNr = 0;
4929 ctxt->nameMax = 0;
4930 ctxt->name = NULL;
4931 ctxt->nodeNr = 0;
4932 ctxt->nodeMax = 0;
4933 ctxt->node = NULL;
4934 ctxt->inputNr = 0;
4935 ctxt->inputMax = 0;
4936 ctxt->input = NULL;
4937 return(-1);
4938 }
4939 ctxt->nameNr = 0;
4940 ctxt->nameMax = 10;
4941 ctxt->name = NULL;
4942
4943 ctxt->nodeInfoTab = NULL;
4944 ctxt->nodeInfoNr = 0;
4945 ctxt->nodeInfoMax = 0;
4946
4947 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4948 else {
4949 ctxt->sax = sax;
4950 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4951 }
4952 ctxt->userData = ctxt;
4953 ctxt->myDoc = NULL;
4954 ctxt->wellFormed = 1;
4955 ctxt->replaceEntities = 0;
4956 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4957 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4958 ctxt->html = 1;
4959 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4960 ctxt->vctxt.userData = ctxt;
4961 ctxt->vctxt.error = xmlParserValidityError;
4962 ctxt->vctxt.warning = xmlParserValidityWarning;
4963 ctxt->record_info = 0;
4964 ctxt->validate = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004965 ctxt->checkIndex = 0;
4966 ctxt->catalogs = NULL;
4967 xmlInitNodeInfoSeq(&ctxt->node_seq);
4968 return(0);
4969}
4970
/**
 * htmlFreeParserCtxt:
 * @ctxt:  an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * This is a plain forwarder: the HTML context is handed unchanged to
 * xmlFreeParserCtxt(), which performs the actual release.
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    xmlFreeParserCtxt(ctxt);
}
4984
4985/**
4986 * htmlNewParserCtxt:
4987 *
4988 * Allocate and initialize a new parser context.
4989 *
4990 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4991 */
4992
4993htmlParserCtxtPtr
4994htmlNewParserCtxt(void)
4995{
4996 xmlParserCtxtPtr ctxt;
4997
4998 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4999 if (ctxt == NULL) {
5000 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5001 return(NULL);
5002 }
5003 memset(ctxt, 0, sizeof(xmlParserCtxt));
5004 if (htmlInitParserCtxt(ctxt) < 0) {
5005 htmlFreeParserCtxt(ctxt);
5006 return(NULL);
5007 }
5008 return(ctxt);
5009}
5010
5011/**
5012 * htmlCreateMemoryParserCtxt:
5013 * @buffer: a pointer to a char array
5014 * @size: the size of the array
5015 *
5016 * Create a parser context for an HTML in-memory document.
5017 *
5018 * Returns the new parser context or NULL
5019 */
5020htmlParserCtxtPtr
5021htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5022 xmlParserCtxtPtr ctxt;
5023 xmlParserInputPtr input;
5024 xmlParserInputBufferPtr buf;
5025
5026 if (buffer == NULL)
5027 return(NULL);
5028 if (size <= 0)
5029 return(NULL);
5030
5031 ctxt = htmlNewParserCtxt();
5032 if (ctxt == NULL)
5033 return(NULL);
5034
5035 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5036 if (buf == NULL) return(NULL);
5037
5038 input = xmlNewInputStream(ctxt);
5039 if (input == NULL) {
5040 xmlFreeParserCtxt(ctxt);
5041 return(NULL);
5042 }
5043
5044 input->filename = NULL;
5045 input->buf = buf;
5046 xmlBufResetInput(buf->buffer, input);
5047
5048 inputPush(ctxt, input);
5049 return(ctxt);
5050}
5051
5052/**
5053 * htmlCreateDocParserCtxt:
5054 * @cur: a pointer to an array of xmlChar
5055 * @encoding: a free form C string describing the HTML document encoding, or NULL
5056 *
5057 * Create a parser context for an HTML document.
5058 *
5059 * TODO: check the need to add encoding handling there
5060 *
5061 * Returns the new parser context or NULL
5062 */
5063static htmlParserCtxtPtr
5064htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5065 int len;
5066 htmlParserCtxtPtr ctxt;
5067
5068 if (cur == NULL)
5069 return(NULL);
5070 len = xmlStrlen(cur);
5071 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5072 if (ctxt == NULL)
5073 return(NULL);
5074
5075 if (encoding != NULL) {
5076 xmlCharEncoding enc;
5077 xmlCharEncodingHandlerPtr handler;
5078
5079 if (ctxt->input->encoding != NULL)
5080 xmlFree((xmlChar *) ctxt->input->encoding);
5081 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5082
5083 enc = xmlParseCharEncoding(encoding);
5084 /*
5085 * registered set of known encodings
5086 */
5087 if (enc != XML_CHAR_ENCODING_ERROR) {
5088 xmlSwitchEncoding(ctxt, enc);
5089 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5090 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5091 "Unsupported encoding %s\n",
5092 (const xmlChar *) encoding, NULL);
5093 }
5094 } else {
5095 /*
5096 * fallback for unknown encodings
5097 */
5098 handler = xmlFindCharEncodingHandler((const char *) encoding);
5099 if (handler != NULL) {
5100 xmlSwitchToEncoding(ctxt, handler);
5101 } else {
5102 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5103 "Unsupported encoding %s\n",
5104 (const xmlChar *) encoding, NULL);
5105 }
5106 }
5107 }
5108 return(ctxt);
5109}
5110
5111#ifdef LIBXML_PUSH_ENABLED
5112/************************************************************************
5113 * *
5114 * Progressive parsing interfaces *
5115 * *
5116 ************************************************************************/
5117
5118/**
5119 * htmlParseLookupSequence:
5120 * @ctxt: an HTML parser context
5121 * @first: the first char to lookup
5122 * @next: the next char to lookup or zero
5123 * @third: the next char to lookup or zero
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005124 * @ignoreattrval: skip over attribute values
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005125 *
5126 * Try to find if a sequence (first, next, third) or just (first next) or
5127 * (first) is available in the input stream.
5128 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5129 * to avoid rescanning sequences of bytes, it DOES change the state of the
5130 * parser, do not use liberally.
5131 * This is basically similar to xmlParseLookupSequence()
5132 *
5133 * Returns the index to the current parsing point if the full sequence
5134 * is available, -1 otherwise.
5135 */
5136static int
5137htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005138 xmlChar next, xmlChar third, int ignoreattrval)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005139{
5140 int base, len;
5141 htmlParserInputPtr in;
5142 const xmlChar *buf;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005143 int invalue = 0;
5144 char valdellim = 0x0;
5145
5146 in = ctxt->input;
5147 if (in == NULL)
5148 return (-1);
5149
5150 base = in->cur - in->base;
5151 if (base < 0)
5152 return (-1);
5153
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005154 if (ctxt->checkIndex > base) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005155 base = ctxt->checkIndex;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005156 /* Abuse hasPErefs member to restore current state. */
5157 invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5158 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005159
5160 if (in->buf == NULL) {
5161 buf = in->base;
5162 len = in->length;
5163 } else {
5164 buf = xmlBufContent(in->buf->buffer);
5165 len = xmlBufUse(in->buf->buffer);
5166 }
5167
5168 /* take into account the sequence length */
5169 if (third)
5170 len -= 2;
5171 else if (next)
5172 len--;
5173 for (; base < len; base++) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005174 if (ignoreattrval) {
5175 if (buf[base] == '"' || buf[base] == '\'') {
5176 if (invalue) {
5177 if (buf[base] == valdellim) {
5178 invalue = 0;
5179 continue;
5180 }
5181 } else {
5182 valdellim = buf[base];
5183 invalue = 1;
5184 continue;
5185 }
5186 } else if (invalue) {
5187 continue;
5188 }
5189 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005190 if (buf[base] == first) {
5191 if (third != 0) {
5192 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5193 continue;
5194 } else if (next != 0) {
5195 if (buf[base + 1] != next)
5196 continue;
5197 }
5198 ctxt->checkIndex = 0;
5199#ifdef DEBUG_PUSH
5200 if (next == 0)
5201 xmlGenericError(xmlGenericErrorContext,
5202 "HPP: lookup '%c' found at %d\n",
5203 first, base);
5204 else if (third == 0)
5205 xmlGenericError(xmlGenericErrorContext,
5206 "HPP: lookup '%c%c' found at %d\n",
5207 first, next, base);
5208 else
5209 xmlGenericError(xmlGenericErrorContext,
5210 "HPP: lookup '%c%c%c' found at %d\n",
5211 first, next, third, base);
5212#endif
5213 return (base - (in->cur - in->base));
5214 }
5215 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005216 ctxt->checkIndex = base;
5217 /* Abuse hasPErefs member to track current state. */
5218 if (invalue)
5219 ctxt->hasPErefs |= 1;
5220 else
5221 ctxt->hasPErefs &= ~1;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005222#ifdef DEBUG_PUSH
5223 if (next == 0)
5224 xmlGenericError(xmlGenericErrorContext,
5225 "HPP: lookup '%c' failed\n", first);
5226 else if (third == 0)
5227 xmlGenericError(xmlGenericErrorContext,
5228 "HPP: lookup '%c%c' failed\n", first, next);
5229 else
5230 xmlGenericError(xmlGenericErrorContext,
5231 "HPP: lookup '%c%c%c' failed\n", first, next,
5232 third);
5233#endif
5234 return (-1);
5235}
5236
5237/**
Haibo Huangd75f3892021-01-05 21:34:50 -08005238 * htmlParseLookupCommentEnd:
5239 * @ctxt: an HTML parser context
5240 *
5241 * Try to find a comment end tag in the input stream
5242 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5243 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5244 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5245 * to avoid rescanning sequences of bytes, it DOES change the state of the
5246 * parser, do not use liberally.
5247 * This wraps to htmlParseLookupSequence()
5248 *
5249 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5250 */
5251static int
5252htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5253{
5254 int mark = 0;
5255 int cur = CUR_PTR - BASE_PTR;
5256
5257 while (mark >= 0) {
5258 mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5259 if ((mark < 0) ||
5260 (NXT(mark+2) == '>') ||
5261 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5262 return mark;
5263 }
5264 ctxt->checkIndex = cur + mark + 1;
5265 }
5266 return mark;
5267}
5268
5269
5270/**
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005271 * htmlParseTryOrFinish:
5272 * @ctxt: an HTML parser context
5273 * @terminate: last chunk indicator
5274 *
5275 * Try to progress on parsing
5276 *
5277 * Returns zero if no parsing was possible
5278 */
5279static int
5280htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5281 int ret = 0;
5282 htmlParserInputPtr in;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005283 ptrdiff_t avail = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005284 xmlChar cur, next;
5285
5286 htmlParserNodeInfo node_info;
5287
5288#ifdef DEBUG_PUSH
5289 switch (ctxt->instate) {
5290 case XML_PARSER_EOF:
5291 xmlGenericError(xmlGenericErrorContext,
5292 "HPP: try EOF\n"); break;
5293 case XML_PARSER_START:
5294 xmlGenericError(xmlGenericErrorContext,
5295 "HPP: try START\n"); break;
5296 case XML_PARSER_MISC:
5297 xmlGenericError(xmlGenericErrorContext,
5298 "HPP: try MISC\n");break;
5299 case XML_PARSER_COMMENT:
5300 xmlGenericError(xmlGenericErrorContext,
5301 "HPP: try COMMENT\n");break;
5302 case XML_PARSER_PROLOG:
5303 xmlGenericError(xmlGenericErrorContext,
5304 "HPP: try PROLOG\n");break;
5305 case XML_PARSER_START_TAG:
5306 xmlGenericError(xmlGenericErrorContext,
5307 "HPP: try START_TAG\n");break;
5308 case XML_PARSER_CONTENT:
5309 xmlGenericError(xmlGenericErrorContext,
5310 "HPP: try CONTENT\n");break;
5311 case XML_PARSER_CDATA_SECTION:
5312 xmlGenericError(xmlGenericErrorContext,
5313 "HPP: try CDATA_SECTION\n");break;
5314 case XML_PARSER_END_TAG:
5315 xmlGenericError(xmlGenericErrorContext,
5316 "HPP: try END_TAG\n");break;
5317 case XML_PARSER_ENTITY_DECL:
5318 xmlGenericError(xmlGenericErrorContext,
5319 "HPP: try ENTITY_DECL\n");break;
5320 case XML_PARSER_ENTITY_VALUE:
5321 xmlGenericError(xmlGenericErrorContext,
5322 "HPP: try ENTITY_VALUE\n");break;
5323 case XML_PARSER_ATTRIBUTE_VALUE:
5324 xmlGenericError(xmlGenericErrorContext,
5325 "HPP: try ATTRIBUTE_VALUE\n");break;
5326 case XML_PARSER_DTD:
5327 xmlGenericError(xmlGenericErrorContext,
5328 "HPP: try DTD\n");break;
5329 case XML_PARSER_EPILOG:
5330 xmlGenericError(xmlGenericErrorContext,
5331 "HPP: try EPILOG\n");break;
5332 case XML_PARSER_PI:
5333 xmlGenericError(xmlGenericErrorContext,
5334 "HPP: try PI\n");break;
5335 case XML_PARSER_SYSTEM_LITERAL:
5336 xmlGenericError(xmlGenericErrorContext,
5337 "HPP: try SYSTEM_LITERAL\n");break;
5338 }
5339#endif
5340
5341 while (1) {
5342
5343 in = ctxt->input;
5344 if (in == NULL) break;
5345 if (in->buf == NULL)
5346 avail = in->length - (in->cur - in->base);
5347 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005348 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5349 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005350 if ((avail == 0) && (terminate)) {
5351 htmlAutoCloseOnEnd(ctxt);
5352 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5353 /*
5354 * SAX: end of the document processing.
5355 */
5356 ctxt->instate = XML_PARSER_EOF;
5357 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5358 ctxt->sax->endDocument(ctxt->userData);
5359 }
5360 }
5361 if (avail < 1)
5362 goto done;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005363 /*
5364 * This is done to make progress and avoid an infinite loop
5365 * if a parsing attempt was aborted by hitting a NUL byte. After
5366 * changing htmlCurrentChar, this probably isn't necessary anymore.
5367 * We should consider removing this check.
5368 */
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005369 cur = in->cur[0];
5370 if (cur == 0) {
5371 SKIP(1);
5372 continue;
5373 }
5374
5375 switch (ctxt->instate) {
5376 case XML_PARSER_EOF:
5377 /*
5378 * Document parsing is done !
5379 */
5380 goto done;
5381 case XML_PARSER_START:
5382 /*
5383 * Very first chars read from the document flow.
5384 */
5385 cur = in->cur[0];
5386 if (IS_BLANK_CH(cur)) {
5387 SKIP_BLANKS;
5388 if (in->buf == NULL)
5389 avail = in->length - (in->cur - in->base);
5390 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005391 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5392 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005393 }
5394 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5395 ctxt->sax->setDocumentLocator(ctxt->userData,
5396 &xmlDefaultSAXLocator);
5397 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5398 (!ctxt->disableSAX))
5399 ctxt->sax->startDocument(ctxt->userData);
5400
5401 cur = in->cur[0];
5402 next = in->cur[1];
5403 if ((cur == '<') && (next == '!') &&
5404 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5405 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5406 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5407 (UPP(8) == 'E')) {
5408 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005409 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005410 goto done;
5411#ifdef DEBUG_PUSH
5412 xmlGenericError(xmlGenericErrorContext,
5413 "HPP: Parsing internal subset\n");
5414#endif
5415 htmlParseDocTypeDecl(ctxt);
5416 ctxt->instate = XML_PARSER_PROLOG;
5417#ifdef DEBUG_PUSH
5418 xmlGenericError(xmlGenericErrorContext,
5419 "HPP: entering PROLOG\n");
5420#endif
5421 } else {
5422 ctxt->instate = XML_PARSER_MISC;
5423#ifdef DEBUG_PUSH
5424 xmlGenericError(xmlGenericErrorContext,
5425 "HPP: entering MISC\n");
5426#endif
5427 }
5428 break;
5429 case XML_PARSER_MISC:
5430 SKIP_BLANKS;
5431 if (in->buf == NULL)
5432 avail = in->length - (in->cur - in->base);
5433 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005434 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5435 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005436 /*
5437 * no chars in buffer
5438 */
5439 if (avail < 1)
5440 goto done;
5441 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005442 * not enough chars in buffer
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005443 */
5444 if (avail < 2) {
5445 if (!terminate)
5446 goto done;
5447 else
5448 next = ' ';
5449 } else {
5450 next = in->cur[1];
5451 }
5452 cur = in->cur[0];
5453 if ((cur == '<') && (next == '!') &&
5454 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Haibo Huangd75f3892021-01-05 21:34:50 -08005455 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005456 goto done;
5457#ifdef DEBUG_PUSH
5458 xmlGenericError(xmlGenericErrorContext,
5459 "HPP: Parsing Comment\n");
5460#endif
5461 htmlParseComment(ctxt);
5462 ctxt->instate = XML_PARSER_MISC;
5463 } else if ((cur == '<') && (next == '?')) {
5464 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005465 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005466 goto done;
5467#ifdef DEBUG_PUSH
5468 xmlGenericError(xmlGenericErrorContext,
5469 "HPP: Parsing PI\n");
5470#endif
5471 htmlParsePI(ctxt);
5472 ctxt->instate = XML_PARSER_MISC;
5473 } else if ((cur == '<') && (next == '!') &&
5474 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5475 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5476 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5477 (UPP(8) == 'E')) {
5478 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005479 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005480 goto done;
5481#ifdef DEBUG_PUSH
5482 xmlGenericError(xmlGenericErrorContext,
5483 "HPP: Parsing internal subset\n");
5484#endif
5485 htmlParseDocTypeDecl(ctxt);
5486 ctxt->instate = XML_PARSER_PROLOG;
5487#ifdef DEBUG_PUSH
5488 xmlGenericError(xmlGenericErrorContext,
5489 "HPP: entering PROLOG\n");
5490#endif
5491 } else if ((cur == '<') && (next == '!') &&
5492 (avail < 9)) {
5493 goto done;
5494 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005495 ctxt->instate = XML_PARSER_CONTENT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005496#ifdef DEBUG_PUSH
5497 xmlGenericError(xmlGenericErrorContext,
5498 "HPP: entering START_TAG\n");
5499#endif
5500 }
5501 break;
5502 case XML_PARSER_PROLOG:
5503 SKIP_BLANKS;
5504 if (in->buf == NULL)
5505 avail = in->length - (in->cur - in->base);
5506 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005507 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5508 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005509 if (avail < 2)
5510 goto done;
5511 cur = in->cur[0];
5512 next = in->cur[1];
5513 if ((cur == '<') && (next == '!') &&
5514 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Haibo Huangd75f3892021-01-05 21:34:50 -08005515 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005516 goto done;
5517#ifdef DEBUG_PUSH
5518 xmlGenericError(xmlGenericErrorContext,
5519 "HPP: Parsing Comment\n");
5520#endif
5521 htmlParseComment(ctxt);
5522 ctxt->instate = XML_PARSER_PROLOG;
5523 } else if ((cur == '<') && (next == '?')) {
5524 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005525 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005526 goto done;
5527#ifdef DEBUG_PUSH
5528 xmlGenericError(xmlGenericErrorContext,
5529 "HPP: Parsing PI\n");
5530#endif
5531 htmlParsePI(ctxt);
5532 ctxt->instate = XML_PARSER_PROLOG;
5533 } else if ((cur == '<') && (next == '!') &&
5534 (avail < 4)) {
5535 goto done;
5536 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005537 ctxt->instate = XML_PARSER_CONTENT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005538#ifdef DEBUG_PUSH
5539 xmlGenericError(xmlGenericErrorContext,
5540 "HPP: entering START_TAG\n");
5541#endif
5542 }
5543 break;
5544 case XML_PARSER_EPILOG:
5545 if (in->buf == NULL)
5546 avail = in->length - (in->cur - in->base);
5547 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005548 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5549 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005550 if (avail < 1)
5551 goto done;
5552 cur = in->cur[0];
5553 if (IS_BLANK_CH(cur)) {
5554 htmlParseCharData(ctxt);
5555 goto done;
5556 }
5557 if (avail < 2)
5558 goto done;
5559 next = in->cur[1];
5560 if ((cur == '<') && (next == '!') &&
5561 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Haibo Huangd75f3892021-01-05 21:34:50 -08005562 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005563 goto done;
5564#ifdef DEBUG_PUSH
5565 xmlGenericError(xmlGenericErrorContext,
5566 "HPP: Parsing Comment\n");
5567#endif
5568 htmlParseComment(ctxt);
5569 ctxt->instate = XML_PARSER_EPILOG;
5570 } else if ((cur == '<') && (next == '?')) {
5571 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005572 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005573 goto done;
5574#ifdef DEBUG_PUSH
5575 xmlGenericError(xmlGenericErrorContext,
5576 "HPP: Parsing PI\n");
5577#endif
5578 htmlParsePI(ctxt);
5579 ctxt->instate = XML_PARSER_EPILOG;
5580 } else if ((cur == '<') && (next == '!') &&
5581 (avail < 4)) {
5582 goto done;
5583 } else {
5584 ctxt->errNo = XML_ERR_DOCUMENT_END;
5585 ctxt->wellFormed = 0;
5586 ctxt->instate = XML_PARSER_EOF;
5587#ifdef DEBUG_PUSH
5588 xmlGenericError(xmlGenericErrorContext,
5589 "HPP: entering EOF\n");
5590#endif
5591 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5592 ctxt->sax->endDocument(ctxt->userData);
5593 goto done;
5594 }
5595 break;
5596 case XML_PARSER_START_TAG: {
5597 const xmlChar *name;
5598 int failed;
5599 const htmlElemDesc * info;
5600
5601 /*
5602 * no chars in buffer
5603 */
5604 if (avail < 1)
5605 goto done;
5606 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005607 * not enough chars in buffer
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005608 */
5609 if (avail < 2) {
5610 if (!terminate)
5611 goto done;
5612 else
5613 next = ' ';
5614 } else {
5615 next = in->cur[1];
5616 }
5617 cur = in->cur[0];
5618 if (cur != '<') {
5619 ctxt->instate = XML_PARSER_CONTENT;
5620#ifdef DEBUG_PUSH
5621 xmlGenericError(xmlGenericErrorContext,
5622 "HPP: entering CONTENT\n");
5623#endif
5624 break;
5625 }
5626 if (next == '/') {
5627 ctxt->instate = XML_PARSER_END_TAG;
5628 ctxt->checkIndex = 0;
5629#ifdef DEBUG_PUSH
5630 xmlGenericError(xmlGenericErrorContext,
5631 "HPP: entering END_TAG\n");
5632#endif
5633 break;
5634 }
5635 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005636 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005637 goto done;
5638
5639 /* Capture start position */
5640 if (ctxt->record_info) {
5641 node_info.begin_pos = ctxt->input->consumed +
5642 (CUR_PTR - ctxt->input->base);
5643 node_info.begin_line = ctxt->input->line;
5644 }
5645
5646
5647 failed = htmlParseStartTag(ctxt);
5648 name = ctxt->name;
5649 if ((failed == -1) ||
5650 (name == NULL)) {
5651 if (CUR == '>')
5652 NEXT;
5653 break;
5654 }
5655
5656 /*
5657 * Lookup the info for that element.
5658 */
5659 info = htmlTagLookup(name);
5660 if (info == NULL) {
5661 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5662 "Tag %s invalid\n", name, NULL);
5663 }
5664
5665 /*
5666 * Check for an Empty Element labeled the XML/SGML way
5667 */
5668 if ((CUR == '/') && (NXT(1) == '>')) {
5669 SKIP(2);
5670 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5671 ctxt->sax->endElement(ctxt->userData, name);
5672 htmlnamePop(ctxt);
5673 ctxt->instate = XML_PARSER_CONTENT;
5674#ifdef DEBUG_PUSH
5675 xmlGenericError(xmlGenericErrorContext,
5676 "HPP: entering CONTENT\n");
5677#endif
5678 break;
5679 }
5680
5681 if (CUR == '>') {
5682 NEXT;
5683 } else {
5684 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5685 "Couldn't find end of Start Tag %s\n",
5686 name, NULL);
5687
5688 /*
5689 * end of parsing of this node.
5690 */
5691 if (xmlStrEqual(name, ctxt->name)) {
5692 nodePop(ctxt);
5693 htmlnamePop(ctxt);
5694 }
5695
5696 if (ctxt->record_info)
5697 htmlNodeInfoPush(ctxt, &node_info);
5698
5699 ctxt->instate = XML_PARSER_CONTENT;
5700#ifdef DEBUG_PUSH
5701 xmlGenericError(xmlGenericErrorContext,
5702 "HPP: entering CONTENT\n");
5703#endif
5704 break;
5705 }
5706
5707 /*
5708 * Check for an Empty Element from DTD definition
5709 */
5710 if ((info != NULL) && (info->empty)) {
5711 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5712 ctxt->sax->endElement(ctxt->userData, name);
5713 htmlnamePop(ctxt);
5714 }
5715
5716 if (ctxt->record_info)
5717 htmlNodeInfoPush(ctxt, &node_info);
5718
5719 ctxt->instate = XML_PARSER_CONTENT;
5720#ifdef DEBUG_PUSH
5721 xmlGenericError(xmlGenericErrorContext,
5722 "HPP: entering CONTENT\n");
5723#endif
5724 break;
5725 }
5726 case XML_PARSER_CONTENT: {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005727 xmlChar chr[2] = { 0, 0 };
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005728
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005729 /*
5730 * Handle preparsed entities and charRef
5731 */
5732 if (ctxt->token != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005733 chr[0] = (xmlChar) ctxt->token;
5734 htmlCheckParagraph(ctxt);
5735 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5736 ctxt->sax->characters(ctxt->userData, chr, 1);
5737 ctxt->token = 0;
5738 ctxt->checkIndex = 0;
5739 }
5740 if ((avail == 1) && (terminate)) {
5741 cur = in->cur[0];
5742 if ((cur != '<') && (cur != '&')) {
5743 if (ctxt->sax != NULL) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005744 chr[0] = cur;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005745 if (IS_BLANK_CH(cur)) {
5746 if (ctxt->keepBlanks) {
5747 if (ctxt->sax->characters != NULL)
5748 ctxt->sax->characters(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005749 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005750 } else {
5751 if (ctxt->sax->ignorableWhitespace != NULL)
5752 ctxt->sax->ignorableWhitespace(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005753 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005754 }
5755 } else {
5756 htmlCheckParagraph(ctxt);
5757 if (ctxt->sax->characters != NULL)
5758 ctxt->sax->characters(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005759 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005760 }
5761 }
5762 ctxt->token = 0;
5763 ctxt->checkIndex = 0;
5764 in->cur++;
5765 break;
5766 }
5767 }
5768 if (avail < 2)
5769 goto done;
5770 cur = in->cur[0];
5771 next = in->cur[1];
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005772 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5773 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5774 /*
5775 * Handle SCRIPT/STYLE separately
5776 */
5777 if (!terminate) {
5778 int idx;
5779 xmlChar val;
5780
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005781 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005782 if (idx < 0)
5783 goto done;
5784 val = in->cur[idx + 2];
5785 if (val == 0) /* bad cut of input */
5786 goto done;
5787 }
5788 htmlParseScript(ctxt);
5789 if ((cur == '<') && (next == '/')) {
5790 ctxt->instate = XML_PARSER_END_TAG;
5791 ctxt->checkIndex = 0;
5792#ifdef DEBUG_PUSH
5793 xmlGenericError(xmlGenericErrorContext,
5794 "HPP: entering END_TAG\n");
5795#endif
5796 break;
5797 }
5798 } else {
5799 /*
5800 * Sometimes DOCTYPE arrives in the middle of the document
5801 */
5802 if ((cur == '<') && (next == '!') &&
5803 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5804 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5805 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5806 (UPP(8) == 'E')) {
5807 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005808 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005809 goto done;
5810 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5811 "Misplaced DOCTYPE declaration\n",
5812 BAD_CAST "DOCTYPE" , NULL);
5813 htmlParseDocTypeDecl(ctxt);
5814 } else if ((cur == '<') && (next == '!') &&
5815 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Haibo Huangd75f3892021-01-05 21:34:50 -08005816 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005817 goto done;
5818#ifdef DEBUG_PUSH
5819 xmlGenericError(xmlGenericErrorContext,
5820 "HPP: Parsing Comment\n");
5821#endif
5822 htmlParseComment(ctxt);
5823 ctxt->instate = XML_PARSER_CONTENT;
5824 } else if ((cur == '<') && (next == '?')) {
5825 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005826 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005827 goto done;
5828#ifdef DEBUG_PUSH
5829 xmlGenericError(xmlGenericErrorContext,
5830 "HPP: Parsing PI\n");
5831#endif
5832 htmlParsePI(ctxt);
5833 ctxt->instate = XML_PARSER_CONTENT;
5834 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5835 goto done;
5836 } else if ((cur == '<') && (next == '/')) {
5837 ctxt->instate = XML_PARSER_END_TAG;
5838 ctxt->checkIndex = 0;
5839#ifdef DEBUG_PUSH
5840 xmlGenericError(xmlGenericErrorContext,
5841 "HPP: entering END_TAG\n");
5842#endif
5843 break;
5844 } else if (cur == '<') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005845 if ((!terminate) && (next == 0))
5846 goto done;
5847 /*
5848 * Only switch to START_TAG if the next character
5849 * starts a valid name. Otherwise, htmlParseStartTag
5850 * might return without consuming all characters
5851 * up to the final '>'.
5852 */
5853 if ((IS_ASCII_LETTER(next)) ||
5854 (next == '_') || (next == ':') || (next == '.')) {
5855 ctxt->instate = XML_PARSER_START_TAG;
5856 ctxt->checkIndex = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005857#ifdef DEBUG_PUSH
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005858 xmlGenericError(xmlGenericErrorContext,
5859 "HPP: entering START_TAG\n");
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005860#endif
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005861 } else {
5862 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
5863 "htmlParseTryOrFinish: "
5864 "invalid element name\n",
5865 NULL, NULL);
5866 htmlCheckParagraph(ctxt);
5867 if ((ctxt->sax != NULL) &&
5868 (ctxt->sax->characters != NULL))
5869 ctxt->sax->characters(ctxt->userData,
5870 in->cur, 1);
5871 NEXT;
5872 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005873 break;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005874 } else {
5875 /*
5876 * check that the text sequence is complete
5877 * before handing out the data to the parser
5878 * to avoid problems with erroneous end of
5879 * data detection.
5880 */
5881 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005882 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005883 goto done;
5884 ctxt->checkIndex = 0;
5885#ifdef DEBUG_PUSH
5886 xmlGenericError(xmlGenericErrorContext,
5887 "HPP: Parsing char data\n");
5888#endif
Haibo Huangca689272021-02-09 16:43:43 -08005889 while ((ctxt->instate != XML_PARSER_EOF) &&
5890 (cur != '<') && (in->cur < in->end)) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005891 if (cur == '&') {
5892 htmlParseReference(ctxt);
5893 } else {
5894 htmlParseCharData(ctxt);
5895 }
5896 cur = in->cur[0];
5897 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005898 }
5899 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005900
5901 break;
5902 }
5903 case XML_PARSER_END_TAG:
5904 if (avail < 2)
5905 goto done;
5906 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005907 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005908 goto done;
5909 htmlParseEndTag(ctxt);
5910 if (ctxt->nameNr == 0) {
5911 ctxt->instate = XML_PARSER_EPILOG;
5912 } else {
5913 ctxt->instate = XML_PARSER_CONTENT;
5914 }
5915 ctxt->checkIndex = 0;
5916#ifdef DEBUG_PUSH
5917 xmlGenericError(xmlGenericErrorContext,
5918 "HPP: entering CONTENT\n");
5919#endif
5920 break;
5921 case XML_PARSER_CDATA_SECTION:
5922 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5923 "HPP: internal error, state == CDATA\n",
5924 NULL, NULL);
5925 ctxt->instate = XML_PARSER_CONTENT;
5926 ctxt->checkIndex = 0;
5927#ifdef DEBUG_PUSH
5928 xmlGenericError(xmlGenericErrorContext,
5929 "HPP: entering CONTENT\n");
5930#endif
5931 break;
5932 case XML_PARSER_DTD:
5933 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5934 "HPP: internal error, state == DTD\n",
5935 NULL, NULL);
5936 ctxt->instate = XML_PARSER_CONTENT;
5937 ctxt->checkIndex = 0;
5938#ifdef DEBUG_PUSH
5939 xmlGenericError(xmlGenericErrorContext,
5940 "HPP: entering CONTENT\n");
5941#endif
5942 break;
5943 case XML_PARSER_COMMENT:
5944 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5945 "HPP: internal error, state == COMMENT\n",
5946 NULL, NULL);
5947 ctxt->instate = XML_PARSER_CONTENT;
5948 ctxt->checkIndex = 0;
5949#ifdef DEBUG_PUSH
5950 xmlGenericError(xmlGenericErrorContext,
5951 "HPP: entering CONTENT\n");
5952#endif
5953 break;
5954 case XML_PARSER_PI:
5955 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5956 "HPP: internal error, state == PI\n",
5957 NULL, NULL);
5958 ctxt->instate = XML_PARSER_CONTENT;
5959 ctxt->checkIndex = 0;
5960#ifdef DEBUG_PUSH
5961 xmlGenericError(xmlGenericErrorContext,
5962 "HPP: entering CONTENT\n");
5963#endif
5964 break;
5965 case XML_PARSER_ENTITY_DECL:
5966 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5967 "HPP: internal error, state == ENTITY_DECL\n",
5968 NULL, NULL);
5969 ctxt->instate = XML_PARSER_CONTENT;
5970 ctxt->checkIndex = 0;
5971#ifdef DEBUG_PUSH
5972 xmlGenericError(xmlGenericErrorContext,
5973 "HPP: entering CONTENT\n");
5974#endif
5975 break;
5976 case XML_PARSER_ENTITY_VALUE:
5977 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5978 "HPP: internal error, state == ENTITY_VALUE\n",
5979 NULL, NULL);
5980 ctxt->instate = XML_PARSER_CONTENT;
5981 ctxt->checkIndex = 0;
5982#ifdef DEBUG_PUSH
5983 xmlGenericError(xmlGenericErrorContext,
5984 "HPP: entering DTD\n");
5985#endif
5986 break;
5987 case XML_PARSER_ATTRIBUTE_VALUE:
5988 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5989 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5990 NULL, NULL);
5991 ctxt->instate = XML_PARSER_START_TAG;
5992 ctxt->checkIndex = 0;
5993#ifdef DEBUG_PUSH
5994 xmlGenericError(xmlGenericErrorContext,
5995 "HPP: entering START_TAG\n");
5996#endif
5997 break;
5998 case XML_PARSER_SYSTEM_LITERAL:
5999 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6000 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6001 NULL, NULL);
6002 ctxt->instate = XML_PARSER_CONTENT;
6003 ctxt->checkIndex = 0;
6004#ifdef DEBUG_PUSH
6005 xmlGenericError(xmlGenericErrorContext,
6006 "HPP: entering CONTENT\n");
6007#endif
6008 break;
6009 case XML_PARSER_IGNORE:
6010 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6011 "HPP: internal error, state == XML_PARSER_IGNORE\n",
6012 NULL, NULL);
6013 ctxt->instate = XML_PARSER_CONTENT;
6014 ctxt->checkIndex = 0;
6015#ifdef DEBUG_PUSH
6016 xmlGenericError(xmlGenericErrorContext,
6017 "HPP: entering CONTENT\n");
6018#endif
6019 break;
6020 case XML_PARSER_PUBLIC_LITERAL:
6021 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6022 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6023 NULL, NULL);
6024 ctxt->instate = XML_PARSER_CONTENT;
6025 ctxt->checkIndex = 0;
6026#ifdef DEBUG_PUSH
6027 xmlGenericError(xmlGenericErrorContext,
6028 "HPP: entering CONTENT\n");
6029#endif
6030 break;
6031
6032 }
6033 }
6034done:
6035 if ((avail == 0) && (terminate)) {
6036 htmlAutoCloseOnEnd(ctxt);
6037 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6038 /*
6039 * SAX: end of the document processing.
6040 */
6041 ctxt->instate = XML_PARSER_EOF;
6042 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6043 ctxt->sax->endDocument(ctxt->userData);
6044 }
6045 }
6046 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6047 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6048 (ctxt->instate == XML_PARSER_EPILOG))) {
6049 xmlDtdPtr dtd;
6050 dtd = xmlGetIntSubset(ctxt->myDoc);
6051 if (dtd == NULL)
6052 ctxt->myDoc->intSubset =
6053 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6054 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6055 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6056 }
6057#ifdef DEBUG_PUSH
6058 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6059#endif
6060 return(ret);
6061}
6062
6063/**
6064 * htmlParseChunk:
6065 * @ctxt: an HTML parser context
6066 * @chunk: an char array
6067 * @size: the size in byte of the chunk
6068 * @terminate: last chunk indicator
6069 *
6070 * Parse a Chunk of memory
6071 *
6072 * Returns zero if no error, the xmlParserErrors otherwise.
6073 */
6074int
6075htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6076 int terminate) {
6077 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6078 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6079 "htmlParseChunk: context error\n", NULL, NULL);
6080 return(XML_ERR_INTERNAL_ERROR);
6081 }
6082 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6083 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6084 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6085 size_t cur = ctxt->input->cur - ctxt->input->base;
6086 int res;
6087
6088 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006089 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006090 if (res < 0) {
6091 ctxt->errNo = XML_PARSER_EOF;
6092 ctxt->disableSAX = 1;
6093 return (XML_PARSER_EOF);
6094 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006095#ifdef DEBUG_PUSH
6096 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6097#endif
6098
6099#if 0
6100 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6101 htmlParseTryOrFinish(ctxt, terminate);
6102#endif
6103 } else if (ctxt->instate != XML_PARSER_EOF) {
6104 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6105 xmlParserInputBufferPtr in = ctxt->input->buf;
6106 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6107 (in->raw != NULL)) {
6108 int nbchars;
6109 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6110 size_t current = ctxt->input->cur - ctxt->input->base;
6111
6112 nbchars = xmlCharEncInput(in, terminate);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006113 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006114 if (nbchars < 0) {
6115 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6116 "encoder error\n", NULL, NULL);
6117 return(XML_ERR_INVALID_ENCODING);
6118 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006119 }
6120 }
6121 }
6122 htmlParseTryOrFinish(ctxt, terminate);
6123 if (terminate) {
6124 if ((ctxt->instate != XML_PARSER_EOF) &&
6125 (ctxt->instate != XML_PARSER_EPILOG) &&
6126 (ctxt->instate != XML_PARSER_MISC)) {
6127 ctxt->errNo = XML_ERR_DOCUMENT_END;
6128 ctxt->wellFormed = 0;
6129 }
6130 if (ctxt->instate != XML_PARSER_EOF) {
6131 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6132 ctxt->sax->endDocument(ctxt->userData);
6133 }
6134 ctxt->instate = XML_PARSER_EOF;
6135 }
6136 return((xmlParserErrors) ctxt->errNo);
6137}
6138
6139/************************************************************************
6140 * *
6141 * User entry points *
6142 * *
6143 ************************************************************************/
6144
6145/**
6146 * htmlCreatePushParserCtxt:
6147 * @sax: a SAX handler
6148 * @user_data: The user data returned on SAX callbacks
6149 * @chunk: a pointer to an array of chars
6150 * @size: number of chars in the array
6151 * @filename: an optional file name or URI
6152 * @enc: an optional encoding
6153 *
6154 * Create a parser context for using the HTML parser in push mode
6155 * The value of @filename is used for fetching external entities
6156 * and error/warning reports.
6157 *
6158 * Returns the new parser context or NULL
6159 */
6160htmlParserCtxtPtr
6161htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6162 const char *chunk, int size, const char *filename,
6163 xmlCharEncoding enc) {
6164 htmlParserCtxtPtr ctxt;
6165 htmlParserInputPtr inputStream;
6166 xmlParserInputBufferPtr buf;
6167
6168 xmlInitParser();
6169
6170 buf = xmlAllocParserInputBuffer(enc);
6171 if (buf == NULL) return(NULL);
6172
6173 ctxt = htmlNewParserCtxt();
6174 if (ctxt == NULL) {
6175 xmlFreeParserInputBuffer(buf);
6176 return(NULL);
6177 }
6178 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6179 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6180 if (sax != NULL) {
6181 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6182 xmlFree(ctxt->sax);
6183 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6184 if (ctxt->sax == NULL) {
6185 xmlFree(buf);
6186 xmlFree(ctxt);
6187 return(NULL);
6188 }
6189 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6190 if (user_data != NULL)
6191 ctxt->userData = user_data;
6192 }
6193 if (filename == NULL) {
6194 ctxt->directory = NULL;
6195 } else {
6196 ctxt->directory = xmlParserGetDirectory(filename);
6197 }
6198
6199 inputStream = htmlNewInputStream(ctxt);
6200 if (inputStream == NULL) {
6201 xmlFreeParserCtxt(ctxt);
6202 xmlFree(buf);
6203 return(NULL);
6204 }
6205
6206 if (filename == NULL)
6207 inputStream->filename = NULL;
6208 else
6209 inputStream->filename = (char *)
6210 xmlCanonicPath((const xmlChar *) filename);
6211 inputStream->buf = buf;
6212 xmlBufResetInput(buf->buffer, inputStream);
6213
6214 inputPush(ctxt, inputStream);
6215
6216 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6217 (ctxt->input->buf != NULL)) {
6218 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6219 size_t cur = ctxt->input->cur - ctxt->input->base;
6220
6221 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6222
6223 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6224#ifdef DEBUG_PUSH
6225 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6226#endif
6227 }
6228 ctxt->progressive = 1;
6229
6230 return(ctxt);
6231}
6232#endif /* LIBXML_PUSH_ENABLED */
6233
6234/**
6235 * htmlSAXParseDoc:
6236 * @cur: a pointer to an array of xmlChar
6237 * @encoding: a free form C string describing the HTML document encoding, or NULL
6238 * @sax: the SAX handler block
6239 * @userData: if using SAX, this pointer will be provided on callbacks.
6240 *
6241 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6242 * to handle parse events. If sax is NULL, fallback to the default DOM
6243 * behavior and return a tree.
6244 *
6245 * Returns the resulting document tree unless SAX is NULL or the document is
6246 * not well formed.
6247 */
6248
6249htmlDocPtr
6250htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6251 htmlSAXHandlerPtr sax, void *userData) {
6252 htmlDocPtr ret;
6253 htmlParserCtxtPtr ctxt;
6254
6255 xmlInitParser();
6256
6257 if (cur == NULL) return(NULL);
6258
6259
6260 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6261 if (ctxt == NULL) return(NULL);
6262 if (sax != NULL) {
6263 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6264 ctxt->sax = sax;
6265 ctxt->userData = userData;
6266 }
6267
6268 htmlParseDocument(ctxt);
6269 ret = ctxt->myDoc;
6270 if (sax != NULL) {
6271 ctxt->sax = NULL;
6272 ctxt->userData = NULL;
6273 }
6274 htmlFreeParserCtxt(ctxt);
6275
6276 return(ret);
6277}
6278
6279/**
6280 * htmlParseDoc:
6281 * @cur: a pointer to an array of xmlChar
6282 * @encoding: a free form C string describing the HTML document encoding, or NULL
6283 *
6284 * parse an HTML in-memory document and build a tree.
6285 *
6286 * Returns the resulting document tree
6287 */
6288
6289htmlDocPtr
6290htmlParseDoc(const xmlChar *cur, const char *encoding) {
6291 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6292}
6293
6294
6295/**
6296 * htmlCreateFileParserCtxt:
6297 * @filename: the filename
6298 * @encoding: a free form C string describing the HTML document encoding, or NULL
6299 *
6300 * Create a parser context for a file content.
6301 * Automatic support for ZLIB/Compress compressed document is provided
6302 * by default if found at compile-time.
6303 *
6304 * Returns the new parser context or NULL
6305 */
6306htmlParserCtxtPtr
6307htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6308{
6309 htmlParserCtxtPtr ctxt;
6310 htmlParserInputPtr inputStream;
6311 char *canonicFilename;
6312 /* htmlCharEncoding enc; */
6313 xmlChar *content, *content_line = (xmlChar *) "charset=";
6314
6315 if (filename == NULL)
6316 return(NULL);
6317
6318 ctxt = htmlNewParserCtxt();
6319 if (ctxt == NULL) {
6320 return(NULL);
6321 }
6322 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6323 if (canonicFilename == NULL) {
6324#ifdef LIBXML_SAX1_ENABLED
6325 if (xmlDefaultSAXHandler.error != NULL) {
6326 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6327 }
6328#endif
6329 xmlFreeParserCtxt(ctxt);
6330 return(NULL);
6331 }
6332
6333 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6334 xmlFree(canonicFilename);
6335 if (inputStream == NULL) {
6336 xmlFreeParserCtxt(ctxt);
6337 return(NULL);
6338 }
6339
6340 inputPush(ctxt, inputStream);
6341
6342 /* set encoding */
6343 if (encoding) {
6344 size_t l = strlen(encoding);
6345
6346 if (l < 1000) {
6347 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6348 if (content) {
6349 strcpy ((char *)content, (char *)content_line);
6350 strcat ((char *)content, (char *)encoding);
6351 htmlCheckEncoding (ctxt, content);
6352 xmlFree (content);
6353 }
6354 }
6355 }
6356
6357 return(ctxt);
6358}
6359
6360/**
6361 * htmlSAXParseFile:
6362 * @filename: the filename
6363 * @encoding: a free form C string describing the HTML document encoding, or NULL
6364 * @sax: the SAX handler block
6365 * @userData: if using SAX, this pointer will be provided on callbacks.
6366 *
6367 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6368 * compressed document is provided by default if found at compile-time.
6369 * It use the given SAX function block to handle the parsing callback.
6370 * If sax is NULL, fallback to the default DOM tree building routines.
6371 *
6372 * Returns the resulting document tree unless SAX is NULL or the document is
6373 * not well formed.
6374 */
6375
6376htmlDocPtr
6377htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6378 void *userData) {
6379 htmlDocPtr ret;
6380 htmlParserCtxtPtr ctxt;
6381 htmlSAXHandlerPtr oldsax = NULL;
6382
6383 xmlInitParser();
6384
6385 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6386 if (ctxt == NULL) return(NULL);
6387 if (sax != NULL) {
6388 oldsax = ctxt->sax;
6389 ctxt->sax = sax;
6390 ctxt->userData = userData;
6391 }
6392
6393 htmlParseDocument(ctxt);
6394
6395 ret = ctxt->myDoc;
6396 if (sax != NULL) {
6397 ctxt->sax = oldsax;
6398 ctxt->userData = NULL;
6399 }
6400 htmlFreeParserCtxt(ctxt);
6401
6402 return(ret);
6403}
6404
6405/**
6406 * htmlParseFile:
6407 * @filename: the filename
6408 * @encoding: a free form C string describing the HTML document encoding, or NULL
6409 *
6410 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6411 * compressed document is provided by default if found at compile-time.
6412 *
6413 * Returns the resulting document tree
6414 */
6415
6416htmlDocPtr
6417htmlParseFile(const char *filename, const char *encoding) {
6418 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6419}
6420
6421/**
6422 * htmlHandleOmittedElem:
6423 * @val: int 0 or 1
6424 *
6425 * Set and return the previous value for handling HTML omitted tags.
6426 *
6427 * Returns the last value for 0 for no handling, 1 for auto insertion.
6428 */
6429
6430int
6431htmlHandleOmittedElem(int val) {
6432 int old = htmlOmittedDefaultValue;
6433
6434 htmlOmittedDefaultValue = val;
6435 return(old);
6436}
6437
6438/**
6439 * htmlElementAllowedHere:
6440 * @parent: HTML parent element
6441 * @elt: HTML element
6442 *
6443 * Checks whether an HTML element may be a direct child of a parent element.
6444 * Note - doesn't check for deprecated elements
6445 *
6446 * Returns 1 if allowed; 0 otherwise.
6447 */
6448int
6449htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6450 const char** p ;
6451
6452 if ( ! elt || ! parent || ! parent->subelts )
6453 return 0 ;
6454
6455 for ( p = parent->subelts; *p; ++p )
6456 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6457 return 1 ;
6458
6459 return 0 ;
6460}
6461/**
6462 * htmlElementStatusHere:
6463 * @parent: HTML parent element
6464 * @elt: HTML element
6465 *
6466 * Checks whether an HTML element may be a direct child of a parent element.
6467 * and if so whether it is valid or deprecated.
6468 *
6469 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6470 */
6471htmlStatus
6472htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6473 if ( ! parent || ! elt )
6474 return HTML_INVALID ;
6475 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6476 return HTML_INVALID ;
6477
6478 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6479}
6480/**
6481 * htmlAttrAllowed:
6482 * @elt: HTML element
6483 * @attr: HTML attribute
6484 * @legacy: whether to allow deprecated attributes
6485 *
6486 * Checks whether an attribute is valid for an element
6487 * Has full knowledge of Required and Deprecated attributes
6488 *
6489 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6490 */
6491htmlStatus
6492htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6493 const char** p ;
6494
6495 if ( !elt || ! attr )
6496 return HTML_INVALID ;
6497
6498 if ( elt->attrs_req )
6499 for ( p = elt->attrs_req; *p; ++p)
6500 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6501 return HTML_REQUIRED ;
6502
6503 if ( elt->attrs_opt )
6504 for ( p = elt->attrs_opt; *p; ++p)
6505 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6506 return HTML_VALID ;
6507
6508 if ( legacy && elt->attrs_depr )
6509 for ( p = elt->attrs_depr; *p; ++p)
6510 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6511 return HTML_DEPRECATED ;
6512
6513 return HTML_INVALID ;
6514}
6515/**
6516 * htmlNodeStatus:
6517 * @node: an htmlNodePtr in a tree
6518 * @legacy: whether to allow deprecated elements (YES is faster here
6519 * for Element nodes)
6520 *
6521 * Checks whether the tree node is valid. Experimental (the author
6522 * only uses the HTML enhancements in a SAX parser)
6523 *
6524 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6525 * legacy allowed) or htmlElementStatusHere (otherwise).
6526 * for Attribute nodes, a return from htmlAttrAllowed
6527 * for other nodes, HTML_NA (no checks performed)
6528 */
6529htmlStatus
6530htmlNodeStatus(const htmlNodePtr node, int legacy) {
6531 if ( ! node )
6532 return HTML_INVALID ;
6533
6534 switch ( node->type ) {
6535 case XML_ELEMENT_NODE:
6536 return legacy
6537 ? ( htmlElementAllowedHere (
6538 htmlTagLookup(node->parent->name) , node->name
6539 ) ? HTML_VALID : HTML_INVALID )
6540 : htmlElementStatusHere(
6541 htmlTagLookup(node->parent->name) ,
6542 htmlTagLookup(node->name) )
6543 ;
6544 case XML_ATTRIBUTE_NODE:
6545 return htmlAttrAllowed(
6546 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6547 default: return HTML_NA ;
6548 }
6549}
6550/************************************************************************
6551 * *
6552 * New set (2.6.0) of simpler and more flexible APIs *
6553 * *
6554 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A variable named dict must be visible where the macro
 * is used.  @str is evaluated more than once, so it must be free of side
 * effects.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can safely be used in an unbraced if/else; callers
 * terminate it with a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6566
6567/**
6568 * htmlCtxtReset:
6569 * @ctxt: an HTML parser context
6570 *
6571 * Reset a parser context
6572 */
6573void
6574htmlCtxtReset(htmlParserCtxtPtr ctxt)
6575{
6576 xmlParserInputPtr input;
6577 xmlDictPtr dict;
6578
6579 if (ctxt == NULL)
6580 return;
6581
6582 xmlInitParser();
6583 dict = ctxt->dict;
6584
6585 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6586 xmlFreeInputStream(input);
6587 }
6588 ctxt->inputNr = 0;
6589 ctxt->input = NULL;
6590
6591 ctxt->spaceNr = 0;
6592 if (ctxt->spaceTab != NULL) {
6593 ctxt->spaceTab[0] = -1;
6594 ctxt->space = &ctxt->spaceTab[0];
6595 } else {
6596 ctxt->space = NULL;
6597 }
6598
6599
6600 ctxt->nodeNr = 0;
6601 ctxt->node = NULL;
6602
6603 ctxt->nameNr = 0;
6604 ctxt->name = NULL;
6605
6606 DICT_FREE(ctxt->version);
6607 ctxt->version = NULL;
6608 DICT_FREE(ctxt->encoding);
6609 ctxt->encoding = NULL;
6610 DICT_FREE(ctxt->directory);
6611 ctxt->directory = NULL;
6612 DICT_FREE(ctxt->extSubURI);
6613 ctxt->extSubURI = NULL;
6614 DICT_FREE(ctxt->extSubSystem);
6615 ctxt->extSubSystem = NULL;
6616 if (ctxt->myDoc != NULL)
6617 xmlFreeDoc(ctxt->myDoc);
6618 ctxt->myDoc = NULL;
6619
6620 ctxt->standalone = -1;
6621 ctxt->hasExternalSubset = 0;
6622 ctxt->hasPErefs = 0;
6623 ctxt->html = 1;
6624 ctxt->external = 0;
6625 ctxt->instate = XML_PARSER_START;
6626 ctxt->token = 0;
6627
6628 ctxt->wellFormed = 1;
6629 ctxt->nsWellFormed = 1;
6630 ctxt->disableSAX = 0;
6631 ctxt->valid = 1;
6632 ctxt->vctxt.userData = ctxt;
6633 ctxt->vctxt.error = xmlParserValidityError;
6634 ctxt->vctxt.warning = xmlParserValidityWarning;
6635 ctxt->record_info = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006636 ctxt->checkIndex = 0;
6637 ctxt->inSubset = 0;
6638 ctxt->errNo = XML_ERR_OK;
6639 ctxt->depth = 0;
6640 ctxt->charset = XML_CHAR_ENCODING_NONE;
6641 ctxt->catalogs = NULL;
6642 xmlInitNodeInfoSeq(&ctxt->node_seq);
6643
6644 if (ctxt->attsDefault != NULL) {
6645 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6646 ctxt->attsDefault = NULL;
6647 }
6648 if (ctxt->attsSpecial != NULL) {
6649 xmlHashFree(ctxt->attsSpecial, NULL);
6650 ctxt->attsSpecial = NULL;
6651 }
6652}
6653
6654/**
6655 * htmlCtxtUseOptions:
6656 * @ctxt: an HTML parser context
6657 * @options: a combination of htmlParserOption(s)
6658 *
6659 * Applies the options to the parser context
6660 *
6661 * Returns 0 in case of success, the set of unknown or unimplemented options
6662 * in case of error.
6663 */
6664int
6665htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6666{
6667 if (ctxt == NULL)
6668 return(-1);
6669
6670 if (options & HTML_PARSE_NOWARNING) {
6671 ctxt->sax->warning = NULL;
6672 ctxt->vctxt.warning = NULL;
6673 options -= XML_PARSE_NOWARNING;
6674 ctxt->options |= XML_PARSE_NOWARNING;
6675 }
6676 if (options & HTML_PARSE_NOERROR) {
6677 ctxt->sax->error = NULL;
6678 ctxt->vctxt.error = NULL;
6679 ctxt->sax->fatalError = NULL;
6680 options -= XML_PARSE_NOERROR;
6681 ctxt->options |= XML_PARSE_NOERROR;
6682 }
6683 if (options & HTML_PARSE_PEDANTIC) {
6684 ctxt->pedantic = 1;
6685 options -= XML_PARSE_PEDANTIC;
6686 ctxt->options |= XML_PARSE_PEDANTIC;
6687 } else
6688 ctxt->pedantic = 0;
6689 if (options & XML_PARSE_NOBLANKS) {
6690 ctxt->keepBlanks = 0;
6691 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6692 options -= XML_PARSE_NOBLANKS;
6693 ctxt->options |= XML_PARSE_NOBLANKS;
6694 } else
6695 ctxt->keepBlanks = 1;
6696 if (options & HTML_PARSE_RECOVER) {
6697 ctxt->recovery = 1;
6698 options -= HTML_PARSE_RECOVER;
6699 } else
6700 ctxt->recovery = 0;
6701 if (options & HTML_PARSE_COMPACT) {
6702 ctxt->options |= HTML_PARSE_COMPACT;
6703 options -= HTML_PARSE_COMPACT;
6704 }
6705 if (options & XML_PARSE_HUGE) {
6706 ctxt->options |= XML_PARSE_HUGE;
6707 options -= XML_PARSE_HUGE;
6708 }
6709 if (options & HTML_PARSE_NODEFDTD) {
6710 ctxt->options |= HTML_PARSE_NODEFDTD;
6711 options -= HTML_PARSE_NODEFDTD;
6712 }
6713 if (options & HTML_PARSE_IGNORE_ENC) {
6714 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6715 options -= HTML_PARSE_IGNORE_ENC;
6716 }
6717 if (options & HTML_PARSE_NOIMPLIED) {
6718 ctxt->options |= HTML_PARSE_NOIMPLIED;
6719 options -= HTML_PARSE_NOIMPLIED;
6720 }
6721 ctxt->dictNames = 0;
6722 return (options);
6723}
6724
6725/**
6726 * htmlDoRead:
6727 * @ctxt: an HTML parser context
6728 * @URL: the base URL to use for the document
6729 * @encoding: the document encoding, or NULL
6730 * @options: a combination of htmlParserOption(s)
6731 * @reuse: keep the context for reuse
6732 *
6733 * Common front-end for the htmlRead functions
6734 *
6735 * Returns the resulting document tree or NULL
6736 */
6737static htmlDocPtr
6738htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6739 int options, int reuse)
6740{
6741 htmlDocPtr ret;
6742
6743 htmlCtxtUseOptions(ctxt, options);
6744 ctxt->html = 1;
6745 if (encoding != NULL) {
6746 xmlCharEncodingHandlerPtr hdlr;
6747
6748 hdlr = xmlFindCharEncodingHandler(encoding);
6749 if (hdlr != NULL) {
6750 xmlSwitchToEncoding(ctxt, hdlr);
6751 if (ctxt->input->encoding != NULL)
6752 xmlFree((xmlChar *) ctxt->input->encoding);
6753 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6754 }
6755 }
6756 if ((URL != NULL) && (ctxt->input != NULL) &&
6757 (ctxt->input->filename == NULL))
6758 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6759 htmlParseDocument(ctxt);
6760 ret = ctxt->myDoc;
6761 ctxt->myDoc = NULL;
6762 if (!reuse) {
6763 if ((ctxt->dictNames) &&
6764 (ret != NULL) &&
6765 (ret->dict == ctxt->dict))
6766 ctxt->dict = NULL;
6767 xmlFreeParserCtxt(ctxt);
6768 }
6769 return (ret);
6770}
6771
6772/**
6773 * htmlReadDoc:
6774 * @cur: a pointer to a zero terminated string
6775 * @URL: the base URL to use for the document
6776 * @encoding: the document encoding, or NULL
6777 * @options: a combination of htmlParserOption(s)
6778 *
6779 * parse an XML in-memory document and build a tree.
6780 *
6781 * Returns the resulting document tree
6782 */
6783htmlDocPtr
6784htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6785{
6786 htmlParserCtxtPtr ctxt;
6787
6788 if (cur == NULL)
6789 return (NULL);
6790
6791 xmlInitParser();
6792 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6793 if (ctxt == NULL)
6794 return (NULL);
6795 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6796}
6797
6798/**
6799 * htmlReadFile:
6800 * @filename: a file or URL
6801 * @encoding: the document encoding, or NULL
6802 * @options: a combination of htmlParserOption(s)
6803 *
6804 * parse an XML file from the filesystem or the network.
6805 *
6806 * Returns the resulting document tree
6807 */
6808htmlDocPtr
6809htmlReadFile(const char *filename, const char *encoding, int options)
6810{
6811 htmlParserCtxtPtr ctxt;
6812
6813 xmlInitParser();
6814 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6815 if (ctxt == NULL)
6816 return (NULL);
6817 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6818}
6819
6820/**
6821 * htmlReadMemory:
6822 * @buffer: a pointer to a char array
6823 * @size: the size of the array
6824 * @URL: the base URL to use for the document
6825 * @encoding: the document encoding, or NULL
6826 * @options: a combination of htmlParserOption(s)
6827 *
6828 * parse an XML in-memory document and build a tree.
6829 *
6830 * Returns the resulting document tree
6831 */
6832htmlDocPtr
6833htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6834{
6835 htmlParserCtxtPtr ctxt;
6836
6837 xmlInitParser();
6838 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6839 if (ctxt == NULL)
6840 return (NULL);
6841 htmlDefaultSAXHandlerInit();
6842 if (ctxt->sax != NULL)
6843 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6844 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6845}
6846
6847/**
6848 * htmlReadFd:
6849 * @fd: an open file descriptor
6850 * @URL: the base URL to use for the document
6851 * @encoding: the document encoding, or NULL
6852 * @options: a combination of htmlParserOption(s)
6853 *
6854 * parse an XML from a file descriptor and build a tree.
6855 *
6856 * Returns the resulting document tree
6857 */
6858htmlDocPtr
6859htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6860{
6861 htmlParserCtxtPtr ctxt;
6862 xmlParserInputBufferPtr input;
6863 xmlParserInputPtr stream;
6864
6865 if (fd < 0)
6866 return (NULL);
6867 xmlInitParser();
6868
6869 xmlInitParser();
6870 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6871 if (input == NULL)
6872 return (NULL);
6873 ctxt = xmlNewParserCtxt();
6874 if (ctxt == NULL) {
6875 xmlFreeParserInputBuffer(input);
6876 return (NULL);
6877 }
6878 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6879 if (stream == NULL) {
6880 xmlFreeParserInputBuffer(input);
6881 xmlFreeParserCtxt(ctxt);
6882 return (NULL);
6883 }
6884 inputPush(ctxt, stream);
6885 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6886}
6887
6888/**
6889 * htmlReadIO:
6890 * @ioread: an I/O read function
6891 * @ioclose: an I/O close function
6892 * @ioctx: an I/O handler
6893 * @URL: the base URL to use for the document
6894 * @encoding: the document encoding, or NULL
6895 * @options: a combination of htmlParserOption(s)
6896 *
6897 * parse an HTML document from I/O functions and source and build a tree.
6898 *
6899 * Returns the resulting document tree
6900 */
6901htmlDocPtr
6902htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6903 void *ioctx, const char *URL, const char *encoding, int options)
6904{
6905 htmlParserCtxtPtr ctxt;
6906 xmlParserInputBufferPtr input;
6907 xmlParserInputPtr stream;
6908
6909 if (ioread == NULL)
6910 return (NULL);
6911 xmlInitParser();
6912
6913 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6914 XML_CHAR_ENCODING_NONE);
6915 if (input == NULL) {
6916 if (ioclose != NULL)
6917 ioclose(ioctx);
6918 return (NULL);
6919 }
6920 ctxt = htmlNewParserCtxt();
6921 if (ctxt == NULL) {
6922 xmlFreeParserInputBuffer(input);
6923 return (NULL);
6924 }
6925 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6926 if (stream == NULL) {
6927 xmlFreeParserInputBuffer(input);
6928 xmlFreeParserCtxt(ctxt);
6929 return (NULL);
6930 }
6931 inputPush(ctxt, stream);
6932 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6933}
6934
6935/**
6936 * htmlCtxtReadDoc:
6937 * @ctxt: an HTML parser context
6938 * @cur: a pointer to a zero terminated string
6939 * @URL: the base URL to use for the document
6940 * @encoding: the document encoding, or NULL
6941 * @options: a combination of htmlParserOption(s)
6942 *
6943 * parse an XML in-memory document and build a tree.
6944 * This reuses the existing @ctxt parser context
6945 *
6946 * Returns the resulting document tree
6947 */
6948htmlDocPtr
6949htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6950 const char *URL, const char *encoding, int options)
6951{
6952 xmlParserInputPtr stream;
6953
6954 if (cur == NULL)
6955 return (NULL);
6956 if (ctxt == NULL)
6957 return (NULL);
6958 xmlInitParser();
6959
6960 htmlCtxtReset(ctxt);
6961
6962 stream = xmlNewStringInputStream(ctxt, cur);
6963 if (stream == NULL) {
6964 return (NULL);
6965 }
6966 inputPush(ctxt, stream);
6967 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6968}
6969
6970/**
6971 * htmlCtxtReadFile:
6972 * @ctxt: an HTML parser context
6973 * @filename: a file or URL
6974 * @encoding: the document encoding, or NULL
6975 * @options: a combination of htmlParserOption(s)
6976 *
6977 * parse an XML file from the filesystem or the network.
6978 * This reuses the existing @ctxt parser context
6979 *
6980 * Returns the resulting document tree
6981 */
6982htmlDocPtr
6983htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6984 const char *encoding, int options)
6985{
6986 xmlParserInputPtr stream;
6987
6988 if (filename == NULL)
6989 return (NULL);
6990 if (ctxt == NULL)
6991 return (NULL);
6992 xmlInitParser();
6993
6994 htmlCtxtReset(ctxt);
6995
6996 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6997 if (stream == NULL) {
6998 return (NULL);
6999 }
7000 inputPush(ctxt, stream);
7001 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7002}
7003
7004/**
7005 * htmlCtxtReadMemory:
7006 * @ctxt: an HTML parser context
7007 * @buffer: a pointer to a char array
7008 * @size: the size of the array
7009 * @URL: the base URL to use for the document
7010 * @encoding: the document encoding, or NULL
7011 * @options: a combination of htmlParserOption(s)
7012 *
7013 * parse an XML in-memory document and build a tree.
7014 * This reuses the existing @ctxt parser context
7015 *
7016 * Returns the resulting document tree
7017 */
7018htmlDocPtr
7019htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7020 const char *URL, const char *encoding, int options)
7021{
7022 xmlParserInputBufferPtr input;
7023 xmlParserInputPtr stream;
7024
7025 if (ctxt == NULL)
7026 return (NULL);
7027 if (buffer == NULL)
7028 return (NULL);
7029 xmlInitParser();
7030
7031 htmlCtxtReset(ctxt);
7032
7033 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7034 if (input == NULL) {
7035 return(NULL);
7036 }
7037
7038 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7039 if (stream == NULL) {
7040 xmlFreeParserInputBuffer(input);
7041 return(NULL);
7042 }
7043
7044 inputPush(ctxt, stream);
7045 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7046}
7047
7048/**
7049 * htmlCtxtReadFd:
7050 * @ctxt: an HTML parser context
7051 * @fd: an open file descriptor
7052 * @URL: the base URL to use for the document
7053 * @encoding: the document encoding, or NULL
7054 * @options: a combination of htmlParserOption(s)
7055 *
7056 * parse an XML from a file descriptor and build a tree.
7057 * This reuses the existing @ctxt parser context
7058 *
7059 * Returns the resulting document tree
7060 */
7061htmlDocPtr
7062htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7063 const char *URL, const char *encoding, int options)
7064{
7065 xmlParserInputBufferPtr input;
7066 xmlParserInputPtr stream;
7067
7068 if (fd < 0)
7069 return (NULL);
7070 if (ctxt == NULL)
7071 return (NULL);
7072 xmlInitParser();
7073
7074 htmlCtxtReset(ctxt);
7075
7076
7077 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7078 if (input == NULL)
7079 return (NULL);
7080 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7081 if (stream == NULL) {
7082 xmlFreeParserInputBuffer(input);
7083 return (NULL);
7084 }
7085 inputPush(ctxt, stream);
7086 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7087}
7088
7089/**
7090 * htmlCtxtReadIO:
7091 * @ctxt: an HTML parser context
7092 * @ioread: an I/O read function
7093 * @ioclose: an I/O close function
7094 * @ioctx: an I/O handler
7095 * @URL: the base URL to use for the document
7096 * @encoding: the document encoding, or NULL
7097 * @options: a combination of htmlParserOption(s)
7098 *
7099 * parse an HTML document from I/O functions and source and build a tree.
7100 * This reuses the existing @ctxt parser context
7101 *
7102 * Returns the resulting document tree
7103 */
7104htmlDocPtr
7105htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7106 xmlInputCloseCallback ioclose, void *ioctx,
7107 const char *URL,
7108 const char *encoding, int options)
7109{
7110 xmlParserInputBufferPtr input;
7111 xmlParserInputPtr stream;
7112
7113 if (ioread == NULL)
7114 return (NULL);
7115 if (ctxt == NULL)
7116 return (NULL);
7117 xmlInitParser();
7118
7119 htmlCtxtReset(ctxt);
7120
7121 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7122 XML_CHAR_ENCODING_NONE);
7123 if (input == NULL) {
7124 if (ioclose != NULL)
7125 ioclose(ioctx);
7126 return (NULL);
7127 }
7128 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7129 if (stream == NULL) {
7130 xmlFreeParserInputBuffer(input);
7131 return (NULL);
7132 }
7133 inputPush(ctxt, stream);
7134 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7135}
7136
7137#define bottom_HTMLparser
7138#include "elfgcchack.h"
7139#endif /* LIBXML_HTML_ENABLED */