blob: 14cc56fa6de7c592c4b521c74e3ae6ca0e8f3e4c [file] [log] [blame]
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef LIBXML_ZLIB_ENABLED
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
39#include <libxml/HTMLtree.h>
40#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
44#include <libxml/globals.h>
45#include <libxml/uri.h>
46
47#include "buf.h"
48#include "enc.h"
49
50#define HTML_MAX_NAMELEN 1000
51#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52#define HTML_PARSER_BUFFER_SIZE 100
53
54/* #define DEBUG */
55/* #define DEBUG_PUSH */
56
57static int htmlOmittedDefaultValue = 1;
58
59xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
61static void htmlParseComment(htmlParserCtxtPtr ctxt);
62
63/************************************************************************
64 * *
65 * Some factorized error routines *
66 * *
67 ************************************************************************/
68
69/**
70 * htmlErrMemory:
71 * @ctxt: an HTML parser context
Haibo Huangcfd91dc2020-07-30 23:01:33 -070072 * @extra: extra information
Elliott Hughes7fbecab2019-01-10 16:42:03 -080073 *
74 * Handle a redefinition of attribute error
75 */
76static void
77htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78{
79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
82 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96}
97
98/**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
108static void LIBXML_ATTR_FORMAT(3,0)
109htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111{
112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
115 if (ctxt != NULL)
116 ctxt->errNo = error;
117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
124}
125
126/**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
135static void LIBXML_ATTR_FORMAT(3,0)
136htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138{
139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
142 if (ctxt != NULL)
143 ctxt->errNo = error;
144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
149}
150
151/************************************************************************
152 * *
153 * Parser stacks related functions and macros *
154 * *
155 ************************************************************************/
156
157/**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
165 */
166static int
167htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168{
169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
175 ctxt->nameTab = (const xmlChar * *)
176 xmlRealloc((xmlChar * *)ctxt->nameTab,
177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
180 htmlErrMemory(ctxt, NULL);
181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187}
188/**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
196static const xmlChar *
197htmlnamePop(htmlParserCtxtPtr ctxt)
198{
199 const xmlChar *ret;
200
201 if (ctxt->nameNr <= 0)
202 return (NULL);
203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
205 return (NULL);
206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
211 ctxt->nameTab[ctxt->nameNr] = NULL;
212 return (ret);
213}
214
215/**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224static int
225htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226{
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243}
244
245/**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253static htmlParserNodeInfo *
254htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255{
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266}
267
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *
 * Statement-like macros are wrapped in do { } while (0) and expression
 * macros are fully parenthesized so they compose safely at the use site.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)
#define BASE_PTR (ctxt->input->base)

#define SHRINK do {                                                     \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&     \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))        \
        xmlParserInputShrink(ctxt->input);                              \
  } while (0)

#define GROW do {                                                       \
    if ((ctxt->progressive == 0) &&                                     \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))            \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                   \
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l);                           \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
348
349/**
350 * htmlFindEncoding:
351 * @the HTML parser context
352 *
353 * Ty to find and encoding in the current data available in the input
354 * buffer this is needed to try to switch to the proper encoding when
355 * one face a character error.
356 * That's an heuristic, since it's operating outside of parsing it could
357 * try to use a meta which had been commented out, that's the reason it
358 * should only be used in case of error, not as a default.
359 *
360 * Returns an encoding string or NULL if not found, the string need to
361 * be freed
362 */
363static xmlChar *
364htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365 const xmlChar *start, *cur, *end;
366
367 if ((ctxt == NULL) || (ctxt->input == NULL) ||
368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369 (ctxt->input->buf->encoder != NULL))
370 return(NULL);
371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372 return(NULL);
373
374 start = ctxt->input->cur;
375 end = ctxt->input->end;
376 /* we also expect the input buffer to be zero terminated */
377 if (*end != 0)
378 return(NULL);
379
380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381 if (cur == NULL)
382 return(NULL);
383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384 if (cur == NULL)
385 return(NULL);
386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387 if (cur == NULL)
388 return(NULL);
389 cur += 8;
390 start = cur;
391 while (((*cur >= 'A') && (*cur <= 'Z')) ||
392 ((*cur >= 'a') && (*cur <= 'z')) ||
393 ((*cur >= '0') && (*cur <= '9')) ||
394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395 cur++;
396 if (cur == start)
397 return(NULL);
398 return(xmlStrndup(start, cur - start));
399}
400
401/**
402 * htmlCurrentChar:
403 * @ctxt: the HTML parser context
404 * @len: pointer to the length of the char read
405 *
406 * The current char value, if using UTF-8 this may actually span multiple
407 * bytes in the input buffer. Implement the end of line normalization:
408 * 2.11 End-of-Line Handling
409 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410 * char, then the encoding converter is plugged in automatically.
411 *
412 * Returns the current char value and its length
413 */
414
415static int
416htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700417 const unsigned char *cur;
418 unsigned char c;
419 unsigned int val;
420
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800421 if (ctxt->instate == XML_PARSER_EOF)
422 return(0);
423
424 if (ctxt->token != 0) {
425 *len = 0;
426 return(ctxt->token);
427 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700428 if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800429 xmlChar * guess;
430 xmlCharEncodingHandlerPtr handler;
431
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700432 /*
433 * Assume it's a fixed length encoding (1) with
434 * a compatible encoding for the ASCII set, since
435 * HTML constructs only use < 128 chars
436 */
437 if ((int) *ctxt->input->cur < 0x80) {
438 *len = 1;
439 if ((*ctxt->input->cur == 0) &&
440 (ctxt->input->cur < ctxt->input->end)) {
441 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442 "Char 0x%X out of allowed range\n", 0);
443 return(' ');
444 }
445 return((int) *ctxt->input->cur);
446 }
447
448 /*
449 * Humm this is bad, do an automatic flow conversion
450 */
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800451 guess = htmlFindEncoding(ctxt);
452 if (guess == NULL) {
453 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
454 } else {
455 if (ctxt->input->encoding != NULL)
456 xmlFree((xmlChar *) ctxt->input->encoding);
457 ctxt->input->encoding = guess;
458 handler = xmlFindCharEncodingHandler((const char *) guess);
459 if (handler != NULL) {
460 xmlSwitchToEncoding(ctxt, handler);
461 } else {
462 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
463 "Unsupported encoding %s", guess, NULL);
464 }
465 }
466 ctxt->charset = XML_CHAR_ENCODING_UTF8;
467 }
468
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700469 /*
470 * We are supposed to handle UTF8, check it's valid
471 * From rfc2044: encoding of the Unicode values on UTF-8:
472 *
473 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
474 * 0000 0000-0000 007F 0xxxxxxx
475 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
476 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
477 *
478 * Check for the 0x110000 limit too
479 */
480 cur = ctxt->input->cur;
481 c = *cur;
482 if (c & 0x80) {
483 if ((c & 0x40) == 0)
484 goto encoding_error;
485 if (cur[1] == 0) {
486 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
487 cur = ctxt->input->cur;
488 }
489 if ((cur[1] & 0xc0) != 0x80)
490 goto encoding_error;
491 if ((c & 0xe0) == 0xe0) {
492
493 if (cur[2] == 0) {
494 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
495 cur = ctxt->input->cur;
496 }
497 if ((cur[2] & 0xc0) != 0x80)
498 goto encoding_error;
499 if ((c & 0xf0) == 0xf0) {
500 if (cur[3] == 0) {
501 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
502 cur = ctxt->input->cur;
503 }
504 if (((c & 0xf8) != 0xf0) ||
505 ((cur[3] & 0xc0) != 0x80))
506 goto encoding_error;
507 /* 4-byte code */
508 *len = 4;
509 val = (cur[0] & 0x7) << 18;
510 val |= (cur[1] & 0x3f) << 12;
511 val |= (cur[2] & 0x3f) << 6;
512 val |= cur[3] & 0x3f;
513 if (val < 0x10000)
514 goto encoding_error;
515 } else {
516 /* 3-byte code */
517 *len = 3;
518 val = (cur[0] & 0xf) << 12;
519 val |= (cur[1] & 0x3f) << 6;
520 val |= cur[2] & 0x3f;
521 if (val < 0x800)
522 goto encoding_error;
523 }
524 } else {
525 /* 2-byte code */
526 *len = 2;
527 val = (cur[0] & 0x1f) << 6;
528 val |= cur[1] & 0x3f;
529 if (val < 0x80)
530 goto encoding_error;
531 }
532 if (!IS_CHAR(val)) {
533 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
534 "Char 0x%X out of allowed range\n", val);
535 }
536 return(val);
537 } else {
538 if ((*ctxt->input->cur == 0) &&
539 (ctxt->input->cur < ctxt->input->end)) {
540 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
541 "Char 0x%X out of allowed range\n", 0);
542 *len = 1;
543 return(' ');
544 }
545 /* 1-byte code */
546 *len = 1;
547 return((int) *ctxt->input->cur);
548 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800549
550encoding_error:
551 /*
552 * If we detect an UTF8 error that probably mean that the
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700553 * input encoding didn't get properly advertised in the
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800554 * declaration header. Report the error and switch the encoding
555 * to ISO-Latin-1 (if you don't like this policy, just declare the
556 * encoding !)
557 */
558 {
559 char buffer[150];
560
561 if (ctxt->input->end - ctxt->input->cur >= 4) {
562 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
563 ctxt->input->cur[0], ctxt->input->cur[1],
564 ctxt->input->cur[2], ctxt->input->cur[3]);
565 } else {
566 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
567 }
568 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
569 "Input is not proper UTF-8, indicate encoding !\n",
570 BAD_CAST buffer, NULL);
571 }
572
573 ctxt->charset = XML_CHAR_ENCODING_8859_1;
574 *len = 1;
575 return((int) *ctxt->input->cur);
576}
577
578/**
579 * htmlSkipBlankChars:
580 * @ctxt: the HTML parser context
581 *
582 * skip all blanks character found at that point in the input streams.
583 *
584 * Returns the number of space chars skipped
585 */
586
587static int
588htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
589 int res = 0;
590
591 while (IS_BLANK_CH(*(ctxt->input->cur))) {
592 if ((*ctxt->input->cur == 0) &&
593 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
594 xmlPopInput(ctxt);
595 } else {
596 if (*(ctxt->input->cur) == '\n') {
597 ctxt->input->line++; ctxt->input->col = 1;
598 } else ctxt->input->col++;
599 ctxt->input->cur++;
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800600 if (*ctxt->input->cur == 0)
601 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
602 }
603 res++;
604 }
605 return(res);
606}
607
608
609
610/************************************************************************
611 * *
612 * The list of HTML elements and their properties *
613 * *
614 ************************************************************************/
615
616/*
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700617 * Start Tag: 1 means the start tag can be omitted
618 * End Tag: 1 means the end tag can be omitted
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800619 * 2 means it's forbidden (empty elements)
620 * 3 means the tag is stylistic and should be closed easily
621 * Depr: this element is deprecated
622 * DTD: 1 means that this element is valid only in the Loose DTD
623 * 2 means that this element is valid only in the Frameset DTD
624 *
625 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
626 , subElements , impliedsubelt , Attributes, userdata
627 */
628
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each element class is a comma separated list of element names, paired
 * with an NB_xxx macro giving the number of names in the list. The NB_xxx
 * sums are parenthesized so they can be used inside larger expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
662
663
/*
 * ... and for HTML Attributes.
 *
 * Each attribute class is a comma separated list of attribute names,
 * paired with an NB_xxx macro giving the number of names in the list.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists of attribute names */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
683
684
685/* Other declarations that should go inline ... */
686static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
687 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
688 "tabindex", "onfocus", "onblur", NULL } ;
689static const char* const target_attr[] = { "target", NULL } ;
690static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
691static const char* const alt_attr[] = { "alt", NULL } ;
692static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
693static const char* const href_attrs[] = { "href", NULL } ;
694static const char* const clear_attrs[] = { "clear", NULL } ;
695static const char* const inline_p[] = { INLINE, "p", NULL } ;
696
697static const char* const flow_param[] = { FLOW, "param", NULL } ;
698static const char* const applet_attrs[] = { COREATTRS , "codebase",
699 "archive", "alt", "name", "height", "width", "align",
700 "hspace", "vspace", NULL } ;
701static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
702 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
703static const char* const basefont_attrs[] =
704 { "id", "size", "color", "face", NULL } ;
705static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
706static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
707static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
708static const char* const body_depr[] = { "background", "bgcolor", "text",
709 "link", "vlink", "alink", NULL } ;
710static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
711 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
712
713
714static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
715static const char* const col_elt[] = { "col", NULL } ;
716static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
717static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
718static const char* const dl_contents[] = { "dt", "dd", NULL } ;
719static const char* const compact_attr[] = { "compact", NULL } ;
720static const char* const label_attr[] = { "label", NULL } ;
721static const char* const fieldset_contents[] = { FLOW, "legend" } ;
722static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
723static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
724static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
725static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
726static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
727static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
728static const char* const head_attrs[] = { I18N, "profile", NULL } ;
729static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
730static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
731static const char* const version_attr[] = { "version", NULL } ;
732static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
733static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
734static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
735static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
736static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
737static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
738static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
739static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
740static const char* const align_attr[] = { "align", NULL } ;
741static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
742static const char* const map_contents[] = { BLOCK, "area", NULL } ;
743static const char* const name_attr[] = { "name", NULL } ;
744static const char* const action_attr[] = { "action", NULL } ;
745static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
746static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
747static const char* const content_attr[] = { "content", NULL } ;
748static const char* const type_attr[] = { "type", NULL } ;
749static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
750static const char* const object_contents[] = { FLOW, "param", NULL } ;
751static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
752static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
753static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
754static const char* const option_elt[] = { "option", NULL } ;
755static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
756static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
757static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
758static const char* const width_attr[] = { "width", NULL } ;
759static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
760static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
761static const char* const language_attr[] = { "language", NULL } ;
762static const char* const select_content[] = { "optgroup", "option", NULL } ;
763static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
764static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
765static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
766static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
767static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
768static const char* const tr_elt[] = { "tr", NULL } ;
769static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
770static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
771static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
772static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
773static const char* const tr_contents[] = { "th", "td", NULL } ;
774static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
775static const char* const li_elt[] = { "li", NULL } ;
776static const char* const ul_depr[] = { "type", "compact", NULL} ;
777static const char* const dir_attr[] = { "dir", NULL} ;
778
779#define DECL (const char**)
780
781static const htmlElemDesc
782html40ElementTable[] = {
783{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
784 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
785},
786{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
787 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
788},
789{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
790 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
791},
792{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
793 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
794},
795{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
796 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
797},
798{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
799 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
800},
801{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
802 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
803},
804{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
805 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
806},
807{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
808 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
809},
810{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
811 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
812},
813{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
814 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
815},
816{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
817 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
818},
819{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
820 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
821},
822{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
823 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
824},
825{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
826 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
827},
828{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
829 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
830},
831{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
832 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
833},
834{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
835 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
836},
837{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
838 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
839},
840{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
841 EMPTY , NULL , DECL col_attrs , NULL, NULL
842},
843{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
844 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
845},
846{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
847 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
848},
849{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
850 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
851},
852{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
853 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
854},
855{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
856 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
857},
858{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
859 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
860},
861{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
862 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
863},
864{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
865 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
866},
867{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
868 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
869},
870{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
871 EMPTY, NULL, DECL embed_attrs, NULL, NULL
872},
873{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
874 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
875},
876{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
877 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
878},
879{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
880 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
881},
882{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
883 EMPTY, NULL, NULL, DECL frame_attrs, NULL
884},
885{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
886 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
887},
888{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
889 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
890},
891{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
892 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
893},
894{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
895 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
896},
897{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
898 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
899},
900{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
901 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
902},
903{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
904 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
905},
906{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
907 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
908},
909{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
910 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
911},
912{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
913 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
914},
915{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
916 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
917},
918{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
919 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
920},
921{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
922 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
923},
924{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
925 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
926},
927{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
928 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
929},
930{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
931 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
932},
933{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
934 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
935},
936{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
937 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
938},
939{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
940 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
941},
942{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
943 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
944},
945{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
946 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
947},
948{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
949 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
950},
951{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
952 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
953},
954{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
955 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
956},
957{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
958 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
959},
960{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
961 DECL html_flow, "div", DECL html_attrs, NULL, NULL
962},
963{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
964 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
965},
966{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
967 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
968},
969{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
970 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
971},
972{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
973 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
974},
975{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
976 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
977},
978{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
979 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
980},
981{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
982 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
983},
984{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
985 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
986},
987{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
988 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
989},
990{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
991 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
992},
993{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
994 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
995},
996{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
997 DECL select_content, NULL, DECL select_attrs, NULL, NULL
998},
999{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
1000 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1001},
1002{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1003 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1004},
1005{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
1006 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1007},
1008{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1009 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1010},
1011{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1012 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1013},
1014{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1015 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1016},
1017{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1018 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1019},
1020{ "table", 0, 0, 0, 0, 0, 0, 0, "",
1021 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1022},
1023{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1024 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1025},
1026{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1027 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1028},
1029{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1030 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1031},
1032{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1033 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1034},
1035{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1036 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1037},
1038{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1039 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1040},
1041{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1042 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1043},
1044{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1045 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1046},
1047{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1048 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1049},
1050{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1051 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1052},
1053{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1054 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1055},
1056{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1057 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1058}
1059};
1060
1061/*
1062 * start tags that imply the end of current element
1063 */
1064static const char * const htmlStartClose[] = {
1065"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1066 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1067 "listing", "xmp", "head", NULL,
1068"head", "p", NULL,
1069"title", "p", NULL,
1070"body", "head", "style", "link", "title", "p", NULL,
1071"frameset", "head", "style", "link", "title", "p", NULL,
1072"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1073 "pre", "listing", "xmp", "head", "li", NULL,
1074"hr", "p", "head", NULL,
1075"h1", "p", "head", NULL,
1076"h2", "p", "head", NULL,
1077"h3", "p", "head", NULL,
1078"h4", "p", "head", NULL,
1079"h5", "p", "head", NULL,
1080"h6", "p", "head", NULL,
1081"dir", "p", "head", NULL,
1082"address", "p", "head", "ul", NULL,
1083"pre", "p", "head", "ul", NULL,
1084"listing", "p", "head", NULL,
1085"xmp", "p", "head", NULL,
1086"blockquote", "p", "head", NULL,
1087"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1088 "xmp", "head", NULL,
1089"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1090 "head", "dd", NULL,
1091"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1092 "head", "dt", NULL,
1093"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1094 "listing", "xmp", NULL,
1095"ol", "p", "head", "ul", NULL,
1096"menu", "p", "head", "ul", NULL,
1097"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
1098"div", "p", "head", NULL,
1099"noscript", "script", NULL,
1100"center", "font", "b", "i", "p", "head", NULL,
1101"a", "a", "head", NULL,
1102"caption", "p", NULL,
1103"colgroup", "caption", "colgroup", "col", "p", NULL,
1104"col", "caption", "col", "p", NULL,
1105"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1106 "listing", "xmp", "a", NULL,
1107"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1108"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1109"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1110"thead", "caption", "col", "colgroup", NULL,
1111"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1112 "tbody", "p", NULL,
1113"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1114 "tfoot", "tbody", "p", NULL,
1115"optgroup", "option", NULL,
1116"option", "option", NULL,
1117"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1118 "pre", "listing", "xmp", "a", NULL,
1119/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */
1120"tt", "head", NULL,
1121"i", "head", NULL,
1122"b", "head", NULL,
1123"u", "head", NULL,
1124"s", "head", NULL,
1125"strike", "head", NULL,
1126"big", "head", NULL,
1127"small", "head", NULL,
1128
1129"em", "head", NULL,
1130"strong", "head", NULL,
1131"dfn", "head", NULL,
1132"code", "head", NULL,
1133"samp", "head", NULL,
1134"kbd", "head", NULL,
1135"var", "head", NULL,
1136"cite", "head", NULL,
1137"abbr", "head", NULL,
1138"acronym", "head", NULL,
1139
1140/* "a" */
1141"img", "head", NULL,
1142/* "applet" */
1143/* "embed" */
1144/* "object" */
1145"font", "head", NULL,
1146/* "basefont" */
1147"br", "head", NULL,
1148/* "script" */
1149"map", "head", NULL,
1150"q", "head", NULL,
1151"sub", "head", NULL,
1152"sup", "head", NULL,
1153"span", "head", NULL,
1154"bdo", "head", NULL,
1155"iframe", "head", NULL,
1156NULL
1157};
1158
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head", NULL
};
1171
/*
 * The HTML attributes whose content is of type %Script;
 * NOTE: before adding an entry, check htmlIsScriptAttribute() since
 *       it assumes that every name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document, focus and form events */
    "onload", "onunload",
    "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1197
/*
 * Priority table used by the HTML parser to cope with broken pages.
 * Giving different priorities to different elements lets the parser
 * decide what to do with an extra end tag: an end tag is only allowed
 * to close elements whose priority is lower than or equal to its own.
 */

/* One (element name, end tag priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The entry with a NULL name ends the table; its priority is the one
 * used for every element not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1225
/*
 * Index over htmlStartClose, filled in by htmlInitAutoClose(): each used
 * slot points to the first string of one group, unused slots are NULL.
 */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlInitAutoClose() has filled the index. */
static int htmlStartCloseIndexinitialized;
1228
1229/************************************************************************
1230 * *
1231 * functions to handle HTML specific data *
1232 * *
1233 ************************************************************************/
1234
1235/**
1236 * htmlInitAutoClose:
1237 *
1238 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1239 * This is not reentrant. Call xmlInitParser() once before processing in
1240 * case of use in multithreaded programs.
1241 */
1242void
1243htmlInitAutoClose(void) {
1244 int indx, i = 0;
1245
1246 if (htmlStartCloseIndexinitialized) return;
1247
1248 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1249 indx = 0;
1250 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1251 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1252 while (htmlStartClose[i] != NULL) i++;
1253 i++;
1254 }
1255 htmlStartCloseIndexinitialized = 1;
1256}
1257
1258/**
1259 * htmlTagLookup:
1260 * @tag: The tag name in lowercase
1261 *
1262 * Lookup the HTML tag in the ElementTable
1263 *
1264 * Returns the related htmlElemDescPtr or NULL if not found.
1265 */
1266const htmlElemDesc *
1267htmlTagLookup(const xmlChar *tag) {
1268 unsigned int i;
1269
1270 for (i = 0; i < (sizeof(html40ElementTable) /
1271 sizeof(html40ElementTable[0]));i++) {
1272 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1273 return((htmlElemDescPtr) &html40ElementTable[i]);
1274 }
1275 return(NULL);
1276}
1277
1278/**
1279 * htmlGetEndPriority:
1280 * @name: The name of the element to look up the priority for.
1281 *
1282 * Return value: The "endtag" priority.
1283 **/
1284static int
1285htmlGetEndPriority (const xmlChar *name) {
1286 int i = 0;
1287
1288 while ((htmlEndPriority[i].name != NULL) &&
1289 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1290 i++;
1291
1292 return(htmlEndPriority[i].priority);
1293}
1294
1295
1296/**
1297 * htmlCheckAutoClose:
1298 * @newtag: The new tag name
1299 * @oldtag: The old tag name
1300 *
1301 * Checks whether the new tag is one of the registered valid tags for
1302 * closing old.
1303 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1304 *
1305 * Returns 0 if no, 1 if yes.
1306 */
1307static int
1308htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1309{
1310 int i, indx;
1311 const char **closed = NULL;
1312
1313 if (htmlStartCloseIndexinitialized == 0)
1314 htmlInitAutoClose();
1315
1316 /* inefficient, but not a big deal */
1317 for (indx = 0; indx < 100; indx++) {
1318 closed = htmlStartCloseIndex[indx];
1319 if (closed == NULL)
1320 return (0);
1321 if (xmlStrEqual(BAD_CAST * closed, newtag))
1322 break;
1323 }
1324
1325 i = closed - htmlStartClose;
1326 i++;
1327 while (htmlStartClose[i] != NULL) {
1328 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1329 return (1);
1330 }
1331 i++;
1332 }
1333 return (0);
1334}
1335
1336/**
1337 * htmlAutoCloseOnClose:
1338 * @ctxt: an HTML parser context
1339 * @newtag: The new tag name
1340 * @force: force the tag closure
1341 *
1342 * The HTML DTD allows an ending tag to implicitly close other tags.
1343 */
1344static void
1345htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1346{
1347 const htmlElemDesc *info;
1348 int i, priority;
1349
1350 priority = htmlGetEndPriority(newtag);
1351
1352 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1353
1354 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1355 break;
1356 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07001357 * A misplaced endtag can only close elements with lower
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001358 * or equal priority, so if we find an element with higher
1359 * priority before we find an element with
1360 * matching name, we just ignore this endtag
1361 */
1362 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1363 return;
1364 }
1365 if (i < 0)
1366 return;
1367
1368 while (!xmlStrEqual(newtag, ctxt->name)) {
1369 info = htmlTagLookup(ctxt->name);
1370 if ((info != NULL) && (info->endTag == 3)) {
1371 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1372 "Opening and ending tag mismatch: %s and %s\n",
1373 newtag, ctxt->name);
1374 }
1375 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1376 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1377 htmlnamePop(ctxt);
1378 }
1379}
1380
1381/**
1382 * htmlAutoCloseOnEnd:
1383 * @ctxt: an HTML parser context
1384 *
1385 * Close all remaining tags at the end of the stream
1386 */
1387static void
1388htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1389{
1390 int i;
1391
1392 if (ctxt->nameNr == 0)
1393 return;
1394 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1395 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1396 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1397 htmlnamePop(ctxt);
1398 }
1399}
1400
1401/**
1402 * htmlAutoClose:
1403 * @ctxt: an HTML parser context
1404 * @newtag: The new tag name or NULL
1405 *
1406 * The HTML DTD allows a tag to implicitly close other tags.
1407 * The list is kept in htmlStartClose array. This function is
1408 * called when a new tag has been detected and generates the
1409 * appropriates closes if possible/needed.
1410 * If newtag is NULL this mean we are at the end of the resource
1411 * and we should check
1412 */
1413static void
1414htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1415{
1416 while ((newtag != NULL) && (ctxt->name != NULL) &&
1417 (htmlCheckAutoClose(newtag, ctxt->name))) {
1418 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1420 htmlnamePop(ctxt);
1421 }
1422 if (newtag == NULL) {
1423 htmlAutoCloseOnEnd(ctxt);
1424 return;
1425 }
1426 while ((newtag == NULL) && (ctxt->name != NULL) &&
1427 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1428 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1429 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1430 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1431 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1432 htmlnamePop(ctxt);
1433 }
1434}
1435
1436/**
1437 * htmlAutoCloseTag:
1438 * @doc: the HTML document
1439 * @name: The tag name
1440 * @elem: the HTML element
1441 *
1442 * The HTML DTD allows a tag to implicitly close other tags.
1443 * The list is kept in htmlStartClose array. This function checks
1444 * if the element or one of it's children would autoclose the
1445 * given tag.
1446 *
1447 * Returns 1 if autoclose, 0 otherwise
1448 */
1449int
1450htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1451 htmlNodePtr child;
1452
1453 if (elem == NULL) return(1);
1454 if (xmlStrEqual(name, elem->name)) return(0);
1455 if (htmlCheckAutoClose(elem->name, name)) return(1);
1456 child = elem->children;
1457 while (child != NULL) {
1458 if (htmlAutoCloseTag(doc, name, child)) return(1);
1459 child = child->next;
1460 }
1461 return(0);
1462}
1463
1464/**
1465 * htmlIsAutoClosed:
1466 * @doc: the HTML document
1467 * @elem: the HTML element
1468 *
1469 * The HTML DTD allows a tag to implicitly close other tags.
1470 * The list is kept in htmlStartClose array. This function checks
1471 * if a tag is autoclosed by one of it's child
1472 *
1473 * Returns 1 if autoclosed, 0 otherwise
1474 */
1475int
1476htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1477 htmlNodePtr child;
1478
1479 if (elem == NULL) return(1);
1480 child = elem->children;
1481 while (child != NULL) {
1482 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1483 child = child->next;
1484 }
1485 return(0);
1486}
1487
1488/**
1489 * htmlCheckImplied:
1490 * @ctxt: an HTML parser context
1491 * @newtag: The new tag name
1492 *
1493 * The HTML DTD allows a tag to exists only implicitly
1494 * called when a new tag has been detected and generates the
1495 * appropriates implicit tags if missing
1496 */
1497static void
1498htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1499 int i;
1500
1501 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1502 return;
1503 if (!htmlOmittedDefaultValue)
1504 return;
1505 if (xmlStrEqual(newtag, BAD_CAST"html"))
1506 return;
1507 if (ctxt->nameNr <= 0) {
1508 htmlnamePush(ctxt, BAD_CAST"html");
1509 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1510 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1511 }
1512 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1513 return;
1514 if ((ctxt->nameNr <= 1) &&
1515 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1516 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1517 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1518 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1519 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1520 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1521 if (ctxt->html >= 3) {
1522 /* we already saw or generated an <head> before */
1523 return;
1524 }
1525 /*
1526 * dropped OBJECT ... i you put it first BODY will be
1527 * assumed !
1528 */
1529 htmlnamePush(ctxt, BAD_CAST"head");
1530 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1531 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1532 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1533 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1534 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1535 if (ctxt->html >= 10) {
1536 /* we already saw or generated a <body> before */
1537 return;
1538 }
1539 for (i = 0;i < ctxt->nameNr;i++) {
1540 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1541 return;
1542 }
1543 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1544 return;
1545 }
1546 }
1547
1548 htmlnamePush(ctxt, BAD_CAST"body");
1549 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1550 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1551 }
1552}
1553
1554/**
1555 * htmlCheckParagraph
1556 * @ctxt: an HTML parser context
1557 *
1558 * Check whether a p element need to be implied before inserting
1559 * characters in the current element.
1560 *
1561 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1562 * in case of error.
1563 */
1564
1565static int
1566htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1567 const xmlChar *tag;
1568 int i;
1569
1570 if (ctxt == NULL)
1571 return(-1);
1572 tag = ctxt->name;
1573 if (tag == NULL) {
1574 htmlAutoClose(ctxt, BAD_CAST"p");
1575 htmlCheckImplied(ctxt, BAD_CAST"p");
1576 htmlnamePush(ctxt, BAD_CAST"p");
1577 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1578 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1579 return(1);
1580 }
1581 if (!htmlOmittedDefaultValue)
1582 return(0);
1583 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1584 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1585 htmlAutoClose(ctxt, BAD_CAST"p");
1586 htmlCheckImplied(ctxt, BAD_CAST"p");
1587 htmlnamePush(ctxt, BAD_CAST"p");
1588 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1589 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1590 return(1);
1591 }
1592 }
1593 return(0);
1594}
1595
1596/**
1597 * htmlIsScriptAttribute:
1598 * @name: an attribute name
1599 *
1600 * Check if an attribute is of content type Script
1601 *
1602 * Returns 1 is the attribute is a script 0 otherwise
1603 */
1604int
1605htmlIsScriptAttribute(const xmlChar *name) {
1606 unsigned int i;
1607
1608 if (name == NULL)
1609 return(0);
1610 /*
1611 * all script attributes start with 'on'
1612 */
1613 if ((name[0] != 'o') || (name[1] != 'n'))
1614 return(0);
1615 for (i = 0;
1616 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1617 i++) {
1618 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1619 return(1);
1620 }
1621 return(0);
1622}
1623
1624/************************************************************************
1625 * *
1626 * The list of HTML predefined entities *
1627 * *
1628 ************************************************************************/
1629
1630
1631static const htmlEntityDesc html40EntitiesTable[] = {
1632/*
1633 * the 4 absolute ones, plus apostrophe.
1634 */
1635{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1636{ 38, "amp", "ampersand, U+0026 ISOnum" },
1637{ 39, "apos", "single quote" },
1638{ 60, "lt", "less-than sign, U+003C ISOnum" },
1639{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1640
1641/*
1642 * A bunch still in the 128-255 range
1643 * Replacing them depend really on the charset used.
1644 */
1645{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1646{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1647{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1648{ 163, "pound","pound sign, U+00A3 ISOnum" },
1649{ 164, "curren","currency sign, U+00A4 ISOnum" },
1650{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1651{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1652{ 167, "sect", "section sign, U+00A7 ISOnum" },
1653{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1654{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1655{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1656{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1657{ 172, "not", "not sign, U+00AC ISOnum" },
1658{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1659{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1660{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1661{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1662{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1663{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1664{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1665{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1666{ 181, "micro","micro sign, U+00B5 ISOnum" },
1667{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1668{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1669{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1670{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1671{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1672{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1673{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1674{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1675{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1676{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1677{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1678{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1679{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1680{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1681{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1682{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1683{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1684{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1685{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1686{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1687{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1688{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1689{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1690{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1691{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1692{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1693{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1694{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1695{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1696{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1697{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1698{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1699{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1700{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1701{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1702{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1703{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1704{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1705{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1706{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1707{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1708{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1709{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1710{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1711{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1712{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1713{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1714{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1715{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1716{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1717{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1718{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1719{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1720{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1721{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1722{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1723{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1724{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1725{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1726{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1727{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1728{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1729{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1730{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1731{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1732{ 247, "divide","division sign, U+00F7 ISOnum" },
1733{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1734{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1735{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1736{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1737{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1738{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1739{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1740{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1741
1742{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1743{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1744{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1745{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1746{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1747
1748/*
1749 * Anything below should really be kept as entities references
1750 */
1751{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1752
1753{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1754{ 732, "tilde","small tilde, U+02DC ISOdia" },
1755
1756{ 913, "Alpha","greek capital letter alpha, U+0391" },
1757{ 914, "Beta", "greek capital letter beta, U+0392" },
1758{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1759{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1760{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1761{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1762{ 919, "Eta", "greek capital letter eta, U+0397" },
1763{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1764{ 921, "Iota", "greek capital letter iota, U+0399" },
1765{ 922, "Kappa","greek capital letter kappa, U+039A" },
1766{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1767{ 924, "Mu", "greek capital letter mu, U+039C" },
1768{ 925, "Nu", "greek capital letter nu, U+039D" },
1769{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1770{ 927, "Omicron","greek capital letter omicron, U+039F" },
1771{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1772{ 929, "Rho", "greek capital letter rho, U+03A1" },
1773{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1774{ 932, "Tau", "greek capital letter tau, U+03A4" },
1775{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1776{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1777{ 935, "Chi", "greek capital letter chi, U+03A7" },
1778{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1779{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1780
1781{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1782{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1783{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1784{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1785{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1786{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1787{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1788{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1789{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1790{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1791{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1792{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1793{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1794{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1795{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1796{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1797{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1798{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1799{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1800{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1801{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1802{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1803{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1804{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1805{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1806{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1807{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1808{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1809
1810{ 8194, "ensp", "en space, U+2002 ISOpub" },
1811{ 8195, "emsp", "em space, U+2003 ISOpub" },
1812{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1813{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1814{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1815{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1816{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1817{ 8211, "ndash","en dash, U+2013 ISOpub" },
1818{ 8212, "mdash","em dash, U+2014 ISOpub" },
1819{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1820{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1821{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1822{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1823{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1824{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1825{ 8224, "dagger","dagger, U+2020 ISOpub" },
1826{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1827
1828{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1829{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1830
1831{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1832
1833{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1834{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1835
1836{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1837{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1838
1839{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1840{ 8260, "frasl","fraction slash, U+2044 NEW" },
1841
1842{ 8364, "euro", "euro sign, U+20AC NEW" },
1843
1844{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1845{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1846{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1847{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1848{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1849{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1850{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1851{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1852{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1853{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1854{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1855{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1856{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1857{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1858{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1859{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1860
1861{ 8704, "forall","for all, U+2200 ISOtech" },
1862{ 8706, "part", "partial differential, U+2202 ISOtech" },
1863{ 8707, "exist","there exists, U+2203 ISOtech" },
1864{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1865{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1866{ 8712, "isin", "element of, U+2208 ISOtech" },
1867{ 8713, "notin","not an element of, U+2209 ISOtech" },
1868{ 8715, "ni", "contains as member, U+220B ISOtech" },
1869{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1870{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1871{ 8722, "minus","minus sign, U+2212 ISOtech" },
1872{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1873{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1874{ 8733, "prop", "proportional to, U+221D ISOtech" },
1875{ 8734, "infin","infinity, U+221E ISOtech" },
1876{ 8736, "ang", "angle, U+2220 ISOamso" },
1877{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1878{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1879{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1880{ 8746, "cup", "union = cup, U+222A ISOtech" },
1881{ 8747, "int", "integral, U+222B ISOtech" },
1882{ 8756, "there4","therefore, U+2234 ISOtech" },
1883{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1884{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1885{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1886{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1887{ 8801, "equiv","identical to, U+2261 ISOtech" },
1888{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1889{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1890{ 8834, "sub", "subset of, U+2282 ISOtech" },
1891{ 8835, "sup", "superset of, U+2283 ISOtech" },
1892{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1893{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1894{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1895{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1896{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1897{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1898{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1899{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1900{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1901{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1902{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1903{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1904{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1905{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1906
1907{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1908{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1909{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1910{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1911
1912};
1913
1914/************************************************************************
1915 * *
1916 * Commodity functions to handle entities *
1917 * *
1918 ************************************************************************/
1919
/*
 * growBuffer:
 * Double the capacity of a heap buffer in place.
 *
 * The expansion relies on the caller's scope: it needs an integer variable
 * named <buffer>_size holding the current capacity and a parser context
 * named ctxt. If the reallocation fails, the error is reported through
 * htmlErrMemory(), the old buffer is released and the *enclosing function*
 * returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {							\
	buffer = tmp;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1934
1935/**
1936 * htmlEntityLookup:
1937 * @name: the entity name
1938 *
1939 * Lookup the given entity in EntitiesTable
1940 *
1941 * TODO: the linear scan is really ugly, an hash table is really needed.
1942 *
1943 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1944 */
1945const htmlEntityDesc *
1946htmlEntityLookup(const xmlChar *name) {
1947 unsigned int i;
1948
1949 for (i = 0;i < (sizeof(html40EntitiesTable)/
1950 sizeof(html40EntitiesTable[0]));i++) {
1951 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1952 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1953 }
1954 }
1955 return(NULL);
1956}
1957
1958/**
1959 * htmlEntityValueLookup:
1960 * @value: the entity's unicode value
1961 *
1962 * Lookup the given entity in EntitiesTable
1963 *
1964 * TODO: the linear scan is really ugly, an hash table is really needed.
1965 *
1966 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1967 */
1968const htmlEntityDesc *
1969htmlEntityValueLookup(unsigned int value) {
1970 unsigned int i;
1971
1972 for (i = 0;i < (sizeof(html40EntitiesTable)/
1973 sizeof(html40EntitiesTable[0]));i++) {
1974 if (html40EntitiesTable[i].value >= value) {
1975 if (html40EntitiesTable[i].value > value)
1976 break;
1977 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1978 }
1979 }
1980 return(NULL);
1981}
1982
1983/**
1984 * UTF8ToHtml:
1985 * @out: a pointer to an array of bytes to store the result
1986 * @outlen: the length of @out
1987 * @in: a pointer to an array of UTF-8 chars
1988 * @inlen: the length of @in
1989 *
1990 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1991 * plus HTML entities block of chars out.
1992 *
1993 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1994 * The value of @inlen after return is the number of octets consumed
1995 * as the return value is positive, else unpredictable.
1996 * The value of @outlen after return is the number of octets consumed.
1997 */
1998int
1999UTF8ToHtml(unsigned char* out, int *outlen,
2000 const unsigned char* in, int *inlen) {
2001 const unsigned char* processed = in;
2002 const unsigned char* outend;
2003 const unsigned char* outstart = out;
2004 const unsigned char* instart = in;
2005 const unsigned char* inend;
2006 unsigned int c, d;
2007 int trailing;
2008
2009 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2010 if (in == NULL) {
2011 /*
2012 * initialization nothing to do
2013 */
2014 *outlen = 0;
2015 *inlen = 0;
2016 return(0);
2017 }
2018 inend = in + (*inlen);
2019 outend = out + (*outlen);
2020 while (in < inend) {
2021 d = *in++;
2022 if (d < 0x80) { c= d; trailing= 0; }
2023 else if (d < 0xC0) {
2024 /* trailing byte in leading position */
2025 *outlen = out - outstart;
2026 *inlen = processed - instart;
2027 return(-2);
2028 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2029 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2030 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2031 else {
2032 /* no chance for this in Ascii */
2033 *outlen = out - outstart;
2034 *inlen = processed - instart;
2035 return(-2);
2036 }
2037
2038 if (inend - in < trailing) {
2039 break;
2040 }
2041
2042 for ( ; trailing; trailing--) {
2043 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2044 break;
2045 c <<= 6;
2046 c |= d & 0x3F;
2047 }
2048
2049 /* assertion: c is a single UTF-4 value */
2050 if (c < 0x80) {
2051 if (out + 1 >= outend)
2052 break;
2053 *out++ = c;
2054 } else {
2055 int len;
2056 const htmlEntityDesc * ent;
2057 const char *cp;
2058 char nbuf[16];
2059
2060 /*
2061 * Try to lookup a predefined HTML entity for it
2062 */
2063
2064 ent = htmlEntityValueLookup(c);
2065 if (ent == NULL) {
2066 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2067 cp = nbuf;
2068 }
2069 else
2070 cp = ent->name;
2071 len = strlen(cp);
2072 if (out + 2 + len >= outend)
2073 break;
2074 *out++ = '&';
2075 memcpy(out, cp, len);
2076 out += len;
2077 *out++ = ';';
2078 }
2079 processed = in;
2080 }
2081 *outlen = out - outstart;
2082 *inlen = processed - instart;
2083 return(0);
2084}
2085
2086/**
2087 * htmlEncodeEntities:
2088 * @out: a pointer to an array of bytes to store the result
2089 * @outlen: the length of @out
2090 * @in: a pointer to an array of UTF-8 chars
2091 * @inlen: the length of @in
2092 * @quoteChar: the quote character to escape (' or ") or zero.
2093 *
2094 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2095 * plus HTML entities block of chars out.
2096 *
2097 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2098 * The value of @inlen after return is the number of octets consumed
2099 * as the return value is positive, else unpredictable.
2100 * The value of @outlen after return is the number of octets consumed.
2101 */
2102int
2103htmlEncodeEntities(unsigned char* out, int *outlen,
2104 const unsigned char* in, int *inlen, int quoteChar) {
2105 const unsigned char* processed = in;
2106 const unsigned char* outend;
2107 const unsigned char* outstart = out;
2108 const unsigned char* instart = in;
2109 const unsigned char* inend;
2110 unsigned int c, d;
2111 int trailing;
2112
2113 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2114 return(-1);
2115 outend = out + (*outlen);
2116 inend = in + (*inlen);
2117 while (in < inend) {
2118 d = *in++;
2119 if (d < 0x80) { c= d; trailing= 0; }
2120 else if (d < 0xC0) {
2121 /* trailing byte in leading position */
2122 *outlen = out - outstart;
2123 *inlen = processed - instart;
2124 return(-2);
2125 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2126 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2127 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2128 else {
2129 /* no chance for this in Ascii */
2130 *outlen = out - outstart;
2131 *inlen = processed - instart;
2132 return(-2);
2133 }
2134
2135 if (inend - in < trailing)
2136 break;
2137
2138 while (trailing--) {
2139 if (((d= *in++) & 0xC0) != 0x80) {
2140 *outlen = out - outstart;
2141 *inlen = processed - instart;
2142 return(-2);
2143 }
2144 c <<= 6;
2145 c |= d & 0x3F;
2146 }
2147
2148 /* assertion: c is a single UTF-4 value */
2149 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2150 (c != '&') && (c != '<') && (c != '>')) {
2151 if (out >= outend)
2152 break;
2153 *out++ = c;
2154 } else {
2155 const htmlEntityDesc * ent;
2156 const char *cp;
2157 char nbuf[16];
2158 int len;
2159
2160 /*
2161 * Try to lookup a predefined HTML entity for it
2162 */
2163 ent = htmlEntityValueLookup(c);
2164 if (ent == NULL) {
2165 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2166 cp = nbuf;
2167 }
2168 else
2169 cp = ent->name;
2170 len = strlen(cp);
2171 if (out + 2 + len > outend)
2172 break;
2173 *out++ = '&';
2174 memcpy(out, cp, len);
2175 out += len;
2176 *out++ = ';';
2177 }
2178 processed = in;
2179 }
2180 *outlen = out - outstart;
2181 *inlen = processed - instart;
2182 return(0);
2183}
2184
2185/************************************************************************
2186 * *
2187 * Commodity functions to handle streams *
2188 * *
2189 ************************************************************************/
2190
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002191#ifdef LIBXML_PUSH_ENABLED
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002192/**
2193 * htmlNewInputStream:
2194 * @ctxt: an HTML parser context
2195 *
2196 * Create a new input stream structure
2197 * Returns the new input stream or NULL
2198 */
2199static htmlParserInputPtr
2200htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2201 htmlParserInputPtr input;
2202
2203 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2204 if (input == NULL) {
2205 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2206 return(NULL);
2207 }
2208 memset(input, 0, sizeof(htmlParserInput));
2209 input->filename = NULL;
2210 input->directory = NULL;
2211 input->base = NULL;
2212 input->cur = NULL;
2213 input->buf = NULL;
2214 input->line = 1;
2215 input->col = 1;
2216 input->buf = NULL;
2217 input->free = NULL;
2218 input->version = NULL;
2219 input->consumed = 0;
2220 input->length = 0;
2221 return(input);
2222}
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002223#endif
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002224
2225
2226/************************************************************************
2227 * *
2228 * Commodity functions, cleanup needed ? *
2229 * *
2230 ************************************************************************/
/*
 * Names of all tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2245
2246/**
2247 * areBlanks:
2248 * @ctxt: an HTML parser context
2249 * @str: a xmlChar *
2250 * @len: the size of @str
2251 *
2252 * Is this a sequence of blank chars that one can ignore ?
2253 *
2254 * Returns 1 if ignorable 0 otherwise.
2255 */
2256
2257static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2258 unsigned int i;
2259 int j;
2260 xmlNodePtr lastChild;
2261 xmlDtdPtr dtd;
2262
2263 for (j = 0;j < len;j++)
2264 if (!(IS_BLANK_CH(str[j]))) return(0);
2265
2266 if (CUR == 0) return(1);
2267 if (CUR != '<') return(0);
2268 if (ctxt->name == NULL)
2269 return(1);
2270 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2271 return(1);
2272 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2273 return(1);
2274
2275 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2276 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2277 dtd = xmlGetIntSubset(ctxt->myDoc);
2278 if (dtd != NULL && dtd->ExternalID != NULL) {
2279 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2280 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2281 return(1);
2282 }
2283 }
2284
2285 if (ctxt->node == NULL) return(0);
2286 lastChild = xmlGetLastChild(ctxt->node);
2287 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2288 lastChild = lastChild->prev;
2289 if (lastChild == NULL) {
2290 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2291 (ctxt->node->content != NULL)) return(0);
2292 /* keep ws in constructs like ...<b> </b>...
2293 for all tags "b" allowing PCDATA */
2294 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2295 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2296 return(0);
2297 }
2298 }
2299 } else if (xmlNodeIsText(lastChild)) {
2300 return(0);
2301 } else {
2302 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2303 for all tags "p" allowing PCDATA */
2304 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2305 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2306 return(0);
2307 }
2308 }
2309 }
2310 return(1);
2311}
2312
2313/**
2314 * htmlNewDocNoDtD:
2315 * @URI: URI for the dtd, or NULL
2316 * @ExternalID: the external ID of the DTD, or NULL
2317 *
2318 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2319 * are NULL
2320 *
2321 * Returns a new document, do not initialize the DTD if not provided
2322 */
2323htmlDocPtr
2324htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2325 xmlDocPtr cur;
2326
2327 /*
2328 * Allocate a new document and fill the fields.
2329 */
2330 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2331 if (cur == NULL) {
2332 htmlErrMemory(NULL, "HTML document creation failed\n");
2333 return(NULL);
2334 }
2335 memset(cur, 0, sizeof(xmlDoc));
2336
2337 cur->type = XML_HTML_DOCUMENT_NODE;
2338 cur->version = NULL;
2339 cur->intSubset = NULL;
2340 cur->doc = cur;
2341 cur->name = NULL;
2342 cur->children = NULL;
2343 cur->extSubset = NULL;
2344 cur->oldNs = NULL;
2345 cur->encoding = NULL;
2346 cur->standalone = 1;
2347 cur->compression = 0;
2348 cur->ids = NULL;
2349 cur->refs = NULL;
2350 cur->_private = NULL;
2351 cur->charset = XML_CHAR_ENCODING_UTF8;
2352 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2353 if ((ExternalID != NULL) ||
2354 (URI != NULL))
2355 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2356 return(cur);
2357}
2358
2359/**
2360 * htmlNewDoc:
2361 * @URI: URI for the dtd, or NULL
2362 * @ExternalID: the external ID of the DTD, or NULL
2363 *
2364 * Creates a new HTML document
2365 *
2366 * Returns a new document
2367 */
2368htmlDocPtr
2369htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2370 if ((URI == NULL) && (ExternalID == NULL))
2371 return(htmlNewDocNoDtD(
2372 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2373 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2374
2375 return(htmlNewDocNoDtD(URI, ExternalID));
2376}
2377
2378
2379/************************************************************************
2380 * *
2381 * The parser itself *
2382 * Relates to http://www.w3.org/TR/html40 *
2383 * *
2384 ************************************************************************/
2385
2386/************************************************************************
2387 * *
2388 * The parser itself *
2389 * *
2390 ************************************************************************/
2391
2392static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2393
2394/**
2395 * htmlParseHTMLName:
2396 * @ctxt: an HTML parser context
2397 *
2398 * parse an HTML tag or attribute name, note that we convert it to lowercase
2399 * since HTML names are not case-sensitive.
2400 *
2401 * Returns the Tag Name parsed or NULL
2402 */
2403
2404static const xmlChar *
2405htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2406 int i = 0;
2407 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2408
2409 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2410 (CUR != ':') && (CUR != '.')) return(NULL);
2411
2412 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2413 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2414 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2415 (CUR == '.'))) {
2416 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2417 else loc[i] = CUR;
2418 i++;
2419
2420 NEXT;
2421 }
2422
2423 return(xmlDictLookup(ctxt->dict, loc, i));
2424}
2425
2426
2427/**
2428 * htmlParseHTMLName_nonInvasive:
2429 * @ctxt: an HTML parser context
2430 *
2431 * parse an HTML tag or attribute name, note that we convert it to lowercase
2432 * since HTML names are not case-sensitive, this doesn't consume the data
2433 * from the stream, it's a look-ahead
2434 *
2435 * Returns the Tag Name parsed or NULL
2436 */
2437
2438static const xmlChar *
2439htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2440 int i = 0;
2441 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2442
2443 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2444 (NXT(1) != ':')) return(NULL);
2445
2446 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2447 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2448 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2449 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2450 else loc[i] = NXT(1+i);
2451 i++;
2452 }
2453
2454 return(xmlDictLookup(ctxt->dict, loc, i));
2455}
2456
2457
2458/**
2459 * htmlParseName:
2460 * @ctxt: an HTML parser context
2461 *
2462 * parse an HTML name, this routine is case sensitive.
2463 *
2464 * Returns the Name parsed or NULL
2465 */
2466
2467static const xmlChar *
2468htmlParseName(htmlParserCtxtPtr ctxt) {
2469 const xmlChar *in;
2470 const xmlChar *ret;
2471 int count = 0;
2472
2473 GROW;
2474
2475 /*
2476 * Accelerator for simple ASCII names
2477 */
2478 in = ctxt->input->cur;
2479 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2480 ((*in >= 0x41) && (*in <= 0x5A)) ||
2481 (*in == '_') || (*in == ':')) {
2482 in++;
2483 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2484 ((*in >= 0x41) && (*in <= 0x5A)) ||
2485 ((*in >= 0x30) && (*in <= 0x39)) ||
2486 (*in == '_') || (*in == '-') ||
2487 (*in == ':') || (*in == '.'))
2488 in++;
2489
2490 if (in == ctxt->input->end)
2491 return(NULL);
2492
2493 if ((*in > 0) && (*in < 0x80)) {
2494 count = in - ctxt->input->cur;
2495 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2496 ctxt->input->cur = in;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002497 ctxt->input->col += count;
2498 return(ret);
2499 }
2500 }
2501 return(htmlParseNameComplex(ctxt));
2502}
2503
2504static const xmlChar *
2505htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2506 int len = 0, l;
2507 int c;
2508 int count = 0;
2509 const xmlChar *base = ctxt->input->base;
2510
2511 /*
2512 * Handler for more complex cases
2513 */
2514 GROW;
2515 c = CUR_CHAR(l);
2516 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2517 (!IS_LETTER(c) && (c != '_') &&
2518 (c != ':'))) {
2519 return(NULL);
2520 }
2521
2522 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2523 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2524 (c == '.') || (c == '-') ||
2525 (c == '_') || (c == ':') ||
2526 (IS_COMBINING(c)) ||
2527 (IS_EXTENDER(c)))) {
2528 if (count++ > 100) {
2529 count = 0;
2530 GROW;
2531 }
2532 len += l;
2533 NEXTL(l);
2534 c = CUR_CHAR(l);
2535 if (ctxt->input->base != base) {
2536 /*
2537 * We changed encoding from an unknown encoding
2538 * Input buffer changed location, so we better start again
2539 */
2540 return(htmlParseNameComplex(ctxt));
2541 }
2542 }
2543
2544 if (ctxt->input->cur - ctxt->input->base < len) {
2545 /* Sanity check */
2546 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2547 "unexpected change of input buffer", NULL, NULL);
2548 return (NULL);
2549 }
2550
2551 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2552}
2553
2554
2555/**
2556 * htmlParseHTMLAttribute:
2557 * @ctxt: an HTML parser context
2558 * @stop: a char stop value
2559 *
2560 * parse an HTML attribute value till the stop (quote), if
2561 * stop is 0 then it stops at the first space
2562 *
2563 * Returns the attribute parsed or NULL
2564 */
2565
2566static xmlChar *
2567htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2568 xmlChar *buffer = NULL;
2569 int buffer_size = 0;
2570 xmlChar *out = NULL;
2571 const xmlChar *name = NULL;
2572 const xmlChar *cur = NULL;
2573 const htmlEntityDesc * ent;
2574
2575 /*
2576 * allocate a translation buffer.
2577 */
2578 buffer_size = HTML_PARSER_BUFFER_SIZE;
2579 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2580 if (buffer == NULL) {
2581 htmlErrMemory(ctxt, "buffer allocation failed\n");
2582 return(NULL);
2583 }
2584 out = buffer;
2585
2586 /*
2587 * Ok loop until we reach one of the ending chars
2588 */
2589 while ((CUR != 0) && (CUR != stop)) {
2590 if ((stop == 0) && (CUR == '>')) break;
2591 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2592 if (CUR == '&') {
2593 if (NXT(1) == '#') {
2594 unsigned int c;
2595 int bits;
2596
2597 c = htmlParseCharRef(ctxt);
2598 if (c < 0x80)
2599 { *out++ = c; bits= -6; }
2600 else if (c < 0x800)
2601 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2602 else if (c < 0x10000)
2603 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2604 else
2605 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2606
2607 for ( ; bits >= 0; bits-= 6) {
2608 *out++ = ((c >> bits) & 0x3F) | 0x80;
2609 }
2610
2611 if (out - buffer > buffer_size - 100) {
2612 int indx = out - buffer;
2613
2614 growBuffer(buffer);
2615 out = &buffer[indx];
2616 }
2617 } else {
2618 ent = htmlParseEntityRef(ctxt, &name);
2619 if (name == NULL) {
2620 *out++ = '&';
2621 if (out - buffer > buffer_size - 100) {
2622 int indx = out - buffer;
2623
2624 growBuffer(buffer);
2625 out = &buffer[indx];
2626 }
2627 } else if (ent == NULL) {
2628 *out++ = '&';
2629 cur = name;
2630 while (*cur != 0) {
2631 if (out - buffer > buffer_size - 100) {
2632 int indx = out - buffer;
2633
2634 growBuffer(buffer);
2635 out = &buffer[indx];
2636 }
2637 *out++ = *cur++;
2638 }
2639 } else {
2640 unsigned int c;
2641 int bits;
2642
2643 if (out - buffer > buffer_size - 100) {
2644 int indx = out - buffer;
2645
2646 growBuffer(buffer);
2647 out = &buffer[indx];
2648 }
2649 c = ent->value;
2650 if (c < 0x80)
2651 { *out++ = c; bits= -6; }
2652 else if (c < 0x800)
2653 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2654 else if (c < 0x10000)
2655 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2656 else
2657 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2658
2659 for ( ; bits >= 0; bits-= 6) {
2660 *out++ = ((c >> bits) & 0x3F) | 0x80;
2661 }
2662 }
2663 }
2664 } else {
2665 unsigned int c;
2666 int bits, l;
2667
2668 if (out - buffer > buffer_size - 100) {
2669 int indx = out - buffer;
2670
2671 growBuffer(buffer);
2672 out = &buffer[indx];
2673 }
2674 c = CUR_CHAR(l);
2675 if (c < 0x80)
2676 { *out++ = c; bits= -6; }
2677 else if (c < 0x800)
2678 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2679 else if (c < 0x10000)
2680 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2681 else
2682 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2683
2684 for ( ; bits >= 0; bits-= 6) {
2685 *out++ = ((c >> bits) & 0x3F) | 0x80;
2686 }
2687 NEXT;
2688 }
2689 }
2690 *out = 0;
2691 return(buffer);
2692}
2693
2694/**
2695 * htmlParseEntityRef:
2696 * @ctxt: an HTML parser context
2697 * @str: location to store the entity name
2698 *
2699 * parse an HTML ENTITY references
2700 *
2701 * [68] EntityRef ::= '&' Name ';'
2702 *
2703 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2704 * if non-NULL *str will have to be freed by the caller.
2705 */
2706const htmlEntityDesc *
2707htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2708 const xmlChar *name;
2709 const htmlEntityDesc * ent = NULL;
2710
2711 if (str != NULL) *str = NULL;
2712 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2713
2714 if (CUR == '&') {
2715 NEXT;
2716 name = htmlParseName(ctxt);
2717 if (name == NULL) {
2718 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2719 "htmlParseEntityRef: no name\n", NULL, NULL);
2720 } else {
2721 GROW;
2722 if (CUR == ';') {
2723 if (str != NULL)
2724 *str = name;
2725
2726 /*
2727 * Lookup the entity in the table.
2728 */
2729 ent = htmlEntityLookup(name);
2730 if (ent != NULL) /* OK that's ugly !!! */
2731 NEXT;
2732 } else {
2733 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2734 "htmlParseEntityRef: expecting ';'\n",
2735 NULL, NULL);
2736 if (str != NULL)
2737 *str = name;
2738 }
2739 }
2740 }
2741 return(ent);
2742}
2743
2744/**
2745 * htmlParseAttValue:
2746 * @ctxt: an HTML parser context
2747 *
2748 * parse a value for an attribute
2749 * Note: the parser won't do substitution of entities here, this
2750 * will be handled later in xmlStringGetNodeList, unless it was
2751 * asked for ctxt->replaceEntities != 0
2752 *
2753 * Returns the AttValue parsed or NULL.
2754 */
2755
2756static xmlChar *
2757htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2758 xmlChar *ret = NULL;
2759
2760 if (CUR == '"') {
2761 NEXT;
2762 ret = htmlParseHTMLAttribute(ctxt, '"');
2763 if (CUR != '"') {
2764 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2765 "AttValue: \" expected\n", NULL, NULL);
2766 } else
2767 NEXT;
2768 } else if (CUR == '\'') {
2769 NEXT;
2770 ret = htmlParseHTMLAttribute(ctxt, '\'');
2771 if (CUR != '\'') {
2772 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2773 "AttValue: ' expected\n", NULL, NULL);
2774 } else
2775 NEXT;
2776 } else {
2777 /*
2778 * That's an HTMLism, the attribute value may not be quoted
2779 */
2780 ret = htmlParseHTMLAttribute(ctxt, 0);
2781 if (ret == NULL) {
2782 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2783 "AttValue: no value found\n", NULL, NULL);
2784 }
2785 }
2786 return(ret);
2787}
2788
2789/**
2790 * htmlParseSystemLiteral:
2791 * @ctxt: an HTML parser context
2792 *
2793 * parse an HTML Literal
2794 *
2795 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2796 *
2797 * Returns the SystemLiteral parsed or NULL
2798 */
2799
2800static xmlChar *
2801htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2802 size_t len = 0, startPosition = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002803 int err = 0;
2804 int quote;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002805 xmlChar *ret = NULL;
2806
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002807 if ((CUR != '"') && (CUR != '\'')) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002808 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002809 "SystemLiteral \" or ' expected\n", NULL, NULL);
2810 return(NULL);
2811 }
2812 quote = CUR;
2813 NEXT;
2814
2815 if (CUR_PTR < BASE_PTR)
2816 return(ret);
2817 startPosition = CUR_PTR - BASE_PTR;
2818
2819 while ((CUR != 0) && (CUR != quote)) {
2820 /* TODO: Handle UTF-8 */
2821 if (!IS_CHAR_CH(CUR)) {
2822 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2823 "Invalid char in SystemLiteral 0x%X\n", CUR);
2824 err = 1;
2825 }
2826 NEXT;
2827 len++;
2828 }
2829 if (CUR != quote) {
2830 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2831 "Unfinished SystemLiteral\n", NULL, NULL);
2832 } else {
2833 NEXT;
2834 if (err == 0)
2835 ret = xmlStrndup((BASE_PTR+startPosition), len);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002836 }
2837
2838 return(ret);
2839}
2840
2841/**
2842 * htmlParsePubidLiteral:
2843 * @ctxt: an HTML parser context
2844 *
2845 * parse an HTML public literal
2846 *
2847 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2848 *
2849 * Returns the PubidLiteral parsed or NULL.
2850 */
2851
2852static xmlChar *
2853htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2854 size_t len = 0, startPosition = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002855 int err = 0;
2856 int quote;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002857 xmlChar *ret = NULL;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002858
2859 if ((CUR != '"') && (CUR != '\'')) {
2860 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2861 "PubidLiteral \" or ' expected\n", NULL, NULL);
2862 return(NULL);
2863 }
2864 quote = CUR;
2865 NEXT;
2866
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002867 /*
2868 * Name ::= (Letter | '_') (NameChar)*
2869 */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002870 if (CUR_PTR < BASE_PTR)
2871 return(ret);
2872 startPosition = CUR_PTR - BASE_PTR;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002873
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002874 while ((CUR != 0) && (CUR != quote)) {
2875 if (!IS_PUBIDCHAR_CH(CUR)) {
2876 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2877 "Invalid char in PubidLiteral 0x%X\n", CUR);
2878 err = 1;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002879 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002880 len++;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002881 NEXT;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002882 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002883
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002884 if (CUR != '"') {
2885 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2886 "Unfinished PubidLiteral\n", NULL, NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002887 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002888 NEXT;
2889 if (err == 0)
2890 ret = xmlStrndup((BASE_PTR + startPosition), len);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002891 }
2892
2893 return(ret);
2894}
2895
2896/**
2897 * htmlParseScript:
2898 * @ctxt: an HTML parser context
2899 *
2900 * parse the content of an HTML SCRIPT or STYLE element
2901 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2902 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2903 * http://www.w3.org/TR/html4/types.html#type-script
2904 * http://www.w3.org/TR/html4/types.html#h-6.15
2905 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2906 *
2907 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2908 * element and the value of intrinsic event attributes. User agents must
2909 * not evaluate script data as HTML markup but instead must pass it on as
2910 * data to a script engine.
2911 * NOTES:
2912 * - The content is passed like CDATA
2913 * - the attributes for style and scripting "onXXX" are also described
2914 * as CDATA but SGML allows entities references in attributes so their
2915 * processing is identical as other attributes
2916 */
2917static void
2918htmlParseScript(htmlParserCtxtPtr ctxt) {
2919 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2920 int nbchar = 0;
2921 int cur,l;
2922
2923 SHRINK;
2924 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002925 while (cur != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002926 if ((cur == '<') && (NXT(1) == '/')) {
2927 /*
2928 * One should break here, the specification is clear:
2929 * Authors should therefore escape "</" within the content.
2930 * Escape mechanisms are specific to each scripting or
2931 * style sheet language.
2932 *
2933 * In recovery mode, only break if end tag match the
2934 * current tag, effectively ignoring all tags inside the
2935 * script/style block and treating the entire block as
2936 * CDATA.
2937 */
2938 if (ctxt->recovery) {
2939 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2940 xmlStrlen(ctxt->name)) == 0)
2941 {
2942 break; /* while */
2943 } else {
2944 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2945 "Element %s embeds close tag\n",
2946 ctxt->name, NULL);
2947 }
2948 } else {
2949 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2950 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2951 {
2952 break; /* while */
2953 }
2954 }
2955 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002956 if (IS_CHAR(cur)) {
2957 COPY_BUF(l,buf,nbchar,cur);
2958 } else {
2959 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2960 "Invalid char in CDATA 0x%X\n", cur);
2961 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002962 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002963 buf[nbchar] = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002964 if (ctxt->sax->cdataBlock!= NULL) {
2965 /*
2966 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2967 */
2968 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2969 } else if (ctxt->sax->characters != NULL) {
2970 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2971 }
2972 nbchar = 0;
2973 }
2974 GROW;
2975 NEXTL(l);
2976 cur = CUR_CHAR(l);
2977 }
2978
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002979 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002980 buf[nbchar] = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002981 if (ctxt->sax->cdataBlock!= NULL) {
2982 /*
2983 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2984 */
2985 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2986 } else if (ctxt->sax->characters != NULL) {
2987 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2988 }
2989 }
2990}
2991
2992
2993/**
2994 * htmlParseCharDataInternal:
2995 * @ctxt: an HTML parser context
2996 * @readahead: optional read ahead character in ascii range
2997 *
2998 * parse a CharData section.
2999 * if we are within a CDATA section ']]>' marks an end of section.
3000 *
3001 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3002 */
3003
3004static void
3005htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3006 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3007 int nbchar = 0;
3008 int cur, l;
3009 int chunk = 0;
3010
3011 if (readahead)
3012 buf[nbchar++] = readahead;
3013
3014 SHRINK;
3015 cur = CUR_CHAR(l);
3016 while (((cur != '<') || (ctxt->token == '<')) &&
3017 ((cur != '&') || (ctxt->token == '&')) &&
3018 (cur != 0)) {
3019 if (!(IS_CHAR(cur))) {
3020 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3021 "Invalid char in CDATA 0x%X\n", cur);
3022 } else {
3023 COPY_BUF(l,buf,nbchar,cur);
3024 }
3025 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003026 buf[nbchar] = 0;
3027
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003028 /*
3029 * Ok the segment is to be consumed as chars.
3030 */
3031 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3032 if (areBlanks(ctxt, buf, nbchar)) {
3033 if (ctxt->keepBlanks) {
3034 if (ctxt->sax->characters != NULL)
3035 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3036 } else {
3037 if (ctxt->sax->ignorableWhitespace != NULL)
3038 ctxt->sax->ignorableWhitespace(ctxt->userData,
3039 buf, nbchar);
3040 }
3041 } else {
3042 htmlCheckParagraph(ctxt);
3043 if (ctxt->sax->characters != NULL)
3044 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3045 }
3046 }
3047 nbchar = 0;
3048 }
3049 NEXTL(l);
3050 chunk++;
3051 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3052 chunk = 0;
3053 SHRINK;
3054 GROW;
3055 }
3056 cur = CUR_CHAR(l);
3057 if (cur == 0) {
3058 SHRINK;
3059 GROW;
3060 cur = CUR_CHAR(l);
3061 }
3062 }
3063 if (nbchar != 0) {
3064 buf[nbchar] = 0;
3065
3066 /*
3067 * Ok the segment is to be consumed as chars.
3068 */
3069 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3070 if (areBlanks(ctxt, buf, nbchar)) {
3071 if (ctxt->keepBlanks) {
3072 if (ctxt->sax->characters != NULL)
3073 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3074 } else {
3075 if (ctxt->sax->ignorableWhitespace != NULL)
3076 ctxt->sax->ignorableWhitespace(ctxt->userData,
3077 buf, nbchar);
3078 }
3079 } else {
3080 htmlCheckParagraph(ctxt);
3081 if (ctxt->sax->characters != NULL)
3082 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3083 }
3084 }
3085 } else {
3086 /*
3087 * Loop detection
3088 */
3089 if (cur == 0)
3090 ctxt->instate = XML_PARSER_EOF;
3091 }
3092}
3093
3094/**
3095 * htmlParseCharData:
3096 * @ctxt: an HTML parser context
3097 *
3098 * parse a CharData section.
3099 * if we are within a CDATA section ']]>' marks an end of section.
3100 *
3101 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3102 */
3103
3104static void
3105htmlParseCharData(htmlParserCtxtPtr ctxt) {
3106 htmlParseCharDataInternal(ctxt, 0);
3107}
3108
3109/**
3110 * htmlParseExternalID:
3111 * @ctxt: an HTML parser context
3112 * @publicID: a xmlChar** receiving PubidLiteral
3113 *
3114 * Parse an External ID or a Public ID
3115 *
3116 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3117 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3118 *
3119 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3120 *
3121 * Returns the function returns SystemLiteral and in the second
3122 * case publicID receives PubidLiteral, is strict is off
3123 * it is possible to return NULL and have publicID set.
3124 */
3125
3126static xmlChar *
3127htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3128 xmlChar *URI = NULL;
3129
3130 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3131 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3132 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3133 SKIP(6);
3134 if (!IS_BLANK_CH(CUR)) {
3135 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3136 "Space required after 'SYSTEM'\n", NULL, NULL);
3137 }
3138 SKIP_BLANKS;
3139 URI = htmlParseSystemLiteral(ctxt);
3140 if (URI == NULL) {
3141 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3142 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3143 }
3144 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3145 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3146 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3147 SKIP(6);
3148 if (!IS_BLANK_CH(CUR)) {
3149 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3150 "Space required after 'PUBLIC'\n", NULL, NULL);
3151 }
3152 SKIP_BLANKS;
3153 *publicID = htmlParsePubidLiteral(ctxt);
3154 if (*publicID == NULL) {
3155 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3156 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3157 NULL, NULL);
3158 }
3159 SKIP_BLANKS;
3160 if ((CUR == '"') || (CUR == '\'')) {
3161 URI = htmlParseSystemLiteral(ctxt);
3162 }
3163 }
3164 return(URI);
3165}
3166
3167/**
3168 * xmlParsePI:
3169 * @ctxt: an XML parser context
3170 *
3171 * parse an XML Processing Instruction.
3172 *
3173 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3174 */
3175static void
3176htmlParsePI(htmlParserCtxtPtr ctxt) {
3177 xmlChar *buf = NULL;
3178 int len = 0;
3179 int size = HTML_PARSER_BUFFER_SIZE;
3180 int cur, l;
3181 const xmlChar *target;
3182 xmlParserInputState state;
3183 int count = 0;
3184
3185 if ((RAW == '<') && (NXT(1) == '?')) {
3186 state = ctxt->instate;
3187 ctxt->instate = XML_PARSER_PI;
3188 /*
3189 * this is a Processing Instruction.
3190 */
3191 SKIP(2);
3192 SHRINK;
3193
3194 /*
3195 * Parse the target name and check for special support like
3196 * namespace.
3197 */
3198 target = htmlParseName(ctxt);
3199 if (target != NULL) {
3200 if (RAW == '>') {
3201 SKIP(1);
3202
3203 /*
3204 * SAX: PI detected.
3205 */
3206 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3207 (ctxt->sax->processingInstruction != NULL))
3208 ctxt->sax->processingInstruction(ctxt->userData,
3209 target, NULL);
3210 ctxt->instate = state;
3211 return;
3212 }
3213 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3214 if (buf == NULL) {
3215 htmlErrMemory(ctxt, NULL);
3216 ctxt->instate = state;
3217 return;
3218 }
3219 cur = CUR;
3220 if (!IS_BLANK(cur)) {
3221 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3222 "ParsePI: PI %s space expected\n", target, NULL);
3223 }
3224 SKIP_BLANKS;
3225 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003226 while ((cur != 0) && (cur != '>')) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003227 if (len + 5 >= size) {
3228 xmlChar *tmp;
3229
3230 size *= 2;
3231 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3232 if (tmp == NULL) {
3233 htmlErrMemory(ctxt, NULL);
3234 xmlFree(buf);
3235 ctxt->instate = state;
3236 return;
3237 }
3238 buf = tmp;
3239 }
3240 count++;
3241 if (count > 50) {
3242 GROW;
3243 count = 0;
3244 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003245 if (IS_CHAR(cur)) {
3246 COPY_BUF(l,buf,len,cur);
3247 } else {
3248 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3249 "Invalid char in processing instruction "
3250 "0x%X\n", cur);
3251 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003252 NEXTL(l);
3253 cur = CUR_CHAR(l);
3254 if (cur == 0) {
3255 SHRINK;
3256 GROW;
3257 cur = CUR_CHAR(l);
3258 }
3259 }
3260 buf[len] = 0;
3261 if (cur != '>') {
3262 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3263 "ParsePI: PI %s never end ...\n", target, NULL);
3264 } else {
3265 SKIP(1);
3266
3267 /*
3268 * SAX: PI detected.
3269 */
3270 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3271 (ctxt->sax->processingInstruction != NULL))
3272 ctxt->sax->processingInstruction(ctxt->userData,
3273 target, buf);
3274 }
3275 xmlFree(buf);
3276 } else {
3277 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3278 "PI is not started correctly", NULL, NULL);
3279 }
3280 ctxt->instate = state;
3281 }
3282}
3283
3284/**
3285 * htmlParseComment:
3286 * @ctxt: an HTML parser context
3287 *
3288 * Parse an XML (SGML) comment <!-- .... -->
3289 *
3290 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3291 */
3292static void
3293htmlParseComment(htmlParserCtxtPtr ctxt) {
3294 xmlChar *buf = NULL;
3295 int len;
3296 int size = HTML_PARSER_BUFFER_SIZE;
3297 int q, ql;
3298 int r, rl;
3299 int cur, l;
Haibo Huangd75f3892021-01-05 21:34:50 -08003300 int next, nl;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003301 xmlParserInputState state;
3302
3303 /*
3304 * Check that there is a comment right here.
3305 */
3306 if ((RAW != '<') || (NXT(1) != '!') ||
3307 (NXT(2) != '-') || (NXT(3) != '-')) return;
3308
3309 state = ctxt->instate;
3310 ctxt->instate = XML_PARSER_COMMENT;
3311 SHRINK;
3312 SKIP(4);
3313 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3314 if (buf == NULL) {
3315 htmlErrMemory(ctxt, "buffer allocation failed\n");
3316 ctxt->instate = state;
3317 return;
3318 }
3319 len = 0;
3320 buf[len] = 0;
3321 q = CUR_CHAR(ql);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003322 if (q == 0)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003323 goto unfinished;
3324 NEXTL(ql);
3325 r = CUR_CHAR(rl);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003326 if (r == 0)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003327 goto unfinished;
3328 NEXTL(rl);
3329 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003330 while ((cur != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003331 ((cur != '>') ||
3332 (r != '-') || (q != '-'))) {
Haibo Huangd75f3892021-01-05 21:34:50 -08003333 NEXTL(l);
3334 next = CUR_CHAR(nl);
3335 if (next == 0) {
3336 SHRINK;
3337 GROW;
3338 next = CUR_CHAR(nl);
3339 }
3340
3341 if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3342 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3343 "Comment incorrectly closed by '--!>'", NULL, NULL);
3344 cur = '>';
3345 break;
3346 }
3347
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003348 if (len + 5 >= size) {
3349 xmlChar *tmp;
3350
3351 size *= 2;
3352 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3353 if (tmp == NULL) {
3354 xmlFree(buf);
3355 htmlErrMemory(ctxt, "growing buffer failed\n");
3356 ctxt->instate = state;
3357 return;
3358 }
3359 buf = tmp;
3360 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003361 if (IS_CHAR(q)) {
3362 COPY_BUF(ql,buf,len,q);
3363 } else {
3364 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3365 "Invalid char in comment 0x%X\n", q);
3366 }
Haibo Huangd75f3892021-01-05 21:34:50 -08003367
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003368 q = r;
3369 ql = rl;
3370 r = cur;
3371 rl = l;
Haibo Huangd75f3892021-01-05 21:34:50 -08003372 cur = next;
3373 l = nl;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003374 }
3375 buf[len] = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003376 if (cur == '>') {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003377 NEXT;
3378 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3379 (!ctxt->disableSAX))
3380 ctxt->sax->comment(ctxt->userData, buf);
3381 xmlFree(buf);
3382 ctxt->instate = state;
3383 return;
3384 }
3385
3386unfinished:
3387 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3388 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3389 xmlFree(buf);
3390}
3391
3392/**
3393 * htmlParseCharRef:
3394 * @ctxt: an HTML parser context
3395 *
3396 * parse Reference declarations
3397 *
3398 * [66] CharRef ::= '&#' [0-9]+ ';' |
3399 * '&#x' [0-9a-fA-F]+ ';'
3400 *
3401 * Returns the value parsed (as an int)
3402 */
3403int
3404htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3405 int val = 0;
3406
3407 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3408 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3409 "htmlParseCharRef: context error\n",
3410 NULL, NULL);
3411 return(0);
3412 }
3413 if ((CUR == '&') && (NXT(1) == '#') &&
3414 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3415 SKIP(3);
3416 while (CUR != ';') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003417 if ((CUR >= '0') && (CUR <= '9')) {
3418 if (val < 0x110000)
3419 val = val * 16 + (CUR - '0');
3420 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3421 if (val < 0x110000)
3422 val = val * 16 + (CUR - 'a') + 10;
3423 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3424 if (val < 0x110000)
3425 val = val * 16 + (CUR - 'A') + 10;
3426 } else {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003427 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3428 "htmlParseCharRef: missing semicolon\n",
3429 NULL, NULL);
3430 break;
3431 }
3432 NEXT;
3433 }
3434 if (CUR == ';')
3435 NEXT;
3436 } else if ((CUR == '&') && (NXT(1) == '#')) {
3437 SKIP(2);
3438 while (CUR != ';') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003439 if ((CUR >= '0') && (CUR <= '9')) {
3440 if (val < 0x110000)
3441 val = val * 10 + (CUR - '0');
3442 } else {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003443 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3444 "htmlParseCharRef: missing semicolon\n",
3445 NULL, NULL);
3446 break;
3447 }
3448 NEXT;
3449 }
3450 if (CUR == ';')
3451 NEXT;
3452 } else {
3453 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3454 "htmlParseCharRef: invalid value\n", NULL, NULL);
3455 }
3456 /*
3457 * Check the value IS_CHAR ...
3458 */
3459 if (IS_CHAR(val)) {
3460 return(val);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003461 } else if (val >= 0x110000) {
3462 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3463 "htmlParseCharRef: value too large\n", NULL, NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003464 } else {
3465 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3466 "htmlParseCharRef: invalid xmlChar value %d\n",
3467 val);
3468 }
3469 return(0);
3470}
3471
3472
3473/**
3474 * htmlParseDocTypeDecl:
3475 * @ctxt: an HTML parser context
3476 *
3477 * parse a DOCTYPE declaration
3478 *
3479 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3480 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3481 */
3482
3483static void
3484htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3485 const xmlChar *name;
3486 xmlChar *ExternalID = NULL;
3487 xmlChar *URI = NULL;
3488
3489 /*
3490 * We know that '<!DOCTYPE' has been detected.
3491 */
3492 SKIP(9);
3493
3494 SKIP_BLANKS;
3495
3496 /*
3497 * Parse the DOCTYPE name.
3498 */
3499 name = htmlParseName(ctxt);
3500 if (name == NULL) {
3501 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3502 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3503 NULL, NULL);
3504 }
3505 /*
3506 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3507 */
3508
3509 SKIP_BLANKS;
3510
3511 /*
3512 * Check for SystemID and ExternalID
3513 */
3514 URI = htmlParseExternalID(ctxt, &ExternalID);
3515 SKIP_BLANKS;
3516
3517 /*
3518 * We should be at the end of the DOCTYPE declaration.
3519 */
3520 if (CUR != '>') {
3521 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3522 "DOCTYPE improperly terminated\n", NULL, NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003523 /* Ignore bogus content */
3524 while ((CUR != 0) && (CUR != '>'))
3525 NEXT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003526 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003527 if (CUR == '>')
3528 NEXT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003529
3530 /*
3531 * Create or update the document accordingly to the DOCTYPE
3532 */
3533 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3534 (!ctxt->disableSAX))
3535 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3536
3537 /*
3538 * Cleanup, since we don't use all those identifiers
3539 */
3540 if (URI != NULL) xmlFree(URI);
3541 if (ExternalID != NULL) xmlFree(ExternalID);
3542}
3543
3544/**
3545 * htmlParseAttribute:
3546 * @ctxt: an HTML parser context
3547 * @value: a xmlChar ** used to store the value of the attribute
3548 *
3549 * parse an attribute
3550 *
3551 * [41] Attribute ::= Name Eq AttValue
3552 *
3553 * [25] Eq ::= S? '=' S?
3554 *
3555 * With namespace:
3556 *
3557 * [NS 11] Attribute ::= QName Eq AttValue
3558 *
3559 * Also the case QName == xmlns:??? is handled independently as a namespace
3560 * definition.
3561 *
3562 * Returns the attribute name, and the value in *value.
3563 */
3564
3565static const xmlChar *
3566htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3567 const xmlChar *name;
3568 xmlChar *val = NULL;
3569
3570 *value = NULL;
3571 name = htmlParseHTMLName(ctxt);
3572 if (name == NULL) {
3573 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3574 "error parsing attribute name\n", NULL, NULL);
3575 return(NULL);
3576 }
3577
3578 /*
3579 * read the value
3580 */
3581 SKIP_BLANKS;
3582 if (CUR == '=') {
3583 NEXT;
3584 SKIP_BLANKS;
3585 val = htmlParseAttValue(ctxt);
3586 }
3587
3588 *value = val;
3589 return(name);
3590}
3591
3592/**
3593 * htmlCheckEncodingDirect:
3594 * @ctxt: an HTML parser context
3595 * @attvalue: the attribute value
3596 *
3597 * Checks an attribute value to detect
3598 * the encoding
3599 * If a new encoding is detected the parser is switched to decode
3600 * it and pass UTF8
3601 */
3602static void
3603htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3604
3605 if ((ctxt == NULL) || (encoding == NULL) ||
3606 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3607 return;
3608
3609 /* do not change encoding */
3610 if (ctxt->input->encoding != NULL)
3611 return;
3612
3613 if (encoding != NULL) {
3614 xmlCharEncoding enc;
3615 xmlCharEncodingHandlerPtr handler;
3616
3617 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3618
3619 if (ctxt->input->encoding != NULL)
3620 xmlFree((xmlChar *) ctxt->input->encoding);
3621 ctxt->input->encoding = xmlStrdup(encoding);
3622
3623 enc = xmlParseCharEncoding((const char *) encoding);
3624 /*
3625 * registered set of known encodings
3626 */
3627 if (enc != XML_CHAR_ENCODING_ERROR) {
3628 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3629 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3630 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3631 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3632 (ctxt->input->buf != NULL) &&
3633 (ctxt->input->buf->encoder == NULL)) {
3634 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3635 "htmlCheckEncoding: wrong encoding meta\n",
3636 NULL, NULL);
3637 } else {
3638 xmlSwitchEncoding(ctxt, enc);
3639 }
3640 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3641 } else {
3642 /*
3643 * fallback for unknown encodings
3644 */
3645 handler = xmlFindCharEncodingHandler((const char *) encoding);
3646 if (handler != NULL) {
3647 xmlSwitchToEncoding(ctxt, handler);
3648 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3649 } else {
3650 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3651 "htmlCheckEncoding: unknown encoding %s\n",
3652 encoding, NULL);
3653 }
3654 }
3655
3656 if ((ctxt->input->buf != NULL) &&
3657 (ctxt->input->buf->encoder != NULL) &&
3658 (ctxt->input->buf->raw != NULL) &&
3659 (ctxt->input->buf->buffer != NULL)) {
3660 int nbchars;
3661 int processed;
3662
3663 /*
3664 * convert as much as possible to the parser reading buffer.
3665 */
3666 processed = ctxt->input->cur - ctxt->input->base;
3667 xmlBufShrink(ctxt->input->buf->buffer, processed);
3668 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3669 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3670 if (nbchars < 0) {
3671 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3672 "htmlCheckEncoding: encoder error\n",
3673 NULL, NULL);
3674 }
3675 }
3676 }
3677}
3678
3679/**
3680 * htmlCheckEncoding:
3681 * @ctxt: an HTML parser context
3682 * @attvalue: the attribute value
3683 *
3684 * Checks an http-equiv attribute from a Meta tag to detect
3685 * the encoding
3686 * If a new encoding is detected the parser is switched to decode
3687 * it and pass UTF8
3688 */
3689static void
3690htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3691 const xmlChar *encoding;
3692
3693 if (!attvalue)
3694 return;
3695
3696 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3697 if (encoding != NULL) {
3698 encoding += 7;
3699 }
3700 /*
3701 * skip blank
3702 */
3703 if (encoding && IS_BLANK_CH(*encoding))
3704 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3705 if (encoding && *encoding == '=') {
3706 encoding ++;
3707 htmlCheckEncodingDirect(ctxt, encoding);
3708 }
3709}
3710
3711/**
3712 * htmlCheckMeta:
3713 * @ctxt: an HTML parser context
3714 * @atts: the attributes values
3715 *
3716 * Checks an attributes from a Meta tag
3717 */
3718static void
3719htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3720 int i;
3721 const xmlChar *att, *value;
3722 int http = 0;
3723 const xmlChar *content = NULL;
3724
3725 if ((ctxt == NULL) || (atts == NULL))
3726 return;
3727
3728 i = 0;
3729 att = atts[i++];
3730 while (att != NULL) {
3731 value = atts[i++];
3732 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3733 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3734 http = 1;
3735 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3736 htmlCheckEncodingDirect(ctxt, value);
3737 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3738 content = value;
3739 att = atts[i++];
3740 }
3741 if ((http) && (content != NULL))
3742 htmlCheckEncoding(ctxt, content);
3743
3744}
3745
3746/**
3747 * htmlParseStartTag:
3748 * @ctxt: an HTML parser context
3749 *
3750 * parse a start of tag either for rule element or
3751 * EmptyElement. In both case we don't parse the tag closing chars.
3752 *
3753 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3754 *
3755 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3756 *
3757 * With namespace:
3758 *
3759 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3760 *
3761 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3762 *
3763 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3764 */
3765
3766static int
3767htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3768 const xmlChar *name;
3769 const xmlChar *attname;
3770 xmlChar *attvalue;
3771 const xmlChar **atts;
3772 int nbatts = 0;
3773 int maxatts;
3774 int meta = 0;
3775 int i;
3776 int discardtag = 0;
3777
3778 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3779 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3780 "htmlParseStartTag: context error\n", NULL, NULL);
3781 return -1;
3782 }
3783 if (ctxt->instate == XML_PARSER_EOF)
3784 return(-1);
3785 if (CUR != '<') return -1;
3786 NEXT;
3787
3788 atts = ctxt->atts;
3789 maxatts = ctxt->maxatts;
3790
3791 GROW;
3792 name = htmlParseHTMLName(ctxt);
3793 if (name == NULL) {
3794 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3795 "htmlParseStartTag: invalid element name\n",
3796 NULL, NULL);
3797 /* if recover preserve text on classic misconstructs */
3798 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3799 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3800 htmlParseCharDataInternal(ctxt, '<');
3801 return(-1);
3802 }
3803
3804
3805 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003806 while ((CUR != 0) && (CUR != '>') &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003807 (ctxt->instate != XML_PARSER_EOF))
3808 NEXT;
3809 return -1;
3810 }
3811 if (xmlStrEqual(name, BAD_CAST"meta"))
3812 meta = 1;
3813
3814 /*
3815 * Check for auto-closure of HTML elements.
3816 */
3817 htmlAutoClose(ctxt, name);
3818
3819 /*
3820 * Check for implied HTML elements.
3821 */
3822 htmlCheckImplied(ctxt, name);
3823
3824 /*
3825 * Avoid html at any level > 0, head at any level != 1
3826 * or any attempt to recurse body
3827 */
3828 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3829 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3830 "htmlParseStartTag: misplaced <html> tag\n",
3831 name, NULL);
3832 discardtag = 1;
3833 ctxt->depth++;
3834 }
3835 if ((ctxt->nameNr != 1) &&
3836 (xmlStrEqual(name, BAD_CAST"head"))) {
3837 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3838 "htmlParseStartTag: misplaced <head> tag\n",
3839 name, NULL);
3840 discardtag = 1;
3841 ctxt->depth++;
3842 }
3843 if (xmlStrEqual(name, BAD_CAST"body")) {
3844 int indx;
3845 for (indx = 0;indx < ctxt->nameNr;indx++) {
3846 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3847 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3848 "htmlParseStartTag: misplaced <body> tag\n",
3849 name, NULL);
3850 discardtag = 1;
3851 ctxt->depth++;
3852 }
3853 }
3854 }
3855
3856 /*
3857 * Now parse the attributes, it ends up with the ending
3858 *
3859 * (S Attribute)* S?
3860 */
3861 SKIP_BLANKS;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003862 while ((CUR != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003863 (CUR != '>') &&
3864 ((CUR != '/') || (NXT(1) != '>'))) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003865 GROW;
3866 attname = htmlParseAttribute(ctxt, &attvalue);
3867 if (attname != NULL) {
3868
3869 /*
3870 * Well formedness requires at most one declaration of an attribute
3871 */
3872 for (i = 0; i < nbatts;i += 2) {
3873 if (xmlStrEqual(atts[i], attname)) {
3874 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3875 "Attribute %s redefined\n", attname, NULL);
3876 if (attvalue != NULL)
3877 xmlFree(attvalue);
3878 goto failed;
3879 }
3880 }
3881
3882 /*
3883 * Add the pair to atts
3884 */
3885 if (atts == NULL) {
3886 maxatts = 22; /* allow for 10 attrs by default */
3887 atts = (const xmlChar **)
3888 xmlMalloc(maxatts * sizeof(xmlChar *));
3889 if (atts == NULL) {
3890 htmlErrMemory(ctxt, NULL);
3891 if (attvalue != NULL)
3892 xmlFree(attvalue);
3893 goto failed;
3894 }
3895 ctxt->atts = atts;
3896 ctxt->maxatts = maxatts;
3897 } else if (nbatts + 4 > maxatts) {
3898 const xmlChar **n;
3899
3900 maxatts *= 2;
3901 n = (const xmlChar **) xmlRealloc((void *) atts,
3902 maxatts * sizeof(const xmlChar *));
3903 if (n == NULL) {
3904 htmlErrMemory(ctxt, NULL);
3905 if (attvalue != NULL)
3906 xmlFree(attvalue);
3907 goto failed;
3908 }
3909 atts = n;
3910 ctxt->atts = atts;
3911 ctxt->maxatts = maxatts;
3912 }
3913 atts[nbatts++] = attname;
3914 atts[nbatts++] = attvalue;
3915 atts[nbatts] = NULL;
3916 atts[nbatts + 1] = NULL;
3917 }
3918 else {
3919 if (attvalue != NULL)
3920 xmlFree(attvalue);
3921 /* Dump the bogus attribute string up to the next blank or
3922 * the end of the tag. */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003923 while ((CUR != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003924 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3925 ((CUR != '/') || (NXT(1) != '>')))
3926 NEXT;
3927 }
3928
3929failed:
3930 SKIP_BLANKS;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003931 }
3932
3933 /*
3934 * Handle specific association to the META tag
3935 */
3936 if (meta && (nbatts != 0))
3937 htmlCheckMeta(ctxt, atts);
3938
3939 /*
3940 * SAX: Start of Element !
3941 */
3942 if (!discardtag) {
3943 htmlnamePush(ctxt, name);
3944 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3945 if (nbatts != 0)
3946 ctxt->sax->startElement(ctxt->userData, name, atts);
3947 else
3948 ctxt->sax->startElement(ctxt->userData, name, NULL);
3949 }
3950 }
3951
3952 if (atts != NULL) {
3953 for (i = 1;i < nbatts;i += 2) {
3954 if (atts[i] != NULL)
3955 xmlFree((xmlChar *) atts[i]);
3956 }
3957 }
3958
3959 return(discardtag);
3960}
3961
3962/**
3963 * htmlParseEndTag:
3964 * @ctxt: an HTML parser context
3965 *
3966 * parse an end of tag
3967 *
3968 * [42] ETag ::= '</' Name S? '>'
3969 *
3970 * With namespace
3971 *
3972 * [NS 9] ETag ::= '</' QName S? '>'
3973 *
3974 * Returns 1 if the current level should be closed.
3975 */
3976
3977static int
3978htmlParseEndTag(htmlParserCtxtPtr ctxt)
3979{
3980 const xmlChar *name;
3981 const xmlChar *oldname;
3982 int i, ret;
3983
3984 if ((CUR != '<') || (NXT(1) != '/')) {
3985 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3986 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3987 return (0);
3988 }
3989 SKIP(2);
3990
3991 name = htmlParseHTMLName(ctxt);
3992 if (name == NULL)
3993 return (0);
3994 /*
3995 * We should definitely be at the ending "S? '>'" part
3996 */
3997 SKIP_BLANKS;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003998 if (CUR != '>') {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003999 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4000 "End tag : expected '>'\n", NULL, NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004001 /* Skip to next '>' */
4002 while ((CUR != 0) && (CUR != '>'))
4003 NEXT;
4004 }
4005 if (CUR == '>')
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004006 NEXT;
4007
4008 /*
4009 * if we ignored misplaced tags in htmlParseStartTag don't pop them
4010 * out now.
4011 */
4012 if ((ctxt->depth > 0) &&
4013 (xmlStrEqual(name, BAD_CAST "html") ||
4014 xmlStrEqual(name, BAD_CAST "body") ||
4015 xmlStrEqual(name, BAD_CAST "head"))) {
4016 ctxt->depth--;
4017 return (0);
4018 }
4019
4020 /*
4021 * If the name read is not one of the element in the parsing stack
4022 * then return, it's just an error.
4023 */
4024 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4025 if (xmlStrEqual(name, ctxt->nameTab[i]))
4026 break;
4027 }
4028 if (i < 0) {
4029 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4030 "Unexpected end tag : %s\n", name, NULL);
4031 return (0);
4032 }
4033
4034
4035 /*
4036 * Check for auto-closure of HTML elements.
4037 */
4038
4039 htmlAutoCloseOnClose(ctxt, name);
4040
4041 /*
4042 * Well formedness constraints, opening and closing must match.
4043 * With the exception that the autoclose may have popped stuff out
4044 * of the stack.
4045 */
4046 if (!xmlStrEqual(name, ctxt->name)) {
4047 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4048 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4049 "Opening and ending tag mismatch: %s and %s\n",
4050 name, ctxt->name);
4051 }
4052 }
4053
4054 /*
4055 * SAX: End of Tag
4056 */
4057 oldname = ctxt->name;
4058 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4059 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4060 ctxt->sax->endElement(ctxt->userData, name);
4061 htmlNodeInfoPop(ctxt);
4062 htmlnamePop(ctxt);
4063 ret = 1;
4064 } else {
4065 ret = 0;
4066 }
4067
4068 return (ret);
4069}
4070
4071
4072/**
4073 * htmlParseReference:
4074 * @ctxt: an HTML parser context
4075 *
4076 * parse and handle entity references in content,
4077 * this will end-up in a call to character() since this is either a
4078 * CharRef, or a predefined entity.
4079 */
4080static void
4081htmlParseReference(htmlParserCtxtPtr ctxt) {
4082 const htmlEntityDesc * ent;
4083 xmlChar out[6];
4084 const xmlChar *name;
4085 if (CUR != '&') return;
4086
4087 if (NXT(1) == '#') {
4088 unsigned int c;
4089 int bits, i = 0;
4090
4091 c = htmlParseCharRef(ctxt);
4092 if (c == 0)
4093 return;
4094
4095 if (c < 0x80) { out[i++]= c; bits= -6; }
4096 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4097 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4098 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4099
4100 for ( ; bits >= 0; bits-= 6) {
4101 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4102 }
4103 out[i] = 0;
4104
4105 htmlCheckParagraph(ctxt);
4106 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4107 ctxt->sax->characters(ctxt->userData, out, i);
4108 } else {
4109 ent = htmlParseEntityRef(ctxt, &name);
4110 if (name == NULL) {
4111 htmlCheckParagraph(ctxt);
4112 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4113 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4114 return;
4115 }
4116 if ((ent == NULL) || !(ent->value > 0)) {
4117 htmlCheckParagraph(ctxt);
4118 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4119 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4120 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4121 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4122 }
4123 } else {
4124 unsigned int c;
4125 int bits, i = 0;
4126
4127 c = ent->value;
4128 if (c < 0x80)
4129 { out[i++]= c; bits= -6; }
4130 else if (c < 0x800)
4131 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4132 else if (c < 0x10000)
4133 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4134 else
4135 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4136
4137 for ( ; bits >= 0; bits-= 6) {
4138 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4139 }
4140 out[i] = 0;
4141
4142 htmlCheckParagraph(ctxt);
4143 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4144 ctxt->sax->characters(ctxt->userData, out, i);
4145 }
4146 }
4147}
4148
4149/**
4150 * htmlParseContent:
4151 * @ctxt: an HTML parser context
4152 *
4153 * Parse a content: comment, sub-element, reference or text.
4154 * Kept for compatibility with old code
4155 */
4156
4157static void
4158htmlParseContent(htmlParserCtxtPtr ctxt) {
4159 xmlChar *currentNode;
4160 int depth;
4161 const xmlChar *name;
4162
4163 currentNode = xmlStrdup(ctxt->name);
4164 depth = ctxt->nameNr;
4165 while (1) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004166 GROW;
4167
4168 if (ctxt->instate == XML_PARSER_EOF)
4169 break;
4170
4171 /*
4172 * Our tag or one of it's parent or children is ending.
4173 */
4174 if ((CUR == '<') && (NXT(1) == '/')) {
4175 if (htmlParseEndTag(ctxt) &&
4176 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4177 if (currentNode != NULL)
4178 xmlFree(currentNode);
4179 return;
4180 }
4181 continue; /* while */
4182 }
4183
4184 else if ((CUR == '<') &&
4185 ((IS_ASCII_LETTER(NXT(1))) ||
4186 (NXT(1) == '_') || (NXT(1) == ':'))) {
4187 name = htmlParseHTMLName_nonInvasive(ctxt);
4188 if (name == NULL) {
4189 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4190 "htmlParseStartTag: invalid element name\n",
4191 NULL, NULL);
4192 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004193 while ((CUR != 0) && (CUR != '>'))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004194 NEXT;
4195
4196 if (currentNode != NULL)
4197 xmlFree(currentNode);
4198 return;
4199 }
4200
4201 if (ctxt->name != NULL) {
4202 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4203 htmlAutoClose(ctxt, name);
4204 continue;
4205 }
4206 }
4207 }
4208
4209 /*
4210 * Has this node been popped out during parsing of
4211 * the next element
4212 */
4213 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4214 (!xmlStrEqual(currentNode, ctxt->name)))
4215 {
4216 if (currentNode != NULL) xmlFree(currentNode);
4217 return;
4218 }
4219
4220 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4221 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4222 /*
4223 * Handle SCRIPT/STYLE separately
4224 */
4225 htmlParseScript(ctxt);
4226 } else {
4227 /*
4228 * Sometimes DOCTYPE arrives in the middle of the document
4229 */
4230 if ((CUR == '<') && (NXT(1) == '!') &&
4231 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4232 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4233 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4234 (UPP(8) == 'E')) {
4235 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4236 "Misplaced DOCTYPE declaration\n",
4237 BAD_CAST "DOCTYPE" , NULL);
4238 htmlParseDocTypeDecl(ctxt);
4239 }
4240
4241 /*
4242 * First case : a comment
4243 */
4244 if ((CUR == '<') && (NXT(1) == '!') &&
4245 (NXT(2) == '-') && (NXT(3) == '-')) {
4246 htmlParseComment(ctxt);
4247 }
4248
4249 /*
4250 * Second case : a Processing Instruction.
4251 */
4252 else if ((CUR == '<') && (NXT(1) == '?')) {
4253 htmlParsePI(ctxt);
4254 }
4255
4256 /*
4257 * Third case : a sub-element.
4258 */
4259 else if (CUR == '<') {
4260 htmlParseElement(ctxt);
4261 }
4262
4263 /*
4264 * Fourth case : a reference. If if has not been resolved,
4265 * parsing returns it's Name, create the node
4266 */
4267 else if (CUR == '&') {
4268 htmlParseReference(ctxt);
4269 }
4270
4271 /*
4272 * Fifth case : end of the resource
4273 */
4274 else if (CUR == 0) {
4275 htmlAutoCloseOnEnd(ctxt);
4276 break;
4277 }
4278
4279 /*
4280 * Last case, text. Note that References are handled directly.
4281 */
4282 else {
4283 htmlParseCharData(ctxt);
4284 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004285 }
4286 GROW;
4287 }
4288 if (currentNode != NULL) xmlFree(currentNode);
4289}
4290
4291/**
4292 * htmlParseElement:
4293 * @ctxt: an HTML parser context
4294 *
4295 * parse an HTML element, this is highly recursive
4296 * this is kept for compatibility with previous code versions
4297 *
4298 * [39] element ::= EmptyElemTag | STag content ETag
4299 *
4300 * [41] Attribute ::= Name Eq AttValue
4301 */
4302
4303void
4304htmlParseElement(htmlParserCtxtPtr ctxt) {
4305 const xmlChar *name;
4306 xmlChar *currentNode = NULL;
4307 const htmlElemDesc * info;
4308 htmlParserNodeInfo node_info;
4309 int failed;
4310 int depth;
4311 const xmlChar *oldptr;
4312
4313 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4314 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4315 "htmlParseElement: context error\n", NULL, NULL);
4316 return;
4317 }
4318
4319 if (ctxt->instate == XML_PARSER_EOF)
4320 return;
4321
4322 /* Capture start position */
4323 if (ctxt->record_info) {
4324 node_info.begin_pos = ctxt->input->consumed +
4325 (CUR_PTR - ctxt->input->base);
4326 node_info.begin_line = ctxt->input->line;
4327 }
4328
4329 failed = htmlParseStartTag(ctxt);
4330 name = ctxt->name;
4331 if ((failed == -1) || (name == NULL)) {
4332 if (CUR == '>')
4333 NEXT;
4334 return;
4335 }
4336
4337 /*
4338 * Lookup the info for that element.
4339 */
4340 info = htmlTagLookup(name);
4341 if (info == NULL) {
4342 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4343 "Tag %s invalid\n", name, NULL);
4344 }
4345
4346 /*
4347 * Check for an Empty Element labeled the XML/SGML way
4348 */
4349 if ((CUR == '/') && (NXT(1) == '>')) {
4350 SKIP(2);
4351 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4352 ctxt->sax->endElement(ctxt->userData, name);
4353 htmlnamePop(ctxt);
4354 return;
4355 }
4356
4357 if (CUR == '>') {
4358 NEXT;
4359 } else {
4360 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4361 "Couldn't find end of Start Tag %s\n", name, NULL);
4362
4363 /*
4364 * end of parsing of this node.
4365 */
4366 if (xmlStrEqual(name, ctxt->name)) {
4367 nodePop(ctxt);
4368 htmlnamePop(ctxt);
4369 }
4370
4371 /*
4372 * Capture end position and add node
4373 */
4374 if (ctxt->record_info) {
4375 node_info.end_pos = ctxt->input->consumed +
4376 (CUR_PTR - ctxt->input->base);
4377 node_info.end_line = ctxt->input->line;
4378 node_info.node = ctxt->node;
4379 xmlParserAddNodeInfo(ctxt, &node_info);
4380 }
4381 return;
4382 }
4383
4384 /*
4385 * Check for an Empty Element from DTD definition
4386 */
4387 if ((info != NULL) && (info->empty)) {
4388 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4389 ctxt->sax->endElement(ctxt->userData, name);
4390 htmlnamePop(ctxt);
4391 return;
4392 }
4393
4394 /*
4395 * Parse the content of the element:
4396 */
4397 currentNode = xmlStrdup(ctxt->name);
4398 depth = ctxt->nameNr;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004399 while (CUR != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004400 oldptr = ctxt->input->cur;
4401 htmlParseContent(ctxt);
4402 if (oldptr==ctxt->input->cur) break;
4403 if (ctxt->nameNr < depth) break;
4404 }
4405
4406 /*
4407 * Capture end position and add node
4408 */
4409 if ( currentNode != NULL && ctxt->record_info ) {
4410 node_info.end_pos = ctxt->input->consumed +
4411 (CUR_PTR - ctxt->input->base);
4412 node_info.end_line = ctxt->input->line;
4413 node_info.node = ctxt->node;
4414 xmlParserAddNodeInfo(ctxt, &node_info);
4415 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004416 if (CUR == 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004417 htmlAutoCloseOnEnd(ctxt);
4418 }
4419
4420 if (currentNode != NULL)
4421 xmlFree(currentNode);
4422}
4423
4424static void
4425htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4426 /*
4427 * Capture end position and add node
4428 */
4429 if ( ctxt->node != NULL && ctxt->record_info ) {
4430 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4431 (CUR_PTR - ctxt->input->base);
4432 ctxt->nodeInfo->end_line = ctxt->input->line;
4433 ctxt->nodeInfo->node = ctxt->node;
4434 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4435 htmlNodeInfoPop(ctxt);
4436 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004437 if (CUR == 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004438 htmlAutoCloseOnEnd(ctxt);
4439 }
4440}
4441
4442/**
4443 * htmlParseElementInternal:
4444 * @ctxt: an HTML parser context
4445 *
4446 * parse an HTML element, new version, non recursive
4447 *
4448 * [39] element ::= EmptyElemTag | STag content ETag
4449 *
4450 * [41] Attribute ::= Name Eq AttValue
4451 */
4452
4453static void
4454htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4455 const xmlChar *name;
4456 const htmlElemDesc * info;
4457 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4458 int failed;
4459
4460 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4461 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4462 "htmlParseElementInternal: context error\n", NULL, NULL);
4463 return;
4464 }
4465
4466 if (ctxt->instate == XML_PARSER_EOF)
4467 return;
4468
4469 /* Capture start position */
4470 if (ctxt->record_info) {
4471 node_info.begin_pos = ctxt->input->consumed +
4472 (CUR_PTR - ctxt->input->base);
4473 node_info.begin_line = ctxt->input->line;
4474 }
4475
4476 failed = htmlParseStartTag(ctxt);
4477 name = ctxt->name;
4478 if ((failed == -1) || (name == NULL)) {
4479 if (CUR == '>')
4480 NEXT;
4481 return;
4482 }
4483
4484 /*
4485 * Lookup the info for that element.
4486 */
4487 info = htmlTagLookup(name);
4488 if (info == NULL) {
4489 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4490 "Tag %s invalid\n", name, NULL);
4491 }
4492
4493 /*
4494 * Check for an Empty Element labeled the XML/SGML way
4495 */
4496 if ((CUR == '/') && (NXT(1) == '>')) {
4497 SKIP(2);
4498 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4499 ctxt->sax->endElement(ctxt->userData, name);
4500 htmlnamePop(ctxt);
4501 return;
4502 }
4503
4504 if (CUR == '>') {
4505 NEXT;
4506 } else {
4507 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4508 "Couldn't find end of Start Tag %s\n", name, NULL);
4509
4510 /*
4511 * end of parsing of this node.
4512 */
4513 if (xmlStrEqual(name, ctxt->name)) {
4514 nodePop(ctxt);
4515 htmlnamePop(ctxt);
4516 }
4517
4518 if (ctxt->record_info)
4519 htmlNodeInfoPush(ctxt, &node_info);
4520 htmlParserFinishElementParsing(ctxt);
4521 return;
4522 }
4523
4524 /*
4525 * Check for an Empty Element from DTD definition
4526 */
4527 if ((info != NULL) && (info->empty)) {
4528 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4529 ctxt->sax->endElement(ctxt->userData, name);
4530 htmlnamePop(ctxt);
4531 return;
4532 }
4533
4534 if (ctxt->record_info)
4535 htmlNodeInfoPush(ctxt, &node_info);
4536}
4537
4538/**
4539 * htmlParseContentInternal:
4540 * @ctxt: an HTML parser context
4541 *
4542 * Parse a content: comment, sub-element, reference or text.
4543 * New version for non recursive htmlParseElementInternal
4544 */
4545
4546static void
4547htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4548 xmlChar *currentNode;
4549 int depth;
4550 const xmlChar *name;
4551
4552 currentNode = xmlStrdup(ctxt->name);
4553 depth = ctxt->nameNr;
4554 while (1) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004555 GROW;
4556
4557 if (ctxt->instate == XML_PARSER_EOF)
4558 break;
4559
4560 /*
4561 * Our tag or one of it's parent or children is ending.
4562 */
4563 if ((CUR == '<') && (NXT(1) == '/')) {
4564 if (htmlParseEndTag(ctxt) &&
4565 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4566 if (currentNode != NULL)
4567 xmlFree(currentNode);
4568
4569 currentNode = xmlStrdup(ctxt->name);
4570 depth = ctxt->nameNr;
4571 }
4572 continue; /* while */
4573 }
4574
4575 else if ((CUR == '<') &&
4576 ((IS_ASCII_LETTER(NXT(1))) ||
4577 (NXT(1) == '_') || (NXT(1) == ':'))) {
4578 name = htmlParseHTMLName_nonInvasive(ctxt);
4579 if (name == NULL) {
4580 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4581 "htmlParseStartTag: invalid element name\n",
4582 NULL, NULL);
4583 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004584 while ((CUR == 0) && (CUR != '>'))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004585 NEXT;
4586
4587 htmlParserFinishElementParsing(ctxt);
4588 if (currentNode != NULL)
4589 xmlFree(currentNode);
4590
4591 currentNode = xmlStrdup(ctxt->name);
4592 depth = ctxt->nameNr;
4593 continue;
4594 }
4595
4596 if (ctxt->name != NULL) {
4597 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4598 htmlAutoClose(ctxt, name);
4599 continue;
4600 }
4601 }
4602 }
4603
4604 /*
4605 * Has this node been popped out during parsing of
4606 * the next element
4607 */
4608 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4609 (!xmlStrEqual(currentNode, ctxt->name)))
4610 {
4611 htmlParserFinishElementParsing(ctxt);
4612 if (currentNode != NULL) xmlFree(currentNode);
4613
4614 currentNode = xmlStrdup(ctxt->name);
4615 depth = ctxt->nameNr;
4616 continue;
4617 }
4618
4619 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4620 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4621 /*
4622 * Handle SCRIPT/STYLE separately
4623 */
4624 htmlParseScript(ctxt);
4625 } else {
4626 /*
4627 * Sometimes DOCTYPE arrives in the middle of the document
4628 */
4629 if ((CUR == '<') && (NXT(1) == '!') &&
4630 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4631 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4632 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4633 (UPP(8) == 'E')) {
4634 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4635 "Misplaced DOCTYPE declaration\n",
4636 BAD_CAST "DOCTYPE" , NULL);
4637 htmlParseDocTypeDecl(ctxt);
4638 }
4639
4640 /*
4641 * First case : a comment
4642 */
4643 if ((CUR == '<') && (NXT(1) == '!') &&
4644 (NXT(2) == '-') && (NXT(3) == '-')) {
4645 htmlParseComment(ctxt);
4646 }
4647
4648 /*
4649 * Second case : a Processing Instruction.
4650 */
4651 else if ((CUR == '<') && (NXT(1) == '?')) {
4652 htmlParsePI(ctxt);
4653 }
4654
4655 /*
4656 * Third case : a sub-element.
4657 */
4658 else if (CUR == '<') {
4659 htmlParseElementInternal(ctxt);
4660 if (currentNode != NULL) xmlFree(currentNode);
4661
4662 currentNode = xmlStrdup(ctxt->name);
4663 depth = ctxt->nameNr;
4664 }
4665
4666 /*
4667 * Fourth case : a reference. If if has not been resolved,
4668 * parsing returns it's Name, create the node
4669 */
4670 else if (CUR == '&') {
4671 htmlParseReference(ctxt);
4672 }
4673
4674 /*
4675 * Fifth case : end of the resource
4676 */
4677 else if (CUR == 0) {
4678 htmlAutoCloseOnEnd(ctxt);
4679 break;
4680 }
4681
4682 /*
4683 * Last case, text. Note that References are handled directly.
4684 */
4685 else {
4686 htmlParseCharData(ctxt);
4687 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004688 }
4689 GROW;
4690 }
4691 if (currentNode != NULL) xmlFree(currentNode);
4692}
4693
4694/**
4695 * htmlParseContent:
4696 * @ctxt: an HTML parser context
4697 *
4698 * Parse a content: comment, sub-element, reference or text.
4699 * This is the entry point when called from parser.c
4700 */
4701
4702void
4703__htmlParseContent(void *ctxt) {
4704 if (ctxt != NULL)
4705 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4706}
4707
4708/**
4709 * htmlParseDocument:
4710 * @ctxt: an HTML parser context
4711 *
4712 * parse an HTML document (and build a tree if using the standard SAX
4713 * interface).
4714 *
4715 * Returns 0, -1 in case of error. the parser context is augmented
4716 * as a result of the parsing.
4717 */
4718
4719int
4720htmlParseDocument(htmlParserCtxtPtr ctxt) {
4721 xmlChar start[4];
4722 xmlCharEncoding enc;
4723 xmlDtdPtr dtd;
4724
4725 xmlInitParser();
4726
4727 htmlDefaultSAXHandlerInit();
4728
4729 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4730 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4731 "htmlParseDocument: context error\n", NULL, NULL);
4732 return(XML_ERR_INTERNAL_ERROR);
4733 }
4734 ctxt->html = 1;
4735 ctxt->linenumbers = 1;
4736 GROW;
4737 /*
4738 * SAX: beginning of the document processing.
4739 */
4740 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4741 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4742
4743 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4744 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4745 /*
4746 * Get the 4 first bytes and decode the charset
4747 * if enc != XML_CHAR_ENCODING_NONE
4748 * plug some encoding conversion routines.
4749 */
4750 start[0] = RAW;
4751 start[1] = NXT(1);
4752 start[2] = NXT(2);
4753 start[3] = NXT(3);
4754 enc = xmlDetectCharEncoding(&start[0], 4);
4755 if (enc != XML_CHAR_ENCODING_NONE) {
4756 xmlSwitchEncoding(ctxt, enc);
4757 }
4758 }
4759
4760 /*
4761 * Wipe out everything which is before the first '<'
4762 */
4763 SKIP_BLANKS;
4764 if (CUR == 0) {
4765 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4766 "Document is empty\n", NULL, NULL);
4767 }
4768
4769 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4770 ctxt->sax->startDocument(ctxt->userData);
4771
4772
4773 /*
4774 * Parse possible comments and PIs before any content
4775 */
4776 while (((CUR == '<') && (NXT(1) == '!') &&
4777 (NXT(2) == '-') && (NXT(3) == '-')) ||
4778 ((CUR == '<') && (NXT(1) == '?'))) {
4779 htmlParseComment(ctxt);
4780 htmlParsePI(ctxt);
4781 SKIP_BLANKS;
4782 }
4783
4784
4785 /*
4786 * Then possibly doc type declaration(s) and more Misc
4787 * (doctypedecl Misc*)?
4788 */
4789 if ((CUR == '<') && (NXT(1) == '!') &&
4790 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4791 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4792 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4793 (UPP(8) == 'E')) {
4794 htmlParseDocTypeDecl(ctxt);
4795 }
4796 SKIP_BLANKS;
4797
4798 /*
4799 * Parse possible comments and PIs before any content
4800 */
4801 while (((CUR == '<') && (NXT(1) == '!') &&
4802 (NXT(2) == '-') && (NXT(3) == '-')) ||
4803 ((CUR == '<') && (NXT(1) == '?'))) {
4804 htmlParseComment(ctxt);
4805 htmlParsePI(ctxt);
4806 SKIP_BLANKS;
4807 }
4808
4809 /*
4810 * Time to start parsing the tree itself
4811 */
4812 htmlParseContentInternal(ctxt);
4813
4814 /*
4815 * autoclose
4816 */
4817 if (CUR == 0)
4818 htmlAutoCloseOnEnd(ctxt);
4819
4820
4821 /*
4822 * SAX: end of the document processing.
4823 */
4824 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4825 ctxt->sax->endDocument(ctxt->userData);
4826
4827 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4828 dtd = xmlGetIntSubset(ctxt->myDoc);
4829 if (dtd == NULL)
4830 ctxt->myDoc->intSubset =
4831 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4832 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4833 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4834 }
4835 if (! ctxt->wellFormed) return(-1);
4836 return(0);
4837}
4838
4839
4840/************************************************************************
4841 * *
4842 * Parser contexts handling *
4843 * *
4844 ************************************************************************/
4845
4846/**
4847 * htmlInitParserCtxt:
4848 * @ctxt: an HTML parser context
4849 *
4850 * Initialize a parser context
4851 *
4852 * Returns 0 in case of success and -1 in case of error
4853 */
4854
4855static int
4856htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4857{
4858 htmlSAXHandler *sax;
4859
4860 if (ctxt == NULL) return(-1);
4861 memset(ctxt, 0, sizeof(htmlParserCtxt));
4862
4863 ctxt->dict = xmlDictCreate();
4864 if (ctxt->dict == NULL) {
4865 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4866 return(-1);
4867 }
4868 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4869 if (sax == NULL) {
4870 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4871 return(-1);
4872 }
4873 else
4874 memset(sax, 0, sizeof(htmlSAXHandler));
4875
4876 /* Allocate the Input stack */
4877 ctxt->inputTab = (htmlParserInputPtr *)
4878 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4879 if (ctxt->inputTab == NULL) {
4880 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4881 ctxt->inputNr = 0;
4882 ctxt->inputMax = 0;
4883 ctxt->input = NULL;
4884 return(-1);
4885 }
4886 ctxt->inputNr = 0;
4887 ctxt->inputMax = 5;
4888 ctxt->input = NULL;
4889 ctxt->version = NULL;
4890 ctxt->encoding = NULL;
4891 ctxt->standalone = -1;
4892 ctxt->instate = XML_PARSER_START;
4893
4894 /* Allocate the Node stack */
4895 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4896 if (ctxt->nodeTab == NULL) {
4897 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4898 ctxt->nodeNr = 0;
4899 ctxt->nodeMax = 0;
4900 ctxt->node = NULL;
4901 ctxt->inputNr = 0;
4902 ctxt->inputMax = 0;
4903 ctxt->input = NULL;
4904 return(-1);
4905 }
4906 ctxt->nodeNr = 0;
4907 ctxt->nodeMax = 10;
4908 ctxt->node = NULL;
4909
4910 /* Allocate the Name stack */
4911 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4912 if (ctxt->nameTab == NULL) {
4913 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4914 ctxt->nameNr = 0;
4915 ctxt->nameMax = 0;
4916 ctxt->name = NULL;
4917 ctxt->nodeNr = 0;
4918 ctxt->nodeMax = 0;
4919 ctxt->node = NULL;
4920 ctxt->inputNr = 0;
4921 ctxt->inputMax = 0;
4922 ctxt->input = NULL;
4923 return(-1);
4924 }
4925 ctxt->nameNr = 0;
4926 ctxt->nameMax = 10;
4927 ctxt->name = NULL;
4928
4929 ctxt->nodeInfoTab = NULL;
4930 ctxt->nodeInfoNr = 0;
4931 ctxt->nodeInfoMax = 0;
4932
4933 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4934 else {
4935 ctxt->sax = sax;
4936 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4937 }
4938 ctxt->userData = ctxt;
4939 ctxt->myDoc = NULL;
4940 ctxt->wellFormed = 1;
4941 ctxt->replaceEntities = 0;
4942 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4943 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4944 ctxt->html = 1;
4945 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4946 ctxt->vctxt.userData = ctxt;
4947 ctxt->vctxt.error = xmlParserValidityError;
4948 ctxt->vctxt.warning = xmlParserValidityWarning;
4949 ctxt->record_info = 0;
4950 ctxt->validate = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004951 ctxt->checkIndex = 0;
4952 ctxt->catalogs = NULL;
4953 xmlInitNodeInfoSeq(&ctxt->node_seq);
4954 return(0);
4955}
4956
4957/**
4958 * htmlFreeParserCtxt:
4959 * @ctxt: an HTML parser context
4960 *
4961 * Free all the memory used by a parser context. However the parsed
4962 * document in ctxt->myDoc is not freed.
4963 */
4964
4965void
4966htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4967{
4968 xmlFreeParserCtxt(ctxt);
4969}
4970
4971/**
4972 * htmlNewParserCtxt:
4973 *
4974 * Allocate and initialize a new parser context.
4975 *
4976 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4977 */
4978
4979htmlParserCtxtPtr
4980htmlNewParserCtxt(void)
4981{
4982 xmlParserCtxtPtr ctxt;
4983
4984 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4985 if (ctxt == NULL) {
4986 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4987 return(NULL);
4988 }
4989 memset(ctxt, 0, sizeof(xmlParserCtxt));
4990 if (htmlInitParserCtxt(ctxt) < 0) {
4991 htmlFreeParserCtxt(ctxt);
4992 return(NULL);
4993 }
4994 return(ctxt);
4995}
4996
4997/**
4998 * htmlCreateMemoryParserCtxt:
4999 * @buffer: a pointer to a char array
5000 * @size: the size of the array
5001 *
5002 * Create a parser context for an HTML in-memory document.
5003 *
5004 * Returns the new parser context or NULL
5005 */
5006htmlParserCtxtPtr
5007htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5008 xmlParserCtxtPtr ctxt;
5009 xmlParserInputPtr input;
5010 xmlParserInputBufferPtr buf;
5011
5012 if (buffer == NULL)
5013 return(NULL);
5014 if (size <= 0)
5015 return(NULL);
5016
5017 ctxt = htmlNewParserCtxt();
5018 if (ctxt == NULL)
5019 return(NULL);
5020
5021 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5022 if (buf == NULL) return(NULL);
5023
5024 input = xmlNewInputStream(ctxt);
5025 if (input == NULL) {
5026 xmlFreeParserCtxt(ctxt);
5027 return(NULL);
5028 }
5029
5030 input->filename = NULL;
5031 input->buf = buf;
5032 xmlBufResetInput(buf->buffer, input);
5033
5034 inputPush(ctxt, input);
5035 return(ctxt);
5036}
5037
5038/**
5039 * htmlCreateDocParserCtxt:
5040 * @cur: a pointer to an array of xmlChar
5041 * @encoding: a free form C string describing the HTML document encoding, or NULL
5042 *
5043 * Create a parser context for an HTML document.
5044 *
5045 * TODO: check the need to add encoding handling there
5046 *
5047 * Returns the new parser context or NULL
5048 */
5049static htmlParserCtxtPtr
5050htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5051 int len;
5052 htmlParserCtxtPtr ctxt;
5053
5054 if (cur == NULL)
5055 return(NULL);
5056 len = xmlStrlen(cur);
5057 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5058 if (ctxt == NULL)
5059 return(NULL);
5060
5061 if (encoding != NULL) {
5062 xmlCharEncoding enc;
5063 xmlCharEncodingHandlerPtr handler;
5064
5065 if (ctxt->input->encoding != NULL)
5066 xmlFree((xmlChar *) ctxt->input->encoding);
5067 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5068
5069 enc = xmlParseCharEncoding(encoding);
5070 /*
5071 * registered set of known encodings
5072 */
5073 if (enc != XML_CHAR_ENCODING_ERROR) {
5074 xmlSwitchEncoding(ctxt, enc);
5075 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5076 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5077 "Unsupported encoding %s\n",
5078 (const xmlChar *) encoding, NULL);
5079 }
5080 } else {
5081 /*
5082 * fallback for unknown encodings
5083 */
5084 handler = xmlFindCharEncodingHandler((const char *) encoding);
5085 if (handler != NULL) {
5086 xmlSwitchToEncoding(ctxt, handler);
5087 } else {
5088 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5089 "Unsupported encoding %s\n",
5090 (const xmlChar *) encoding, NULL);
5091 }
5092 }
5093 }
5094 return(ctxt);
5095}
5096
5097#ifdef LIBXML_PUSH_ENABLED
5098/************************************************************************
5099 * *
5100 * Progressive parsing interfaces *
5101 * *
5102 ************************************************************************/
5103
5104/**
5105 * htmlParseLookupSequence:
5106 * @ctxt: an HTML parser context
5107 * @first: the first char to lookup
5108 * @next: the next char to lookup or zero
5109 * @third: the next char to lookup or zero
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005110 * @ignoreattrval: skip over attribute values
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005111 *
5112 * Try to find if a sequence (first, next, third) or just (first next) or
5113 * (first) is available in the input stream.
5114 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5115 * to avoid rescanning sequences of bytes, it DOES change the state of the
5116 * parser, do not use liberally.
5117 * This is basically similar to xmlParseLookupSequence()
5118 *
5119 * Returns the index to the current parsing point if the full sequence
5120 * is available, -1 otherwise.
5121 */
5122static int
5123htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005124 xmlChar next, xmlChar third, int ignoreattrval)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005125{
5126 int base, len;
5127 htmlParserInputPtr in;
5128 const xmlChar *buf;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005129 int invalue = 0;
5130 char valdellim = 0x0;
5131
5132 in = ctxt->input;
5133 if (in == NULL)
5134 return (-1);
5135
5136 base = in->cur - in->base;
5137 if (base < 0)
5138 return (-1);
5139
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005140 if (ctxt->checkIndex > base) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005141 base = ctxt->checkIndex;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005142 /* Abuse hasPErefs member to restore current state. */
5143 invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5144 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005145
5146 if (in->buf == NULL) {
5147 buf = in->base;
5148 len = in->length;
5149 } else {
5150 buf = xmlBufContent(in->buf->buffer);
5151 len = xmlBufUse(in->buf->buffer);
5152 }
5153
5154 /* take into account the sequence length */
5155 if (third)
5156 len -= 2;
5157 else if (next)
5158 len--;
5159 for (; base < len; base++) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005160 if (ignoreattrval) {
5161 if (buf[base] == '"' || buf[base] == '\'') {
5162 if (invalue) {
5163 if (buf[base] == valdellim) {
5164 invalue = 0;
5165 continue;
5166 }
5167 } else {
5168 valdellim = buf[base];
5169 invalue = 1;
5170 continue;
5171 }
5172 } else if (invalue) {
5173 continue;
5174 }
5175 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005176 if (buf[base] == first) {
5177 if (third != 0) {
5178 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5179 continue;
5180 } else if (next != 0) {
5181 if (buf[base + 1] != next)
5182 continue;
5183 }
5184 ctxt->checkIndex = 0;
5185#ifdef DEBUG_PUSH
5186 if (next == 0)
5187 xmlGenericError(xmlGenericErrorContext,
5188 "HPP: lookup '%c' found at %d\n",
5189 first, base);
5190 else if (third == 0)
5191 xmlGenericError(xmlGenericErrorContext,
5192 "HPP: lookup '%c%c' found at %d\n",
5193 first, next, base);
5194 else
5195 xmlGenericError(xmlGenericErrorContext,
5196 "HPP: lookup '%c%c%c' found at %d\n",
5197 first, next, third, base);
5198#endif
5199 return (base - (in->cur - in->base));
5200 }
5201 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005202 ctxt->checkIndex = base;
5203 /* Abuse hasPErefs member to track current state. */
5204 if (invalue)
5205 ctxt->hasPErefs |= 1;
5206 else
5207 ctxt->hasPErefs &= ~1;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005208#ifdef DEBUG_PUSH
5209 if (next == 0)
5210 xmlGenericError(xmlGenericErrorContext,
5211 "HPP: lookup '%c' failed\n", first);
5212 else if (third == 0)
5213 xmlGenericError(xmlGenericErrorContext,
5214 "HPP: lookup '%c%c' failed\n", first, next);
5215 else
5216 xmlGenericError(xmlGenericErrorContext,
5217 "HPP: lookup '%c%c%c' failed\n", first, next,
5218 third);
5219#endif
5220 return (-1);
5221}
5222
5223/**
Haibo Huangd75f3892021-01-05 21:34:50 -08005224 * htmlParseLookupCommentEnd:
5225 * @ctxt: an HTML parser context
5226 *
5227 * Try to find a comment end tag in the input stream
5228 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5229 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5230 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5231 * to avoid rescanning sequences of bytes, it DOES change the state of the
5232 * parser, do not use liberally.
5233 * This wraps to htmlParseLookupSequence()
5234 *
5235 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5236 */
5237static int
5238htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5239{
5240 int mark = 0;
5241 int cur = CUR_PTR - BASE_PTR;
5242
5243 while (mark >= 0) {
5244 mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5245 if ((mark < 0) ||
5246 (NXT(mark+2) == '>') ||
5247 ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5248 return mark;
5249 }
5250 ctxt->checkIndex = cur + mark + 1;
5251 }
5252 return mark;
5253}
5254
5255
5256/**
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005257 * htmlParseTryOrFinish:
5258 * @ctxt: an HTML parser context
5259 * @terminate: last chunk indicator
5260 *
5261 * Try to progress on parsing
5262 *
5263 * Returns zero if no parsing was possible
5264 */
5265static int
5266htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5267 int ret = 0;
5268 htmlParserInputPtr in;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005269 ptrdiff_t avail = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005270 xmlChar cur, next;
5271
5272 htmlParserNodeInfo node_info;
5273
5274#ifdef DEBUG_PUSH
5275 switch (ctxt->instate) {
5276 case XML_PARSER_EOF:
5277 xmlGenericError(xmlGenericErrorContext,
5278 "HPP: try EOF\n"); break;
5279 case XML_PARSER_START:
5280 xmlGenericError(xmlGenericErrorContext,
5281 "HPP: try START\n"); break;
5282 case XML_PARSER_MISC:
5283 xmlGenericError(xmlGenericErrorContext,
5284 "HPP: try MISC\n");break;
5285 case XML_PARSER_COMMENT:
5286 xmlGenericError(xmlGenericErrorContext,
5287 "HPP: try COMMENT\n");break;
5288 case XML_PARSER_PROLOG:
5289 xmlGenericError(xmlGenericErrorContext,
5290 "HPP: try PROLOG\n");break;
5291 case XML_PARSER_START_TAG:
5292 xmlGenericError(xmlGenericErrorContext,
5293 "HPP: try START_TAG\n");break;
5294 case XML_PARSER_CONTENT:
5295 xmlGenericError(xmlGenericErrorContext,
5296 "HPP: try CONTENT\n");break;
5297 case XML_PARSER_CDATA_SECTION:
5298 xmlGenericError(xmlGenericErrorContext,
5299 "HPP: try CDATA_SECTION\n");break;
5300 case XML_PARSER_END_TAG:
5301 xmlGenericError(xmlGenericErrorContext,
5302 "HPP: try END_TAG\n");break;
5303 case XML_PARSER_ENTITY_DECL:
5304 xmlGenericError(xmlGenericErrorContext,
5305 "HPP: try ENTITY_DECL\n");break;
5306 case XML_PARSER_ENTITY_VALUE:
5307 xmlGenericError(xmlGenericErrorContext,
5308 "HPP: try ENTITY_VALUE\n");break;
5309 case XML_PARSER_ATTRIBUTE_VALUE:
5310 xmlGenericError(xmlGenericErrorContext,
5311 "HPP: try ATTRIBUTE_VALUE\n");break;
5312 case XML_PARSER_DTD:
5313 xmlGenericError(xmlGenericErrorContext,
5314 "HPP: try DTD\n");break;
5315 case XML_PARSER_EPILOG:
5316 xmlGenericError(xmlGenericErrorContext,
5317 "HPP: try EPILOG\n");break;
5318 case XML_PARSER_PI:
5319 xmlGenericError(xmlGenericErrorContext,
5320 "HPP: try PI\n");break;
5321 case XML_PARSER_SYSTEM_LITERAL:
5322 xmlGenericError(xmlGenericErrorContext,
5323 "HPP: try SYSTEM_LITERAL\n");break;
5324 }
5325#endif
5326
5327 while (1) {
5328
5329 in = ctxt->input;
5330 if (in == NULL) break;
5331 if (in->buf == NULL)
5332 avail = in->length - (in->cur - in->base);
5333 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005334 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5335 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005336 if ((avail == 0) && (terminate)) {
5337 htmlAutoCloseOnEnd(ctxt);
5338 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5339 /*
5340 * SAX: end of the document processing.
5341 */
5342 ctxt->instate = XML_PARSER_EOF;
5343 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5344 ctxt->sax->endDocument(ctxt->userData);
5345 }
5346 }
5347 if (avail < 1)
5348 goto done;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005349 /*
5350 * This is done to make progress and avoid an infinite loop
5351 * if a parsing attempt was aborted by hitting a NUL byte. After
5352 * changing htmlCurrentChar, this probably isn't necessary anymore.
5353 * We should consider removing this check.
5354 */
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005355 cur = in->cur[0];
5356 if (cur == 0) {
5357 SKIP(1);
5358 continue;
5359 }
5360
5361 switch (ctxt->instate) {
5362 case XML_PARSER_EOF:
5363 /*
5364 * Document parsing is done !
5365 */
5366 goto done;
5367 case XML_PARSER_START:
5368 /*
5369 * Very first chars read from the document flow.
5370 */
5371 cur = in->cur[0];
5372 if (IS_BLANK_CH(cur)) {
5373 SKIP_BLANKS;
5374 if (in->buf == NULL)
5375 avail = in->length - (in->cur - in->base);
5376 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005377 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5378 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005379 }
5380 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5381 ctxt->sax->setDocumentLocator(ctxt->userData,
5382 &xmlDefaultSAXLocator);
5383 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5384 (!ctxt->disableSAX))
5385 ctxt->sax->startDocument(ctxt->userData);
5386
5387 cur = in->cur[0];
5388 next = in->cur[1];
5389 if ((cur == '<') && (next == '!') &&
5390 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5391 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5392 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5393 (UPP(8) == 'E')) {
5394 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005395 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005396 goto done;
5397#ifdef DEBUG_PUSH
5398 xmlGenericError(xmlGenericErrorContext,
5399 "HPP: Parsing internal subset\n");
5400#endif
5401 htmlParseDocTypeDecl(ctxt);
5402 ctxt->instate = XML_PARSER_PROLOG;
5403#ifdef DEBUG_PUSH
5404 xmlGenericError(xmlGenericErrorContext,
5405 "HPP: entering PROLOG\n");
5406#endif
5407 } else {
5408 ctxt->instate = XML_PARSER_MISC;
5409#ifdef DEBUG_PUSH
5410 xmlGenericError(xmlGenericErrorContext,
5411 "HPP: entering MISC\n");
5412#endif
5413 }
5414 break;
5415 case XML_PARSER_MISC:
5416 SKIP_BLANKS;
5417 if (in->buf == NULL)
5418 avail = in->length - (in->cur - in->base);
5419 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005420 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5421 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005422 /*
5423 * no chars in buffer
5424 */
5425 if (avail < 1)
5426 goto done;
5427 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005428 * not enough chars in buffer
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005429 */
5430 if (avail < 2) {
5431 if (!terminate)
5432 goto done;
5433 else
5434 next = ' ';
5435 } else {
5436 next = in->cur[1];
5437 }
5438 cur = in->cur[0];
5439 if ((cur == '<') && (next == '!') &&
5440 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Haibo Huangd75f3892021-01-05 21:34:50 -08005441 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005442 goto done;
5443#ifdef DEBUG_PUSH
5444 xmlGenericError(xmlGenericErrorContext,
5445 "HPP: Parsing Comment\n");
5446#endif
5447 htmlParseComment(ctxt);
5448 ctxt->instate = XML_PARSER_MISC;
5449 } else if ((cur == '<') && (next == '?')) {
5450 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005451 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005452 goto done;
5453#ifdef DEBUG_PUSH
5454 xmlGenericError(xmlGenericErrorContext,
5455 "HPP: Parsing PI\n");
5456#endif
5457 htmlParsePI(ctxt);
5458 ctxt->instate = XML_PARSER_MISC;
5459 } else if ((cur == '<') && (next == '!') &&
5460 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5461 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5462 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5463 (UPP(8) == 'E')) {
5464 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005465 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005466 goto done;
5467#ifdef DEBUG_PUSH
5468 xmlGenericError(xmlGenericErrorContext,
5469 "HPP: Parsing internal subset\n");
5470#endif
5471 htmlParseDocTypeDecl(ctxt);
5472 ctxt->instate = XML_PARSER_PROLOG;
5473#ifdef DEBUG_PUSH
5474 xmlGenericError(xmlGenericErrorContext,
5475 "HPP: entering PROLOG\n");
5476#endif
5477 } else if ((cur == '<') && (next == '!') &&
5478 (avail < 9)) {
5479 goto done;
5480 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005481 ctxt->instate = XML_PARSER_CONTENT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005482#ifdef DEBUG_PUSH
5483 xmlGenericError(xmlGenericErrorContext,
5484 "HPP: entering START_TAG\n");
5485#endif
5486 }
5487 break;
5488 case XML_PARSER_PROLOG:
5489 SKIP_BLANKS;
5490 if (in->buf == NULL)
5491 avail = in->length - (in->cur - in->base);
5492 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005493 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5494 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005495 if (avail < 2)
5496 goto done;
5497 cur = in->cur[0];
5498 next = in->cur[1];
5499 if ((cur == '<') && (next == '!') &&
5500 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Haibo Huangd75f3892021-01-05 21:34:50 -08005501 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005502 goto done;
5503#ifdef DEBUG_PUSH
5504 xmlGenericError(xmlGenericErrorContext,
5505 "HPP: Parsing Comment\n");
5506#endif
5507 htmlParseComment(ctxt);
5508 ctxt->instate = XML_PARSER_PROLOG;
5509 } else if ((cur == '<') && (next == '?')) {
5510 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005511 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005512 goto done;
5513#ifdef DEBUG_PUSH
5514 xmlGenericError(xmlGenericErrorContext,
5515 "HPP: Parsing PI\n");
5516#endif
5517 htmlParsePI(ctxt);
5518 ctxt->instate = XML_PARSER_PROLOG;
5519 } else if ((cur == '<') && (next == '!') &&
5520 (avail < 4)) {
5521 goto done;
5522 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005523 ctxt->instate = XML_PARSER_CONTENT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005524#ifdef DEBUG_PUSH
5525 xmlGenericError(xmlGenericErrorContext,
5526 "HPP: entering START_TAG\n");
5527#endif
5528 }
5529 break;
5530 case XML_PARSER_EPILOG:
5531 if (in->buf == NULL)
5532 avail = in->length - (in->cur - in->base);
5533 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005534 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5535 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005536 if (avail < 1)
5537 goto done;
5538 cur = in->cur[0];
5539 if (IS_BLANK_CH(cur)) {
5540 htmlParseCharData(ctxt);
5541 goto done;
5542 }
5543 if (avail < 2)
5544 goto done;
5545 next = in->cur[1];
5546 if ((cur == '<') && (next == '!') &&
5547 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Haibo Huangd75f3892021-01-05 21:34:50 -08005548 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005549 goto done;
5550#ifdef DEBUG_PUSH
5551 xmlGenericError(xmlGenericErrorContext,
5552 "HPP: Parsing Comment\n");
5553#endif
5554 htmlParseComment(ctxt);
5555 ctxt->instate = XML_PARSER_EPILOG;
5556 } else if ((cur == '<') && (next == '?')) {
5557 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005558 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005559 goto done;
5560#ifdef DEBUG_PUSH
5561 xmlGenericError(xmlGenericErrorContext,
5562 "HPP: Parsing PI\n");
5563#endif
5564 htmlParsePI(ctxt);
5565 ctxt->instate = XML_PARSER_EPILOG;
5566 } else if ((cur == '<') && (next == '!') &&
5567 (avail < 4)) {
5568 goto done;
5569 } else {
5570 ctxt->errNo = XML_ERR_DOCUMENT_END;
5571 ctxt->wellFormed = 0;
5572 ctxt->instate = XML_PARSER_EOF;
5573#ifdef DEBUG_PUSH
5574 xmlGenericError(xmlGenericErrorContext,
5575 "HPP: entering EOF\n");
5576#endif
5577 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5578 ctxt->sax->endDocument(ctxt->userData);
5579 goto done;
5580 }
5581 break;
5582 case XML_PARSER_START_TAG: {
5583 const xmlChar *name;
5584 int failed;
5585 const htmlElemDesc * info;
5586
5587 /*
5588 * no chars in buffer
5589 */
5590 if (avail < 1)
5591 goto done;
5592 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005593 * not enough chars in buffer
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005594 */
5595 if (avail < 2) {
5596 if (!terminate)
5597 goto done;
5598 else
5599 next = ' ';
5600 } else {
5601 next = in->cur[1];
5602 }
5603 cur = in->cur[0];
5604 if (cur != '<') {
5605 ctxt->instate = XML_PARSER_CONTENT;
5606#ifdef DEBUG_PUSH
5607 xmlGenericError(xmlGenericErrorContext,
5608 "HPP: entering CONTENT\n");
5609#endif
5610 break;
5611 }
5612 if (next == '/') {
5613 ctxt->instate = XML_PARSER_END_TAG;
5614 ctxt->checkIndex = 0;
5615#ifdef DEBUG_PUSH
5616 xmlGenericError(xmlGenericErrorContext,
5617 "HPP: entering END_TAG\n");
5618#endif
5619 break;
5620 }
5621 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005622 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005623 goto done;
5624
5625 /* Capture start position */
5626 if (ctxt->record_info) {
5627 node_info.begin_pos = ctxt->input->consumed +
5628 (CUR_PTR - ctxt->input->base);
5629 node_info.begin_line = ctxt->input->line;
5630 }
5631
5632
5633 failed = htmlParseStartTag(ctxt);
5634 name = ctxt->name;
5635 if ((failed == -1) ||
5636 (name == NULL)) {
5637 if (CUR == '>')
5638 NEXT;
5639 break;
5640 }
5641
5642 /*
5643 * Lookup the info for that element.
5644 */
5645 info = htmlTagLookup(name);
5646 if (info == NULL) {
5647 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5648 "Tag %s invalid\n", name, NULL);
5649 }
5650
5651 /*
5652 * Check for an Empty Element labeled the XML/SGML way
5653 */
5654 if ((CUR == '/') && (NXT(1) == '>')) {
5655 SKIP(2);
5656 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5657 ctxt->sax->endElement(ctxt->userData, name);
5658 htmlnamePop(ctxt);
5659 ctxt->instate = XML_PARSER_CONTENT;
5660#ifdef DEBUG_PUSH
5661 xmlGenericError(xmlGenericErrorContext,
5662 "HPP: entering CONTENT\n");
5663#endif
5664 break;
5665 }
5666
5667 if (CUR == '>') {
5668 NEXT;
5669 } else {
5670 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5671 "Couldn't find end of Start Tag %s\n",
5672 name, NULL);
5673
5674 /*
5675 * end of parsing of this node.
5676 */
5677 if (xmlStrEqual(name, ctxt->name)) {
5678 nodePop(ctxt);
5679 htmlnamePop(ctxt);
5680 }
5681
5682 if (ctxt->record_info)
5683 htmlNodeInfoPush(ctxt, &node_info);
5684
5685 ctxt->instate = XML_PARSER_CONTENT;
5686#ifdef DEBUG_PUSH
5687 xmlGenericError(xmlGenericErrorContext,
5688 "HPP: entering CONTENT\n");
5689#endif
5690 break;
5691 }
5692
5693 /*
5694 * Check for an Empty Element from DTD definition
5695 */
5696 if ((info != NULL) && (info->empty)) {
5697 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5698 ctxt->sax->endElement(ctxt->userData, name);
5699 htmlnamePop(ctxt);
5700 }
5701
5702 if (ctxt->record_info)
5703 htmlNodeInfoPush(ctxt, &node_info);
5704
5705 ctxt->instate = XML_PARSER_CONTENT;
5706#ifdef DEBUG_PUSH
5707 xmlGenericError(xmlGenericErrorContext,
5708 "HPP: entering CONTENT\n");
5709#endif
5710 break;
5711 }
5712 case XML_PARSER_CONTENT: {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005713 xmlChar chr[2] = { 0, 0 };
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005714
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005715 /*
5716 * Handle preparsed entities and charRef
5717 */
5718 if (ctxt->token != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005719 chr[0] = (xmlChar) ctxt->token;
5720 htmlCheckParagraph(ctxt);
5721 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5722 ctxt->sax->characters(ctxt->userData, chr, 1);
5723 ctxt->token = 0;
5724 ctxt->checkIndex = 0;
5725 }
5726 if ((avail == 1) && (terminate)) {
5727 cur = in->cur[0];
5728 if ((cur != '<') && (cur != '&')) {
5729 if (ctxt->sax != NULL) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005730 chr[0] = cur;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005731 if (IS_BLANK_CH(cur)) {
5732 if (ctxt->keepBlanks) {
5733 if (ctxt->sax->characters != NULL)
5734 ctxt->sax->characters(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005735 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005736 } else {
5737 if (ctxt->sax->ignorableWhitespace != NULL)
5738 ctxt->sax->ignorableWhitespace(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005739 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005740 }
5741 } else {
5742 htmlCheckParagraph(ctxt);
5743 if (ctxt->sax->characters != NULL)
5744 ctxt->sax->characters(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005745 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005746 }
5747 }
5748 ctxt->token = 0;
5749 ctxt->checkIndex = 0;
5750 in->cur++;
5751 break;
5752 }
5753 }
5754 if (avail < 2)
5755 goto done;
5756 cur = in->cur[0];
5757 next = in->cur[1];
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005758 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5759 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5760 /*
5761 * Handle SCRIPT/STYLE separately
5762 */
5763 if (!terminate) {
5764 int idx;
5765 xmlChar val;
5766
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005767 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005768 if (idx < 0)
5769 goto done;
5770 val = in->cur[idx + 2];
5771 if (val == 0) /* bad cut of input */
5772 goto done;
5773 }
5774 htmlParseScript(ctxt);
5775 if ((cur == '<') && (next == '/')) {
5776 ctxt->instate = XML_PARSER_END_TAG;
5777 ctxt->checkIndex = 0;
5778#ifdef DEBUG_PUSH
5779 xmlGenericError(xmlGenericErrorContext,
5780 "HPP: entering END_TAG\n");
5781#endif
5782 break;
5783 }
5784 } else {
5785 /*
5786 * Sometimes DOCTYPE arrives in the middle of the document
5787 */
5788 if ((cur == '<') && (next == '!') &&
5789 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5790 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5791 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5792 (UPP(8) == 'E')) {
5793 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005794 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005795 goto done;
5796 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5797 "Misplaced DOCTYPE declaration\n",
5798 BAD_CAST "DOCTYPE" , NULL);
5799 htmlParseDocTypeDecl(ctxt);
5800 } else if ((cur == '<') && (next == '!') &&
5801 (in->cur[2] == '-') && (in->cur[3] == '-')) {
Haibo Huangd75f3892021-01-05 21:34:50 -08005802 if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005803 goto done;
5804#ifdef DEBUG_PUSH
5805 xmlGenericError(xmlGenericErrorContext,
5806 "HPP: Parsing Comment\n");
5807#endif
5808 htmlParseComment(ctxt);
5809 ctxt->instate = XML_PARSER_CONTENT;
5810 } else if ((cur == '<') && (next == '?')) {
5811 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005812 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005813 goto done;
5814#ifdef DEBUG_PUSH
5815 xmlGenericError(xmlGenericErrorContext,
5816 "HPP: Parsing PI\n");
5817#endif
5818 htmlParsePI(ctxt);
5819 ctxt->instate = XML_PARSER_CONTENT;
5820 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5821 goto done;
5822 } else if ((cur == '<') && (next == '/')) {
5823 ctxt->instate = XML_PARSER_END_TAG;
5824 ctxt->checkIndex = 0;
5825#ifdef DEBUG_PUSH
5826 xmlGenericError(xmlGenericErrorContext,
5827 "HPP: entering END_TAG\n");
5828#endif
5829 break;
5830 } else if (cur == '<') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005831 if ((!terminate) && (next == 0))
5832 goto done;
5833 /*
5834 * Only switch to START_TAG if the next character
5835 * starts a valid name. Otherwise, htmlParseStartTag
5836 * might return without consuming all characters
5837 * up to the final '>'.
5838 */
5839 if ((IS_ASCII_LETTER(next)) ||
5840 (next == '_') || (next == ':') || (next == '.')) {
5841 ctxt->instate = XML_PARSER_START_TAG;
5842 ctxt->checkIndex = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005843#ifdef DEBUG_PUSH
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005844 xmlGenericError(xmlGenericErrorContext,
5845 "HPP: entering START_TAG\n");
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005846#endif
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005847 } else {
5848 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
5849 "htmlParseTryOrFinish: "
5850 "invalid element name\n",
5851 NULL, NULL);
5852 htmlCheckParagraph(ctxt);
5853 if ((ctxt->sax != NULL) &&
5854 (ctxt->sax->characters != NULL))
5855 ctxt->sax->characters(ctxt->userData,
5856 in->cur, 1);
5857 NEXT;
5858 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005859 break;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005860 } else {
5861 /*
5862 * check that the text sequence is complete
5863 * before handing out the data to the parser
5864 * to avoid problems with erroneous end of
5865 * data detection.
5866 */
5867 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005868 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005869 goto done;
5870 ctxt->checkIndex = 0;
5871#ifdef DEBUG_PUSH
5872 xmlGenericError(xmlGenericErrorContext,
5873 "HPP: Parsing char data\n");
5874#endif
Haibo Huangca689272021-02-09 16:43:43 -08005875 while ((ctxt->instate != XML_PARSER_EOF) &&
5876 (cur != '<') && (in->cur < in->end)) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005877 if (cur == '&') {
5878 htmlParseReference(ctxt);
5879 } else {
5880 htmlParseCharData(ctxt);
5881 }
5882 cur = in->cur[0];
5883 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005884 }
5885 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005886
5887 break;
5888 }
5889 case XML_PARSER_END_TAG:
5890 if (avail < 2)
5891 goto done;
5892 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005893 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005894 goto done;
5895 htmlParseEndTag(ctxt);
5896 if (ctxt->nameNr == 0) {
5897 ctxt->instate = XML_PARSER_EPILOG;
5898 } else {
5899 ctxt->instate = XML_PARSER_CONTENT;
5900 }
5901 ctxt->checkIndex = 0;
5902#ifdef DEBUG_PUSH
5903 xmlGenericError(xmlGenericErrorContext,
5904 "HPP: entering CONTENT\n");
5905#endif
5906 break;
5907 case XML_PARSER_CDATA_SECTION:
5908 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5909 "HPP: internal error, state == CDATA\n",
5910 NULL, NULL);
5911 ctxt->instate = XML_PARSER_CONTENT;
5912 ctxt->checkIndex = 0;
5913#ifdef DEBUG_PUSH
5914 xmlGenericError(xmlGenericErrorContext,
5915 "HPP: entering CONTENT\n");
5916#endif
5917 break;
5918 case XML_PARSER_DTD:
5919 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5920 "HPP: internal error, state == DTD\n",
5921 NULL, NULL);
5922 ctxt->instate = XML_PARSER_CONTENT;
5923 ctxt->checkIndex = 0;
5924#ifdef DEBUG_PUSH
5925 xmlGenericError(xmlGenericErrorContext,
5926 "HPP: entering CONTENT\n");
5927#endif
5928 break;
5929 case XML_PARSER_COMMENT:
5930 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5931 "HPP: internal error, state == COMMENT\n",
5932 NULL, NULL);
5933 ctxt->instate = XML_PARSER_CONTENT;
5934 ctxt->checkIndex = 0;
5935#ifdef DEBUG_PUSH
5936 xmlGenericError(xmlGenericErrorContext,
5937 "HPP: entering CONTENT\n");
5938#endif
5939 break;
5940 case XML_PARSER_PI:
5941 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5942 "HPP: internal error, state == PI\n",
5943 NULL, NULL);
5944 ctxt->instate = XML_PARSER_CONTENT;
5945 ctxt->checkIndex = 0;
5946#ifdef DEBUG_PUSH
5947 xmlGenericError(xmlGenericErrorContext,
5948 "HPP: entering CONTENT\n");
5949#endif
5950 break;
5951 case XML_PARSER_ENTITY_DECL:
5952 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5953 "HPP: internal error, state == ENTITY_DECL\n",
5954 NULL, NULL);
5955 ctxt->instate = XML_PARSER_CONTENT;
5956 ctxt->checkIndex = 0;
5957#ifdef DEBUG_PUSH
5958 xmlGenericError(xmlGenericErrorContext,
5959 "HPP: entering CONTENT\n");
5960#endif
5961 break;
5962 case XML_PARSER_ENTITY_VALUE:
5963 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5964 "HPP: internal error, state == ENTITY_VALUE\n",
5965 NULL, NULL);
5966 ctxt->instate = XML_PARSER_CONTENT;
5967 ctxt->checkIndex = 0;
5968#ifdef DEBUG_PUSH
5969 xmlGenericError(xmlGenericErrorContext,
5970 "HPP: entering DTD\n");
5971#endif
5972 break;
5973 case XML_PARSER_ATTRIBUTE_VALUE:
5974 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5975 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5976 NULL, NULL);
5977 ctxt->instate = XML_PARSER_START_TAG;
5978 ctxt->checkIndex = 0;
5979#ifdef DEBUG_PUSH
5980 xmlGenericError(xmlGenericErrorContext,
5981 "HPP: entering START_TAG\n");
5982#endif
5983 break;
5984 case XML_PARSER_SYSTEM_LITERAL:
5985 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5986 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5987 NULL, NULL);
5988 ctxt->instate = XML_PARSER_CONTENT;
5989 ctxt->checkIndex = 0;
5990#ifdef DEBUG_PUSH
5991 xmlGenericError(xmlGenericErrorContext,
5992 "HPP: entering CONTENT\n");
5993#endif
5994 break;
5995 case XML_PARSER_IGNORE:
5996 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5997 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5998 NULL, NULL);
5999 ctxt->instate = XML_PARSER_CONTENT;
6000 ctxt->checkIndex = 0;
6001#ifdef DEBUG_PUSH
6002 xmlGenericError(xmlGenericErrorContext,
6003 "HPP: entering CONTENT\n");
6004#endif
6005 break;
6006 case XML_PARSER_PUBLIC_LITERAL:
6007 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6008 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6009 NULL, NULL);
6010 ctxt->instate = XML_PARSER_CONTENT;
6011 ctxt->checkIndex = 0;
6012#ifdef DEBUG_PUSH
6013 xmlGenericError(xmlGenericErrorContext,
6014 "HPP: entering CONTENT\n");
6015#endif
6016 break;
6017
6018 }
6019 }
6020done:
6021 if ((avail == 0) && (terminate)) {
6022 htmlAutoCloseOnEnd(ctxt);
6023 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6024 /*
6025 * SAX: end of the document processing.
6026 */
6027 ctxt->instate = XML_PARSER_EOF;
6028 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6029 ctxt->sax->endDocument(ctxt->userData);
6030 }
6031 }
6032 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6033 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6034 (ctxt->instate == XML_PARSER_EPILOG))) {
6035 xmlDtdPtr dtd;
6036 dtd = xmlGetIntSubset(ctxt->myDoc);
6037 if (dtd == NULL)
6038 ctxt->myDoc->intSubset =
6039 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6040 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6041 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6042 }
6043#ifdef DEBUG_PUSH
6044 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6045#endif
6046 return(ret);
6047}
6048
6049/**
6050 * htmlParseChunk:
6051 * @ctxt: an HTML parser context
6052 * @chunk: an char array
6053 * @size: the size in byte of the chunk
6054 * @terminate: last chunk indicator
6055 *
6056 * Parse a Chunk of memory
6057 *
6058 * Returns zero if no error, the xmlParserErrors otherwise.
6059 */
6060int
6061htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6062 int terminate) {
6063 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6064 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6065 "htmlParseChunk: context error\n", NULL, NULL);
6066 return(XML_ERR_INTERNAL_ERROR);
6067 }
6068 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6069 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6070 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6071 size_t cur = ctxt->input->cur - ctxt->input->base;
6072 int res;
6073
6074 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006075 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006076 if (res < 0) {
6077 ctxt->errNo = XML_PARSER_EOF;
6078 ctxt->disableSAX = 1;
6079 return (XML_PARSER_EOF);
6080 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006081#ifdef DEBUG_PUSH
6082 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6083#endif
6084
6085#if 0
6086 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6087 htmlParseTryOrFinish(ctxt, terminate);
6088#endif
6089 } else if (ctxt->instate != XML_PARSER_EOF) {
6090 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6091 xmlParserInputBufferPtr in = ctxt->input->buf;
6092 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6093 (in->raw != NULL)) {
6094 int nbchars;
6095 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6096 size_t current = ctxt->input->cur - ctxt->input->base;
6097
6098 nbchars = xmlCharEncInput(in, terminate);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006099 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006100 if (nbchars < 0) {
6101 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6102 "encoder error\n", NULL, NULL);
6103 return(XML_ERR_INVALID_ENCODING);
6104 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006105 }
6106 }
6107 }
6108 htmlParseTryOrFinish(ctxt, terminate);
6109 if (terminate) {
6110 if ((ctxt->instate != XML_PARSER_EOF) &&
6111 (ctxt->instate != XML_PARSER_EPILOG) &&
6112 (ctxt->instate != XML_PARSER_MISC)) {
6113 ctxt->errNo = XML_ERR_DOCUMENT_END;
6114 ctxt->wellFormed = 0;
6115 }
6116 if (ctxt->instate != XML_PARSER_EOF) {
6117 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6118 ctxt->sax->endDocument(ctxt->userData);
6119 }
6120 ctxt->instate = XML_PARSER_EOF;
6121 }
6122 return((xmlParserErrors) ctxt->errNo);
6123}
6124
6125/************************************************************************
6126 * *
6127 * User entry points *
6128 * *
6129 ************************************************************************/
6130
6131/**
6132 * htmlCreatePushParserCtxt:
6133 * @sax: a SAX handler
6134 * @user_data: The user data returned on SAX callbacks
6135 * @chunk: a pointer to an array of chars
6136 * @size: number of chars in the array
6137 * @filename: an optional file name or URI
6138 * @enc: an optional encoding
6139 *
6140 * Create a parser context for using the HTML parser in push mode
6141 * The value of @filename is used for fetching external entities
6142 * and error/warning reports.
6143 *
6144 * Returns the new parser context or NULL
6145 */
6146htmlParserCtxtPtr
6147htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6148 const char *chunk, int size, const char *filename,
6149 xmlCharEncoding enc) {
6150 htmlParserCtxtPtr ctxt;
6151 htmlParserInputPtr inputStream;
6152 xmlParserInputBufferPtr buf;
6153
6154 xmlInitParser();
6155
6156 buf = xmlAllocParserInputBuffer(enc);
6157 if (buf == NULL) return(NULL);
6158
6159 ctxt = htmlNewParserCtxt();
6160 if (ctxt == NULL) {
6161 xmlFreeParserInputBuffer(buf);
6162 return(NULL);
6163 }
6164 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6165 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6166 if (sax != NULL) {
6167 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6168 xmlFree(ctxt->sax);
6169 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6170 if (ctxt->sax == NULL) {
6171 xmlFree(buf);
6172 xmlFree(ctxt);
6173 return(NULL);
6174 }
6175 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6176 if (user_data != NULL)
6177 ctxt->userData = user_data;
6178 }
6179 if (filename == NULL) {
6180 ctxt->directory = NULL;
6181 } else {
6182 ctxt->directory = xmlParserGetDirectory(filename);
6183 }
6184
6185 inputStream = htmlNewInputStream(ctxt);
6186 if (inputStream == NULL) {
6187 xmlFreeParserCtxt(ctxt);
6188 xmlFree(buf);
6189 return(NULL);
6190 }
6191
6192 if (filename == NULL)
6193 inputStream->filename = NULL;
6194 else
6195 inputStream->filename = (char *)
6196 xmlCanonicPath((const xmlChar *) filename);
6197 inputStream->buf = buf;
6198 xmlBufResetInput(buf->buffer, inputStream);
6199
6200 inputPush(ctxt, inputStream);
6201
6202 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6203 (ctxt->input->buf != NULL)) {
6204 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6205 size_t cur = ctxt->input->cur - ctxt->input->base;
6206
6207 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6208
6209 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6210#ifdef DEBUG_PUSH
6211 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6212#endif
6213 }
6214 ctxt->progressive = 1;
6215
6216 return(ctxt);
6217}
6218#endif /* LIBXML_PUSH_ENABLED */
6219
6220/**
6221 * htmlSAXParseDoc:
6222 * @cur: a pointer to an array of xmlChar
6223 * @encoding: a free form C string describing the HTML document encoding, or NULL
6224 * @sax: the SAX handler block
6225 * @userData: if using SAX, this pointer will be provided on callbacks.
6226 *
6227 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6228 * to handle parse events. If sax is NULL, fallback to the default DOM
6229 * behavior and return a tree.
6230 *
6231 * Returns the resulting document tree unless SAX is NULL or the document is
6232 * not well formed.
6233 */
6234
6235htmlDocPtr
6236htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6237 htmlSAXHandlerPtr sax, void *userData) {
6238 htmlDocPtr ret;
6239 htmlParserCtxtPtr ctxt;
6240
6241 xmlInitParser();
6242
6243 if (cur == NULL) return(NULL);
6244
6245
6246 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6247 if (ctxt == NULL) return(NULL);
6248 if (sax != NULL) {
6249 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6250 ctxt->sax = sax;
6251 ctxt->userData = userData;
6252 }
6253
6254 htmlParseDocument(ctxt);
6255 ret = ctxt->myDoc;
6256 if (sax != NULL) {
6257 ctxt->sax = NULL;
6258 ctxt->userData = NULL;
6259 }
6260 htmlFreeParserCtxt(ctxt);
6261
6262 return(ret);
6263}
6264
6265/**
6266 * htmlParseDoc:
6267 * @cur: a pointer to an array of xmlChar
6268 * @encoding: a free form C string describing the HTML document encoding, or NULL
6269 *
6270 * parse an HTML in-memory document and build a tree.
6271 *
6272 * Returns the resulting document tree
6273 */
6274
6275htmlDocPtr
6276htmlParseDoc(const xmlChar *cur, const char *encoding) {
6277 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6278}
6279
6280
6281/**
6282 * htmlCreateFileParserCtxt:
6283 * @filename: the filename
6284 * @encoding: a free form C string describing the HTML document encoding, or NULL
6285 *
6286 * Create a parser context for a file content.
6287 * Automatic support for ZLIB/Compress compressed document is provided
6288 * by default if found at compile-time.
6289 *
6290 * Returns the new parser context or NULL
6291 */
6292htmlParserCtxtPtr
6293htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6294{
6295 htmlParserCtxtPtr ctxt;
6296 htmlParserInputPtr inputStream;
6297 char *canonicFilename;
6298 /* htmlCharEncoding enc; */
6299 xmlChar *content, *content_line = (xmlChar *) "charset=";
6300
6301 if (filename == NULL)
6302 return(NULL);
6303
6304 ctxt = htmlNewParserCtxt();
6305 if (ctxt == NULL) {
6306 return(NULL);
6307 }
6308 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6309 if (canonicFilename == NULL) {
6310#ifdef LIBXML_SAX1_ENABLED
6311 if (xmlDefaultSAXHandler.error != NULL) {
6312 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6313 }
6314#endif
6315 xmlFreeParserCtxt(ctxt);
6316 return(NULL);
6317 }
6318
6319 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6320 xmlFree(canonicFilename);
6321 if (inputStream == NULL) {
6322 xmlFreeParserCtxt(ctxt);
6323 return(NULL);
6324 }
6325
6326 inputPush(ctxt, inputStream);
6327
6328 /* set encoding */
6329 if (encoding) {
6330 size_t l = strlen(encoding);
6331
6332 if (l < 1000) {
6333 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6334 if (content) {
6335 strcpy ((char *)content, (char *)content_line);
6336 strcat ((char *)content, (char *)encoding);
6337 htmlCheckEncoding (ctxt, content);
6338 xmlFree (content);
6339 }
6340 }
6341 }
6342
6343 return(ctxt);
6344}
6345
6346/**
6347 * htmlSAXParseFile:
6348 * @filename: the filename
6349 * @encoding: a free form C string describing the HTML document encoding, or NULL
6350 * @sax: the SAX handler block
6351 * @userData: if using SAX, this pointer will be provided on callbacks.
6352 *
6353 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6354 * compressed document is provided by default if found at compile-time.
6355 * It use the given SAX function block to handle the parsing callback.
6356 * If sax is NULL, fallback to the default DOM tree building routines.
6357 *
6358 * Returns the resulting document tree unless SAX is NULL or the document is
6359 * not well formed.
6360 */
6361
6362htmlDocPtr
6363htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6364 void *userData) {
6365 htmlDocPtr ret;
6366 htmlParserCtxtPtr ctxt;
6367 htmlSAXHandlerPtr oldsax = NULL;
6368
6369 xmlInitParser();
6370
6371 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6372 if (ctxt == NULL) return(NULL);
6373 if (sax != NULL) {
6374 oldsax = ctxt->sax;
6375 ctxt->sax = sax;
6376 ctxt->userData = userData;
6377 }
6378
6379 htmlParseDocument(ctxt);
6380
6381 ret = ctxt->myDoc;
6382 if (sax != NULL) {
6383 ctxt->sax = oldsax;
6384 ctxt->userData = NULL;
6385 }
6386 htmlFreeParserCtxt(ctxt);
6387
6388 return(ret);
6389}
6390
6391/**
6392 * htmlParseFile:
6393 * @filename: the filename
6394 * @encoding: a free form C string describing the HTML document encoding, or NULL
6395 *
6396 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6397 * compressed document is provided by default if found at compile-time.
6398 *
6399 * Returns the resulting document tree
6400 */
6401
6402htmlDocPtr
6403htmlParseFile(const char *filename, const char *encoding) {
6404 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6405}
6406
6407/**
6408 * htmlHandleOmittedElem:
6409 * @val: int 0 or 1
6410 *
6411 * Set and return the previous value for handling HTML omitted tags.
6412 *
6413 * Returns the last value for 0 for no handling, 1 for auto insertion.
6414 */
6415
6416int
6417htmlHandleOmittedElem(int val) {
6418 int old = htmlOmittedDefaultValue;
6419
6420 htmlOmittedDefaultValue = val;
6421 return(old);
6422}
6423
6424/**
6425 * htmlElementAllowedHere:
6426 * @parent: HTML parent element
6427 * @elt: HTML element
6428 *
6429 * Checks whether an HTML element may be a direct child of a parent element.
6430 * Note - doesn't check for deprecated elements
6431 *
6432 * Returns 1 if allowed; 0 otherwise.
6433 */
6434int
6435htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6436 const char** p ;
6437
6438 if ( ! elt || ! parent || ! parent->subelts )
6439 return 0 ;
6440
6441 for ( p = parent->subelts; *p; ++p )
6442 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6443 return 1 ;
6444
6445 return 0 ;
6446}
6447/**
6448 * htmlElementStatusHere:
6449 * @parent: HTML parent element
6450 * @elt: HTML element
6451 *
6452 * Checks whether an HTML element may be a direct child of a parent element.
6453 * and if so whether it is valid or deprecated.
6454 *
6455 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6456 */
6457htmlStatus
6458htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6459 if ( ! parent || ! elt )
6460 return HTML_INVALID ;
6461 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6462 return HTML_INVALID ;
6463
6464 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6465}
6466/**
6467 * htmlAttrAllowed:
6468 * @elt: HTML element
6469 * @attr: HTML attribute
6470 * @legacy: whether to allow deprecated attributes
6471 *
6472 * Checks whether an attribute is valid for an element
6473 * Has full knowledge of Required and Deprecated attributes
6474 *
6475 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6476 */
6477htmlStatus
6478htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6479 const char** p ;
6480
6481 if ( !elt || ! attr )
6482 return HTML_INVALID ;
6483
6484 if ( elt->attrs_req )
6485 for ( p = elt->attrs_req; *p; ++p)
6486 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6487 return HTML_REQUIRED ;
6488
6489 if ( elt->attrs_opt )
6490 for ( p = elt->attrs_opt; *p; ++p)
6491 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6492 return HTML_VALID ;
6493
6494 if ( legacy && elt->attrs_depr )
6495 for ( p = elt->attrs_depr; *p; ++p)
6496 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6497 return HTML_DEPRECATED ;
6498
6499 return HTML_INVALID ;
6500}
6501/**
6502 * htmlNodeStatus:
6503 * @node: an htmlNodePtr in a tree
6504 * @legacy: whether to allow deprecated elements (YES is faster here
6505 * for Element nodes)
6506 *
6507 * Checks whether the tree node is valid. Experimental (the author
6508 * only uses the HTML enhancements in a SAX parser)
6509 *
6510 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6511 * legacy allowed) or htmlElementStatusHere (otherwise).
6512 * for Attribute nodes, a return from htmlAttrAllowed
6513 * for other nodes, HTML_NA (no checks performed)
6514 */
6515htmlStatus
6516htmlNodeStatus(const htmlNodePtr node, int legacy) {
6517 if ( ! node )
6518 return HTML_INVALID ;
6519
6520 switch ( node->type ) {
6521 case XML_ELEMENT_NODE:
6522 return legacy
6523 ? ( htmlElementAllowedHere (
6524 htmlTagLookup(node->parent->name) , node->name
6525 ) ? HTML_VALID : HTML_INVALID )
6526 : htmlElementStatusHere(
6527 htmlTagLookup(node->parent->name) ,
6528 htmlTagLookup(node->name) )
6529 ;
6530 case XML_ATTRIBUTE_NODE:
6531 return htmlAttrAllowed(
6532 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6533 default: return HTML_NA ;
6534 }
6535}
6536/************************************************************************
6537 * *
6538 * New set (2.6.0) of simpler and more flexible APIs *
6539 * *
6540 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is expanded.
 *
 * The body is wrapped in do { } while (0) and carries no trailing
 * semicolon, so that "DICT_FREE(x);" is exactly one statement and can be
 * used safely as the body of an if/else.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6552
6553/**
6554 * htmlCtxtReset:
6555 * @ctxt: an HTML parser context
6556 *
6557 * Reset a parser context
6558 */
6559void
6560htmlCtxtReset(htmlParserCtxtPtr ctxt)
6561{
6562 xmlParserInputPtr input;
6563 xmlDictPtr dict;
6564
6565 if (ctxt == NULL)
6566 return;
6567
6568 xmlInitParser();
6569 dict = ctxt->dict;
6570
6571 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6572 xmlFreeInputStream(input);
6573 }
6574 ctxt->inputNr = 0;
6575 ctxt->input = NULL;
6576
6577 ctxt->spaceNr = 0;
6578 if (ctxt->spaceTab != NULL) {
6579 ctxt->spaceTab[0] = -1;
6580 ctxt->space = &ctxt->spaceTab[0];
6581 } else {
6582 ctxt->space = NULL;
6583 }
6584
6585
6586 ctxt->nodeNr = 0;
6587 ctxt->node = NULL;
6588
6589 ctxt->nameNr = 0;
6590 ctxt->name = NULL;
6591
6592 DICT_FREE(ctxt->version);
6593 ctxt->version = NULL;
6594 DICT_FREE(ctxt->encoding);
6595 ctxt->encoding = NULL;
6596 DICT_FREE(ctxt->directory);
6597 ctxt->directory = NULL;
6598 DICT_FREE(ctxt->extSubURI);
6599 ctxt->extSubURI = NULL;
6600 DICT_FREE(ctxt->extSubSystem);
6601 ctxt->extSubSystem = NULL;
6602 if (ctxt->myDoc != NULL)
6603 xmlFreeDoc(ctxt->myDoc);
6604 ctxt->myDoc = NULL;
6605
6606 ctxt->standalone = -1;
6607 ctxt->hasExternalSubset = 0;
6608 ctxt->hasPErefs = 0;
6609 ctxt->html = 1;
6610 ctxt->external = 0;
6611 ctxt->instate = XML_PARSER_START;
6612 ctxt->token = 0;
6613
6614 ctxt->wellFormed = 1;
6615 ctxt->nsWellFormed = 1;
6616 ctxt->disableSAX = 0;
6617 ctxt->valid = 1;
6618 ctxt->vctxt.userData = ctxt;
6619 ctxt->vctxt.error = xmlParserValidityError;
6620 ctxt->vctxt.warning = xmlParserValidityWarning;
6621 ctxt->record_info = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006622 ctxt->checkIndex = 0;
6623 ctxt->inSubset = 0;
6624 ctxt->errNo = XML_ERR_OK;
6625 ctxt->depth = 0;
6626 ctxt->charset = XML_CHAR_ENCODING_NONE;
6627 ctxt->catalogs = NULL;
6628 xmlInitNodeInfoSeq(&ctxt->node_seq);
6629
6630 if (ctxt->attsDefault != NULL) {
6631 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6632 ctxt->attsDefault = NULL;
6633 }
6634 if (ctxt->attsSpecial != NULL) {
6635 xmlHashFree(ctxt->attsSpecial, NULL);
6636 ctxt->attsSpecial = NULL;
6637 }
6638}
6639
6640/**
6641 * htmlCtxtUseOptions:
6642 * @ctxt: an HTML parser context
6643 * @options: a combination of htmlParserOption(s)
6644 *
6645 * Applies the options to the parser context
6646 *
6647 * Returns 0 in case of success, the set of unknown or unimplemented options
6648 * in case of error.
6649 */
6650int
6651htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6652{
6653 if (ctxt == NULL)
6654 return(-1);
6655
6656 if (options & HTML_PARSE_NOWARNING) {
6657 ctxt->sax->warning = NULL;
6658 ctxt->vctxt.warning = NULL;
6659 options -= XML_PARSE_NOWARNING;
6660 ctxt->options |= XML_PARSE_NOWARNING;
6661 }
6662 if (options & HTML_PARSE_NOERROR) {
6663 ctxt->sax->error = NULL;
6664 ctxt->vctxt.error = NULL;
6665 ctxt->sax->fatalError = NULL;
6666 options -= XML_PARSE_NOERROR;
6667 ctxt->options |= XML_PARSE_NOERROR;
6668 }
6669 if (options & HTML_PARSE_PEDANTIC) {
6670 ctxt->pedantic = 1;
6671 options -= XML_PARSE_PEDANTIC;
6672 ctxt->options |= XML_PARSE_PEDANTIC;
6673 } else
6674 ctxt->pedantic = 0;
6675 if (options & XML_PARSE_NOBLANKS) {
6676 ctxt->keepBlanks = 0;
6677 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6678 options -= XML_PARSE_NOBLANKS;
6679 ctxt->options |= XML_PARSE_NOBLANKS;
6680 } else
6681 ctxt->keepBlanks = 1;
6682 if (options & HTML_PARSE_RECOVER) {
6683 ctxt->recovery = 1;
6684 options -= HTML_PARSE_RECOVER;
6685 } else
6686 ctxt->recovery = 0;
6687 if (options & HTML_PARSE_COMPACT) {
6688 ctxt->options |= HTML_PARSE_COMPACT;
6689 options -= HTML_PARSE_COMPACT;
6690 }
6691 if (options & XML_PARSE_HUGE) {
6692 ctxt->options |= XML_PARSE_HUGE;
6693 options -= XML_PARSE_HUGE;
6694 }
6695 if (options & HTML_PARSE_NODEFDTD) {
6696 ctxt->options |= HTML_PARSE_NODEFDTD;
6697 options -= HTML_PARSE_NODEFDTD;
6698 }
6699 if (options & HTML_PARSE_IGNORE_ENC) {
6700 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6701 options -= HTML_PARSE_IGNORE_ENC;
6702 }
6703 if (options & HTML_PARSE_NOIMPLIED) {
6704 ctxt->options |= HTML_PARSE_NOIMPLIED;
6705 options -= HTML_PARSE_NOIMPLIED;
6706 }
6707 ctxt->dictNames = 0;
6708 return (options);
6709}
6710
6711/**
6712 * htmlDoRead:
6713 * @ctxt: an HTML parser context
6714 * @URL: the base URL to use for the document
6715 * @encoding: the document encoding, or NULL
6716 * @options: a combination of htmlParserOption(s)
6717 * @reuse: keep the context for reuse
6718 *
6719 * Common front-end for the htmlRead functions
6720 *
6721 * Returns the resulting document tree or NULL
6722 */
6723static htmlDocPtr
6724htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6725 int options, int reuse)
6726{
6727 htmlDocPtr ret;
6728
6729 htmlCtxtUseOptions(ctxt, options);
6730 ctxt->html = 1;
6731 if (encoding != NULL) {
6732 xmlCharEncodingHandlerPtr hdlr;
6733
6734 hdlr = xmlFindCharEncodingHandler(encoding);
6735 if (hdlr != NULL) {
6736 xmlSwitchToEncoding(ctxt, hdlr);
6737 if (ctxt->input->encoding != NULL)
6738 xmlFree((xmlChar *) ctxt->input->encoding);
6739 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6740 }
6741 }
6742 if ((URL != NULL) && (ctxt->input != NULL) &&
6743 (ctxt->input->filename == NULL))
6744 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6745 htmlParseDocument(ctxt);
6746 ret = ctxt->myDoc;
6747 ctxt->myDoc = NULL;
6748 if (!reuse) {
6749 if ((ctxt->dictNames) &&
6750 (ret != NULL) &&
6751 (ret->dict == ctxt->dict))
6752 ctxt->dict = NULL;
6753 xmlFreeParserCtxt(ctxt);
6754 }
6755 return (ret);
6756}
6757
6758/**
6759 * htmlReadDoc:
6760 * @cur: a pointer to a zero terminated string
6761 * @URL: the base URL to use for the document
6762 * @encoding: the document encoding, or NULL
6763 * @options: a combination of htmlParserOption(s)
6764 *
6765 * parse an XML in-memory document and build a tree.
6766 *
6767 * Returns the resulting document tree
6768 */
6769htmlDocPtr
6770htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6771{
6772 htmlParserCtxtPtr ctxt;
6773
6774 if (cur == NULL)
6775 return (NULL);
6776
6777 xmlInitParser();
6778 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6779 if (ctxt == NULL)
6780 return (NULL);
6781 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6782}
6783
6784/**
6785 * htmlReadFile:
6786 * @filename: a file or URL
6787 * @encoding: the document encoding, or NULL
6788 * @options: a combination of htmlParserOption(s)
6789 *
6790 * parse an XML file from the filesystem or the network.
6791 *
6792 * Returns the resulting document tree
6793 */
6794htmlDocPtr
6795htmlReadFile(const char *filename, const char *encoding, int options)
6796{
6797 htmlParserCtxtPtr ctxt;
6798
6799 xmlInitParser();
6800 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6801 if (ctxt == NULL)
6802 return (NULL);
6803 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6804}
6805
6806/**
6807 * htmlReadMemory:
6808 * @buffer: a pointer to a char array
6809 * @size: the size of the array
6810 * @URL: the base URL to use for the document
6811 * @encoding: the document encoding, or NULL
6812 * @options: a combination of htmlParserOption(s)
6813 *
6814 * parse an XML in-memory document and build a tree.
6815 *
6816 * Returns the resulting document tree
6817 */
6818htmlDocPtr
6819htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6820{
6821 htmlParserCtxtPtr ctxt;
6822
6823 xmlInitParser();
6824 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6825 if (ctxt == NULL)
6826 return (NULL);
6827 htmlDefaultSAXHandlerInit();
6828 if (ctxt->sax != NULL)
6829 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6830 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6831}
6832
6833/**
6834 * htmlReadFd:
6835 * @fd: an open file descriptor
6836 * @URL: the base URL to use for the document
6837 * @encoding: the document encoding, or NULL
6838 * @options: a combination of htmlParserOption(s)
6839 *
6840 * parse an XML from a file descriptor and build a tree.
6841 *
6842 * Returns the resulting document tree
6843 */
6844htmlDocPtr
6845htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6846{
6847 htmlParserCtxtPtr ctxt;
6848 xmlParserInputBufferPtr input;
6849 xmlParserInputPtr stream;
6850
6851 if (fd < 0)
6852 return (NULL);
6853 xmlInitParser();
6854
6855 xmlInitParser();
6856 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6857 if (input == NULL)
6858 return (NULL);
6859 ctxt = xmlNewParserCtxt();
6860 if (ctxt == NULL) {
6861 xmlFreeParserInputBuffer(input);
6862 return (NULL);
6863 }
6864 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6865 if (stream == NULL) {
6866 xmlFreeParserInputBuffer(input);
6867 xmlFreeParserCtxt(ctxt);
6868 return (NULL);
6869 }
6870 inputPush(ctxt, stream);
6871 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6872}
6873
6874/**
6875 * htmlReadIO:
6876 * @ioread: an I/O read function
6877 * @ioclose: an I/O close function
6878 * @ioctx: an I/O handler
6879 * @URL: the base URL to use for the document
6880 * @encoding: the document encoding, or NULL
6881 * @options: a combination of htmlParserOption(s)
6882 *
6883 * parse an HTML document from I/O functions and source and build a tree.
6884 *
6885 * Returns the resulting document tree
6886 */
6887htmlDocPtr
6888htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6889 void *ioctx, const char *URL, const char *encoding, int options)
6890{
6891 htmlParserCtxtPtr ctxt;
6892 xmlParserInputBufferPtr input;
6893 xmlParserInputPtr stream;
6894
6895 if (ioread == NULL)
6896 return (NULL);
6897 xmlInitParser();
6898
6899 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6900 XML_CHAR_ENCODING_NONE);
6901 if (input == NULL) {
6902 if (ioclose != NULL)
6903 ioclose(ioctx);
6904 return (NULL);
6905 }
6906 ctxt = htmlNewParserCtxt();
6907 if (ctxt == NULL) {
6908 xmlFreeParserInputBuffer(input);
6909 return (NULL);
6910 }
6911 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6912 if (stream == NULL) {
6913 xmlFreeParserInputBuffer(input);
6914 xmlFreeParserCtxt(ctxt);
6915 return (NULL);
6916 }
6917 inputPush(ctxt, stream);
6918 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6919}
6920
6921/**
6922 * htmlCtxtReadDoc:
6923 * @ctxt: an HTML parser context
6924 * @cur: a pointer to a zero terminated string
6925 * @URL: the base URL to use for the document
6926 * @encoding: the document encoding, or NULL
6927 * @options: a combination of htmlParserOption(s)
6928 *
6929 * parse an XML in-memory document and build a tree.
6930 * This reuses the existing @ctxt parser context
6931 *
6932 * Returns the resulting document tree
6933 */
6934htmlDocPtr
6935htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6936 const char *URL, const char *encoding, int options)
6937{
6938 xmlParserInputPtr stream;
6939
6940 if (cur == NULL)
6941 return (NULL);
6942 if (ctxt == NULL)
6943 return (NULL);
6944 xmlInitParser();
6945
6946 htmlCtxtReset(ctxt);
6947
6948 stream = xmlNewStringInputStream(ctxt, cur);
6949 if (stream == NULL) {
6950 return (NULL);
6951 }
6952 inputPush(ctxt, stream);
6953 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6954}
6955
6956/**
6957 * htmlCtxtReadFile:
6958 * @ctxt: an HTML parser context
6959 * @filename: a file or URL
6960 * @encoding: the document encoding, or NULL
6961 * @options: a combination of htmlParserOption(s)
6962 *
6963 * parse an XML file from the filesystem or the network.
6964 * This reuses the existing @ctxt parser context
6965 *
6966 * Returns the resulting document tree
6967 */
6968htmlDocPtr
6969htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6970 const char *encoding, int options)
6971{
6972 xmlParserInputPtr stream;
6973
6974 if (filename == NULL)
6975 return (NULL);
6976 if (ctxt == NULL)
6977 return (NULL);
6978 xmlInitParser();
6979
6980 htmlCtxtReset(ctxt);
6981
6982 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6983 if (stream == NULL) {
6984 return (NULL);
6985 }
6986 inputPush(ctxt, stream);
6987 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6988}
6989
6990/**
6991 * htmlCtxtReadMemory:
6992 * @ctxt: an HTML parser context
6993 * @buffer: a pointer to a char array
6994 * @size: the size of the array
6995 * @URL: the base URL to use for the document
6996 * @encoding: the document encoding, or NULL
6997 * @options: a combination of htmlParserOption(s)
6998 *
6999 * parse an XML in-memory document and build a tree.
7000 * This reuses the existing @ctxt parser context
7001 *
7002 * Returns the resulting document tree
7003 */
7004htmlDocPtr
7005htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7006 const char *URL, const char *encoding, int options)
7007{
7008 xmlParserInputBufferPtr input;
7009 xmlParserInputPtr stream;
7010
7011 if (ctxt == NULL)
7012 return (NULL);
7013 if (buffer == NULL)
7014 return (NULL);
7015 xmlInitParser();
7016
7017 htmlCtxtReset(ctxt);
7018
7019 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7020 if (input == NULL) {
7021 return(NULL);
7022 }
7023
7024 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7025 if (stream == NULL) {
7026 xmlFreeParserInputBuffer(input);
7027 return(NULL);
7028 }
7029
7030 inputPush(ctxt, stream);
7031 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7032}
7033
7034/**
7035 * htmlCtxtReadFd:
7036 * @ctxt: an HTML parser context
7037 * @fd: an open file descriptor
7038 * @URL: the base URL to use for the document
7039 * @encoding: the document encoding, or NULL
7040 * @options: a combination of htmlParserOption(s)
7041 *
7042 * parse an XML from a file descriptor and build a tree.
7043 * This reuses the existing @ctxt parser context
7044 *
7045 * Returns the resulting document tree
7046 */
7047htmlDocPtr
7048htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7049 const char *URL, const char *encoding, int options)
7050{
7051 xmlParserInputBufferPtr input;
7052 xmlParserInputPtr stream;
7053
7054 if (fd < 0)
7055 return (NULL);
7056 if (ctxt == NULL)
7057 return (NULL);
7058 xmlInitParser();
7059
7060 htmlCtxtReset(ctxt);
7061
7062
7063 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7064 if (input == NULL)
7065 return (NULL);
7066 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7067 if (stream == NULL) {
7068 xmlFreeParserInputBuffer(input);
7069 return (NULL);
7070 }
7071 inputPush(ctxt, stream);
7072 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7073}
7074
7075/**
7076 * htmlCtxtReadIO:
7077 * @ctxt: an HTML parser context
7078 * @ioread: an I/O read function
7079 * @ioclose: an I/O close function
7080 * @ioctx: an I/O handler
7081 * @URL: the base URL to use for the document
7082 * @encoding: the document encoding, or NULL
7083 * @options: a combination of htmlParserOption(s)
7084 *
7085 * parse an HTML document from I/O functions and source and build a tree.
7086 * This reuses the existing @ctxt parser context
7087 *
7088 * Returns the resulting document tree
7089 */
7090htmlDocPtr
7091htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7092 xmlInputCloseCallback ioclose, void *ioctx,
7093 const char *URL,
7094 const char *encoding, int options)
7095{
7096 xmlParserInputBufferPtr input;
7097 xmlParserInputPtr stream;
7098
7099 if (ioread == NULL)
7100 return (NULL);
7101 if (ctxt == NULL)
7102 return (NULL);
7103 xmlInitParser();
7104
7105 htmlCtxtReset(ctxt);
7106
7107 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7108 XML_CHAR_ENCODING_NONE);
7109 if (input == NULL) {
7110 if (ioclose != NULL)
7111 ioclose(ioctx);
7112 return (NULL);
7113 }
7114 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7115 if (stream == NULL) {
7116 xmlFreeParserInputBuffer(input);
7117 return (NULL);
7118 }
7119 inputPush(ctxt, stream);
7120 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7121}
7122
7123#define bottom_HTMLparser
7124#include "elfgcchack.h"
7125#endif /* LIBXML_HTML_ENABLED */