blob: b981298544a6c671344759e0321d27113c2e1548 [file] [log] [blame]
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef LIBXML_ZLIB_ENABLED
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
39#include <libxml/HTMLtree.h>
40#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
44#include <libxml/globals.h>
45#include <libxml/uri.h>
46
47#include "buf.h"
48#include "enc.h"
49
50#define HTML_MAX_NAMELEN 1000
51#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52#define HTML_PARSER_BUFFER_SIZE 100
53
54/* #define DEBUG */
55/* #define DEBUG_PUSH */
56
57static int htmlOmittedDefaultValue = 1;
58
59xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
61static void htmlParseComment(htmlParserCtxtPtr ctxt);
62
63/************************************************************************
64 * *
65 * Some factorized error routines *
66 * *
67 ************************************************************************/
68
69/**
70 * htmlErrMemory:
71 * @ctxt: an HTML parser context
Haibo Huangcfd91dc2020-07-30 23:01:33 -070072 * @extra: extra information
Elliott Hughes7fbecab2019-01-10 16:42:03 -080073 *
74 * Handle a redefinition of attribute error
75 */
76static void
77htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78{
79 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
82 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
88 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
93 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96}
97
98/**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
108static void LIBXML_ATTR_FORMAT(3,0)
109htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111{
112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
115 if (ctxt != NULL)
116 ctxt->errNo = error;
117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
124}
125
126/**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
135static void LIBXML_ATTR_FORMAT(3,0)
136htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138{
139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
142 if (ctxt != NULL)
143 ctxt->errNo = error;
144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
149}
150
151/************************************************************************
152 * *
153 * Parser stacks related functions and macros *
154 * *
155 ************************************************************************/
156
157/**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
165 */
166static int
167htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168{
169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
175 ctxt->nameTab = (const xmlChar * *)
176 xmlRealloc((xmlChar * *)ctxt->nameTab,
177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
180 htmlErrMemory(ctxt, NULL);
181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187}
188/**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
196static const xmlChar *
197htmlnamePop(htmlParserCtxtPtr ctxt)
198{
199 const xmlChar *ret;
200
201 if (ctxt->nameNr <= 0)
202 return (NULL);
203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
205 return (NULL);
206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
211 ctxt->nameTab[ctxt->nameNr] = NULL;
212 return (ret);
213}
214
215/**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224static int
225htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226{
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243}
244
245/**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253static htmlParserNodeInfo *
254htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255{
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266}
267
268/*
269 * Macros for accessing the content. Those should be used only by the parser,
270 * and not exported.
271 *
272 * Dirty macros, i.e. one need to make assumption on the context to use them
273 *
274 * CUR_PTR return the current pointer to the xmlChar to be parsed.
275 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
276 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
277 * in UNICODE mode. This should be used internally by the parser
278 * only to compare to ASCII values otherwise it would break when
279 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare against ASCII-based substrings.
282 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
283 * it should be used only to compare on ASCII based substring.
284 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
285 * strings without newlines within the parser.
286 *
287 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
288 *
289 * CURRENT Returns the current char value, with the full decoding of
290 * UTF-8 if we are using this mode. It returns an int.
291 * NEXT Skip to the next character, this does the proper decoding
292 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
293 * NEXTL(l) Skip the current unicode character of l xmlChars long.
294 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
295 */
296
297#define UPPER (toupper(*ctxt->input->cur))
298
299#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
300
301#define NXT(val) ctxt->input->cur[(val)]
302
303#define UPP(val) (toupper(ctxt->input->cur[(val)]))
304
305#define CUR_PTR ctxt->input->cur
306#define BASE_PTR ctxt->input->base
307
/*
 * SHRINK asks the input to discard consumed data once the read position
 * is more than 2 * INPUT_CHUNK past the buffer base and fewer than
 * 2 * INPUT_CHUNK bytes remain ahead of it.
 * GROW asks the input for more data when fewer than INPUT_CHUNK bytes
 * remain and the parser is not in progressive mode.
 *
 * Both are wrapped in do { } while (0) so that they behave as a single
 * statement and cannot capture an "else" that follows the invocation.
 */
#define SHRINK do { \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input); \
  } while (0)

#define GROW do { \
    if ((ctxt->progressive == 0) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \
  } while (0)
315
316#define CURRENT ((int) (*ctxt->input->cur))
317
318#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
319
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700320/* Imported from XML */
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800321
322/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
323#define CUR ((int) (*ctxt->input->cur))
324#define NEXT xmlNextChar(ctxt)
325
326#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
327
328
329#define NEXTL(l) do { \
330 if (*(ctxt->input->cur) == '\n') { \
331 ctxt->input->line++; ctxt->input->col = 1; \
332 } else ctxt->input->col++; \
333 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
334 } while (0)
335
336/************
337 \
338 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
339 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
340 ************/
341
342#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
343#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
344
/*
 * COPY_BUF(l,b,i,v): append character v, whose encoded length is l, to
 * buffer b at index i and advance i. A length of 1 stores the value as a
 * single xmlChar; any other length goes through xmlCopyChar().
 *
 * Wrapped in do { } while (0) and with parenthesized arguments so that it
 * behaves as a single statement and is safe with expression arguments.
 */
#define COPY_BUF(l,b,i,v) do { \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v); \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v)); \
  } while (0)
348
349/**
350 * htmlFindEncoding:
351 * @the HTML parser context
352 *
353 * Ty to find and encoding in the current data available in the input
354 * buffer this is needed to try to switch to the proper encoding when
355 * one face a character error.
356 * That's an heuristic, since it's operating outside of parsing it could
357 * try to use a meta which had been commented out, that's the reason it
358 * should only be used in case of error, not as a default.
359 *
360 * Returns an encoding string or NULL if not found, the string need to
361 * be freed
362 */
363static xmlChar *
364htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365 const xmlChar *start, *cur, *end;
366
367 if ((ctxt == NULL) || (ctxt->input == NULL) ||
368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369 (ctxt->input->buf->encoder != NULL))
370 return(NULL);
371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372 return(NULL);
373
374 start = ctxt->input->cur;
375 end = ctxt->input->end;
376 /* we also expect the input buffer to be zero terminated */
377 if (*end != 0)
378 return(NULL);
379
380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381 if (cur == NULL)
382 return(NULL);
383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384 if (cur == NULL)
385 return(NULL);
386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387 if (cur == NULL)
388 return(NULL);
389 cur += 8;
390 start = cur;
391 while (((*cur >= 'A') && (*cur <= 'Z')) ||
392 ((*cur >= 'a') && (*cur <= 'z')) ||
393 ((*cur >= '0') && (*cur <= '9')) ||
394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395 cur++;
396 if (cur == start)
397 return(NULL);
398 return(xmlStrndup(start, cur - start));
399}
400
401/**
402 * htmlCurrentChar:
403 * @ctxt: the HTML parser context
404 * @len: pointer to the length of the char read
405 *
406 * The current char value, if using UTF-8 this may actually span multiple
407 * bytes in the input buffer. Implement the end of line normalization:
408 * 2.11 End-of-Line Handling
409 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410 * char, then the encoding converter is plugged in automatically.
411 *
412 * Returns the current char value and its length
413 */
414
415static int
416htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700417 const unsigned char *cur;
418 unsigned char c;
419 unsigned int val;
420
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800421 if (ctxt->instate == XML_PARSER_EOF)
422 return(0);
423
424 if (ctxt->token != 0) {
425 *len = 0;
426 return(ctxt->token);
427 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700428 if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800429 xmlChar * guess;
430 xmlCharEncodingHandlerPtr handler;
431
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700432 /*
433 * Assume it's a fixed length encoding (1) with
434 * a compatible encoding for the ASCII set, since
435 * HTML constructs only use < 128 chars
436 */
437 if ((int) *ctxt->input->cur < 0x80) {
438 *len = 1;
439 if ((*ctxt->input->cur == 0) &&
440 (ctxt->input->cur < ctxt->input->end)) {
441 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442 "Char 0x%X out of allowed range\n", 0);
443 return(' ');
444 }
445 return((int) *ctxt->input->cur);
446 }
447
448 /*
449 * Humm this is bad, do an automatic flow conversion
450 */
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800451 guess = htmlFindEncoding(ctxt);
452 if (guess == NULL) {
453 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
454 } else {
455 if (ctxt->input->encoding != NULL)
456 xmlFree((xmlChar *) ctxt->input->encoding);
457 ctxt->input->encoding = guess;
458 handler = xmlFindCharEncodingHandler((const char *) guess);
459 if (handler != NULL) {
460 xmlSwitchToEncoding(ctxt, handler);
461 } else {
462 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
463 "Unsupported encoding %s", guess, NULL);
464 }
465 }
466 ctxt->charset = XML_CHAR_ENCODING_UTF8;
467 }
468
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700469 /*
470 * We are supposed to handle UTF8, check it's valid
471 * From rfc2044: encoding of the Unicode values on UTF-8:
472 *
473 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
474 * 0000 0000-0000 007F 0xxxxxxx
475 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
476 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
477 *
478 * Check for the 0x110000 limit too
479 */
480 cur = ctxt->input->cur;
481 c = *cur;
482 if (c & 0x80) {
483 if ((c & 0x40) == 0)
484 goto encoding_error;
485 if (cur[1] == 0) {
486 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
487 cur = ctxt->input->cur;
488 }
489 if ((cur[1] & 0xc0) != 0x80)
490 goto encoding_error;
491 if ((c & 0xe0) == 0xe0) {
492
493 if (cur[2] == 0) {
494 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
495 cur = ctxt->input->cur;
496 }
497 if ((cur[2] & 0xc0) != 0x80)
498 goto encoding_error;
499 if ((c & 0xf0) == 0xf0) {
500 if (cur[3] == 0) {
501 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
502 cur = ctxt->input->cur;
503 }
504 if (((c & 0xf8) != 0xf0) ||
505 ((cur[3] & 0xc0) != 0x80))
506 goto encoding_error;
507 /* 4-byte code */
508 *len = 4;
509 val = (cur[0] & 0x7) << 18;
510 val |= (cur[1] & 0x3f) << 12;
511 val |= (cur[2] & 0x3f) << 6;
512 val |= cur[3] & 0x3f;
513 if (val < 0x10000)
514 goto encoding_error;
515 } else {
516 /* 3-byte code */
517 *len = 3;
518 val = (cur[0] & 0xf) << 12;
519 val |= (cur[1] & 0x3f) << 6;
520 val |= cur[2] & 0x3f;
521 if (val < 0x800)
522 goto encoding_error;
523 }
524 } else {
525 /* 2-byte code */
526 *len = 2;
527 val = (cur[0] & 0x1f) << 6;
528 val |= cur[1] & 0x3f;
529 if (val < 0x80)
530 goto encoding_error;
531 }
532 if (!IS_CHAR(val)) {
533 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
534 "Char 0x%X out of allowed range\n", val);
535 }
536 return(val);
537 } else {
538 if ((*ctxt->input->cur == 0) &&
539 (ctxt->input->cur < ctxt->input->end)) {
540 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
541 "Char 0x%X out of allowed range\n", 0);
542 *len = 1;
543 return(' ');
544 }
545 /* 1-byte code */
546 *len = 1;
547 return((int) *ctxt->input->cur);
548 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800549
550encoding_error:
551 /*
552 * If we detect an UTF8 error that probably mean that the
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700553 * input encoding didn't get properly advertised in the
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800554 * declaration header. Report the error and switch the encoding
555 * to ISO-Latin-1 (if you don't like this policy, just declare the
556 * encoding !)
557 */
558 {
559 char buffer[150];
560
561 if (ctxt->input->end - ctxt->input->cur >= 4) {
562 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
563 ctxt->input->cur[0], ctxt->input->cur[1],
564 ctxt->input->cur[2], ctxt->input->cur[3]);
565 } else {
566 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
567 }
568 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
569 "Input is not proper UTF-8, indicate encoding !\n",
570 BAD_CAST buffer, NULL);
571 }
572
573 ctxt->charset = XML_CHAR_ENCODING_8859_1;
574 *len = 1;
575 return((int) *ctxt->input->cur);
576}
577
578/**
579 * htmlSkipBlankChars:
580 * @ctxt: the HTML parser context
581 *
582 * skip all blanks character found at that point in the input streams.
583 *
584 * Returns the number of space chars skipped
585 */
586
587static int
588htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
589 int res = 0;
590
591 while (IS_BLANK_CH(*(ctxt->input->cur))) {
592 if ((*ctxt->input->cur == 0) &&
593 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
594 xmlPopInput(ctxt);
595 } else {
596 if (*(ctxt->input->cur) == '\n') {
597 ctxt->input->line++; ctxt->input->col = 1;
598 } else ctxt->input->col++;
599 ctxt->input->cur++;
600 ctxt->nbChars++;
601 if (*ctxt->input->cur == 0)
602 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
603 }
604 res++;
605 }
606 return(res);
607}
608
609
610
611/************************************************************************
612 * *
613 * The list of HTML elements and their properties *
614 * *
615 ************************************************************************/
616
617/*
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700618 * Start Tag: 1 means the start tag can be omitted
619 * End Tag: 1 means the end tag can be omitted
Elliott Hughes7fbecab2019-01-10 16:42:03 -0800620 * 2 means it's forbidden (empty elements)
621 * 3 means the tag is stylistic and should be closed easily
622 * Depr: this element is deprecated
623 * DTD: 1 means that this element is valid only in the Loose DTD
624 * 2 means that this element is valid only in the Frameset DTD
625 *
626 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
627 , subElements , impliedsubelt , Attributes, userdata
628 */
629
630/* Definitions and a couple of vars for HTML Elements */
631
/*
 * Element name groups and their sizes. Each NB_* macro is the number of
 * names in the matching list macro. Sums are parenthesized so that they
 * expand safely inside larger expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
/* PCDATA and MODIFIER contribute no element names. */
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL
655
656
657static const char* const html_flow[] = { FLOW, NULL } ;
658static const char* const html_inline[] = { INLINE, NULL } ;
659
660/* placeholders: elts with content but no subelements */
661static const char* const html_pcdata[] = { NULL } ;
662#define html_cdata html_pcdata
663
664
665/* ... and for HTML Attributes */
666
/*
 * Attribute name groups and their sizes. Each NB_* macro is the number of
 * names in the matching list macro.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* Was written with the undefined name NB_NB_COREATTRS. */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
679
680static const char* const html_attrs[] = { ATTRS, NULL } ;
681static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
682static const char* const core_attrs[] = { COREATTRS, NULL } ;
683static const char* const i18n_attrs[] = { I18N, NULL } ;
684
685
686/* Other declarations that should go inline ... */
687static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
688 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
689 "tabindex", "onfocus", "onblur", NULL } ;
690static const char* const target_attr[] = { "target", NULL } ;
691static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
692static const char* const alt_attr[] = { "alt", NULL } ;
693static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
694static const char* const href_attrs[] = { "href", NULL } ;
695static const char* const clear_attrs[] = { "clear", NULL } ;
696static const char* const inline_p[] = { INLINE, "p", NULL } ;
697
698static const char* const flow_param[] = { FLOW, "param", NULL } ;
699static const char* const applet_attrs[] = { COREATTRS , "codebase",
700 "archive", "alt", "name", "height", "width", "align",
701 "hspace", "vspace", NULL } ;
702static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
703 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
704static const char* const basefont_attrs[] =
705 { "id", "size", "color", "face", NULL } ;
706static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
707static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
708static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
709static const char* const body_depr[] = { "background", "bgcolor", "text",
710 "link", "vlink", "alink", NULL } ;
711static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
712 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
713
714
715static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
716static const char* const col_elt[] = { "col", NULL } ;
717static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
718static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
719static const char* const dl_contents[] = { "dt", "dd", NULL } ;
720static const char* const compact_attr[] = { "compact", NULL } ;
721static const char* const label_attr[] = { "label", NULL } ;
722static const char* const fieldset_contents[] = { FLOW, "legend" } ;
723static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
724static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
725static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
726static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
727static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
728static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
729static const char* const head_attrs[] = { I18N, "profile", NULL } ;
730static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
731static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
732static const char* const version_attr[] = { "version", NULL } ;
733static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
734static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
735static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
736static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
737static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
738static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
739static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
740static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
741static const char* const align_attr[] = { "align", NULL } ;
742static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
743static const char* const map_contents[] = { BLOCK, "area", NULL } ;
744static const char* const name_attr[] = { "name", NULL } ;
745static const char* const action_attr[] = { "action", NULL } ;
746static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
747static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
748static const char* const content_attr[] = { "content", NULL } ;
749static const char* const type_attr[] = { "type", NULL } ;
750static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
751static const char* const object_contents[] = { FLOW, "param", NULL } ;
752static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
753static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
754static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
755static const char* const option_elt[] = { "option", NULL } ;
756static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
757static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
758static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
759static const char* const width_attr[] = { "width", NULL } ;
760static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
761static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
762static const char* const language_attr[] = { "language", NULL } ;
763static const char* const select_content[] = { "optgroup", "option", NULL } ;
764static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
765static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
766static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
767static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
768static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
769static const char* const tr_elt[] = { "tr", NULL } ;
770static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
771static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
772static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
773static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
774static const char* const tr_contents[] = { "th", "td", NULL } ;
775static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
776static const char* const li_elt[] = { "li", NULL } ;
777static const char* const ul_depr[] = { "type", "compact", NULL} ;
778static const char* const dir_attr[] = { "dir", NULL} ;
779
780#define DECL (const char**)
781
782static const htmlElemDesc
783html40ElementTable[] = {
784{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
785 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
786},
787{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
788 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
789},
790{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
791 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
792},
793{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
794 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
795},
796{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
797 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
798},
799{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
800 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
801},
802{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
803 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
804},
805{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
806 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
807},
808{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
809 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
810},
811{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
812 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
813},
814{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
815 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
816},
817{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
818 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
819},
820{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
821 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
822},
823{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
824 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
825},
826{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
827 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
828},
829{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
830 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
831},
832{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
833 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
834},
835{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
836 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
837},
838{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
839 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
840},
841{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
842 EMPTY , NULL , DECL col_attrs , NULL, NULL
843},
844{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
845 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
846},
847{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
848 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
849},
850{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
851 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
852},
853{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
854 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
855},
856{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
857 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
858},
859{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
860 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
861},
862{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
863 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
864},
865{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
866 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
867},
868{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
869 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
870},
871{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
872 EMPTY, NULL, DECL embed_attrs, NULL, NULL
873},
874{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
875 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
876},
877{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
878 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
879},
880{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
881 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
882},
883{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
884 EMPTY, NULL, NULL, DECL frame_attrs, NULL
885},
886{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
887 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
888},
889{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
890 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
891},
892{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
893 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
894},
895{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
896 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
897},
898{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
899 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
900},
901{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
902 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
903},
904{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
905 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
906},
907{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
908 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
909},
910{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
911 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
912},
913{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
914 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
915},
916{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
917 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
918},
919{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
920 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
921},
922{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
923 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
924},
925{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
926 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
927},
928{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
929 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
930},
931{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
932 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
933},
934{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
935 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
936},
937{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
938 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
939},
940{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
941 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
942},
943{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
944 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
945},
946{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
947 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
948},
949{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
950 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
951},
952{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
953 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
954},
955{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
956 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
957},
958{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
959 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
960},
961{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
962 DECL html_flow, "div", DECL html_attrs, NULL, NULL
963},
964{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
965 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
966},
967{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
968 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
969},
970{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
971 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
972},
973{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
974 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
975},
976{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
977 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
978},
979{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
980 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
981},
982{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
983 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
984},
985{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
986 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
987},
988{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
989 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
990},
991{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
992 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
993},
994{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
995 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
996},
997{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
998 DECL select_content, NULL, DECL select_attrs, NULL, NULL
999},
1000{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
1001 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1002},
1003{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1004 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1005},
1006{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
1007 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1008},
1009{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1010 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1011},
1012{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1013 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1014},
1015{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1016 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1017},
1018{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1019 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1020},
1021{ "table", 0, 0, 0, 0, 0, 0, 0, "",
1022 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1023},
1024{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1025 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1026},
1027{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1028 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1029},
1030{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1031 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1032},
1033{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1034 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1035},
1036{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1037 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1038},
1039{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1040 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1041},
1042{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1043 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1044},
1045{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1046 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1047},
1048{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1049 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1050},
1051{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1052 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1053},
1054{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1055 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1056},
1057{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1058 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1059}
1060};
1061
1062/*
1063 * start tags that imply the end of current element
1064 */
1065static const char * const htmlStartClose[] = {
1066"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1067 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1068 "listing", "xmp", "head", NULL,
1069"head", "p", NULL,
1070"title", "p", NULL,
1071"body", "head", "style", "link", "title", "p", NULL,
1072"frameset", "head", "style", "link", "title", "p", NULL,
1073"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1074 "pre", "listing", "xmp", "head", "li", NULL,
1075"hr", "p", "head", NULL,
1076"h1", "p", "head", NULL,
1077"h2", "p", "head", NULL,
1078"h3", "p", "head", NULL,
1079"h4", "p", "head", NULL,
1080"h5", "p", "head", NULL,
1081"h6", "p", "head", NULL,
1082"dir", "p", "head", NULL,
1083"address", "p", "head", "ul", NULL,
1084"pre", "p", "head", "ul", NULL,
1085"listing", "p", "head", NULL,
1086"xmp", "p", "head", NULL,
1087"blockquote", "p", "head", NULL,
1088"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1089 "xmp", "head", NULL,
1090"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1091 "head", "dd", NULL,
1092"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1093 "head", "dt", NULL,
1094"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1095 "listing", "xmp", NULL,
1096"ol", "p", "head", "ul", NULL,
1097"menu", "p", "head", "ul", NULL,
1098"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
1099"div", "p", "head", NULL,
1100"noscript", "script", NULL,
1101"center", "font", "b", "i", "p", "head", NULL,
1102"a", "a", "head", NULL,
1103"caption", "p", NULL,
1104"colgroup", "caption", "colgroup", "col", "p", NULL,
1105"col", "caption", "col", "p", NULL,
1106"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1107 "listing", "xmp", "a", NULL,
1108"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1109"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1110"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1111"thead", "caption", "col", "colgroup", NULL,
1112"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1113 "tbody", "p", NULL,
1114"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1115 "tfoot", "tbody", "p", NULL,
1116"optgroup", "option", NULL,
1117"option", "option", NULL,
1118"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1119 "pre", "listing", "xmp", "a", NULL,
1120/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */
1121"tt", "head", NULL,
1122"i", "head", NULL,
1123"b", "head", NULL,
1124"u", "head", NULL,
1125"s", "head", NULL,
1126"strike", "head", NULL,
1127"big", "head", NULL,
1128"small", "head", NULL,
1129
1130"em", "head", NULL,
1131"strong", "head", NULL,
1132"dfn", "head", NULL,
1133"code", "head", NULL,
1134"samp", "head", NULL,
1135"kbd", "head", NULL,
1136"var", "head", NULL,
1137"cite", "head", NULL,
1138"abbr", "head", NULL,
1139"acronym", "head", NULL,
1140
1141/* "a" */
1142"img", "head", NULL,
1143/* "applet" */
1144/* "embed" */
1145/* "object" */
1146"font", "head", NULL,
1147/* "basefont" */
1148"br", "head", NULL,
1149/* "script" */
1150"map", "head", NULL,
1151"q", "head", NULL,
1152"sub", "head", NULL,
1153"sup", "head", NULL,
1154"span", "head", NULL,
1155"bdo", "head", NULL,
1156"iframe", "head", NULL,
1157NULL
1158};
1159
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly: when character data shows up inside one
 * of them a p element is implied first.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL    /* end of list */
};
1172
/*
 * The HTML attributes whose content is of type %Script;
 * The list has no terminator, its length is taken with sizeof.
 * NOTE: when adding new names, check htmlIsScriptAttribute() since
 *       it assumes that every name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick",
    "onmousedown", "onmouseup", "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document events */
    "onload", "onunload",
    /* focus and form events */
    "onfocus", "onblur",
    "onsubmit", "onreset",
    "onchange", "onselect"
};
1198
/*
 * Priorities used by the HTML parser to cope with broken pages.
 * Each element gets a priority, which lets the parser decide what to
 * do with an extra end tag: an end tag is only allowed to close
 * elements whose priority is lower than or equal to its own.
 */

typedef struct {
    const char *name;   /* element name, NULL in the last entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1226
/* pointers to the first string of each group of htmlStartClose */
static const char **htmlStartCloseIndex[100];
/* set to 1 once htmlStartCloseIndex has been filled in */
static int htmlStartCloseIndexinitialized = 0;
1229
1230/************************************************************************
1231 * *
1232 * functions to handle HTML specific data *
1233 * *
1234 ************************************************************************/
1235
1236/**
1237 * htmlInitAutoClose:
1238 *
1239 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1240 * This is not reentrant. Call xmlInitParser() once before processing in
1241 * case of use in multithreaded programs.
1242 */
1243void
1244htmlInitAutoClose(void) {
1245 int indx, i = 0;
1246
1247 if (htmlStartCloseIndexinitialized) return;
1248
1249 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1250 indx = 0;
1251 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1252 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1253 while (htmlStartClose[i] != NULL) i++;
1254 i++;
1255 }
1256 htmlStartCloseIndexinitialized = 1;
1257}
1258
1259/**
1260 * htmlTagLookup:
1261 * @tag: The tag name in lowercase
1262 *
1263 * Lookup the HTML tag in the ElementTable
1264 *
1265 * Returns the related htmlElemDescPtr or NULL if not found.
1266 */
1267const htmlElemDesc *
1268htmlTagLookup(const xmlChar *tag) {
1269 unsigned int i;
1270
1271 for (i = 0; i < (sizeof(html40ElementTable) /
1272 sizeof(html40ElementTable[0]));i++) {
1273 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1274 return((htmlElemDescPtr) &html40ElementTable[i]);
1275 }
1276 return(NULL);
1277}
1278
1279/**
1280 * htmlGetEndPriority:
1281 * @name: The name of the element to look up the priority for.
1282 *
1283 * Return value: The "endtag" priority.
1284 **/
1285static int
1286htmlGetEndPriority (const xmlChar *name) {
1287 int i = 0;
1288
1289 while ((htmlEndPriority[i].name != NULL) &&
1290 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1291 i++;
1292
1293 return(htmlEndPriority[i].priority);
1294}
1295
1296
1297/**
1298 * htmlCheckAutoClose:
1299 * @newtag: The new tag name
1300 * @oldtag: The old tag name
1301 *
1302 * Checks whether the new tag is one of the registered valid tags for
1303 * closing old.
1304 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1305 *
1306 * Returns 0 if no, 1 if yes.
1307 */
1308static int
1309htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1310{
1311 int i, indx;
1312 const char **closed = NULL;
1313
1314 if (htmlStartCloseIndexinitialized == 0)
1315 htmlInitAutoClose();
1316
1317 /* inefficient, but not a big deal */
1318 for (indx = 0; indx < 100; indx++) {
1319 closed = htmlStartCloseIndex[indx];
1320 if (closed == NULL)
1321 return (0);
1322 if (xmlStrEqual(BAD_CAST * closed, newtag))
1323 break;
1324 }
1325
1326 i = closed - htmlStartClose;
1327 i++;
1328 while (htmlStartClose[i] != NULL) {
1329 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1330 return (1);
1331 }
1332 i++;
1333 }
1334 return (0);
1335}
1336
1337/**
1338 * htmlAutoCloseOnClose:
1339 * @ctxt: an HTML parser context
1340 * @newtag: The new tag name
1341 * @force: force the tag closure
1342 *
1343 * The HTML DTD allows an ending tag to implicitly close other tags.
1344 */
1345static void
1346htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1347{
1348 const htmlElemDesc *info;
1349 int i, priority;
1350
1351 priority = htmlGetEndPriority(newtag);
1352
1353 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1354
1355 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1356 break;
1357 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07001358 * A misplaced endtag can only close elements with lower
Elliott Hughes7fbecab2019-01-10 16:42:03 -08001359 * or equal priority, so if we find an element with higher
1360 * priority before we find an element with
1361 * matching name, we just ignore this endtag
1362 */
1363 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1364 return;
1365 }
1366 if (i < 0)
1367 return;
1368
1369 while (!xmlStrEqual(newtag, ctxt->name)) {
1370 info = htmlTagLookup(ctxt->name);
1371 if ((info != NULL) && (info->endTag == 3)) {
1372 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1373 "Opening and ending tag mismatch: %s and %s\n",
1374 newtag, ctxt->name);
1375 }
1376 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1377 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1378 htmlnamePop(ctxt);
1379 }
1380}
1381
1382/**
1383 * htmlAutoCloseOnEnd:
1384 * @ctxt: an HTML parser context
1385 *
1386 * Close all remaining tags at the end of the stream
1387 */
1388static void
1389htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1390{
1391 int i;
1392
1393 if (ctxt->nameNr == 0)
1394 return;
1395 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1396 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1397 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1398 htmlnamePop(ctxt);
1399 }
1400}
1401
1402/**
1403 * htmlAutoClose:
1404 * @ctxt: an HTML parser context
1405 * @newtag: The new tag name or NULL
1406 *
1407 * The HTML DTD allows a tag to implicitly close other tags.
1408 * The list is kept in htmlStartClose array. This function is
1409 * called when a new tag has been detected and generates the
1410 * appropriates closes if possible/needed.
1411 * If newtag is NULL this mean we are at the end of the resource
1412 * and we should check
1413 */
1414static void
1415htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1416{
1417 while ((newtag != NULL) && (ctxt->name != NULL) &&
1418 (htmlCheckAutoClose(newtag, ctxt->name))) {
1419 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1420 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1421 htmlnamePop(ctxt);
1422 }
1423 if (newtag == NULL) {
1424 htmlAutoCloseOnEnd(ctxt);
1425 return;
1426 }
1427 while ((newtag == NULL) && (ctxt->name != NULL) &&
1428 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1429 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1430 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1431 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1432 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1433 htmlnamePop(ctxt);
1434 }
1435}
1436
1437/**
1438 * htmlAutoCloseTag:
1439 * @doc: the HTML document
1440 * @name: The tag name
1441 * @elem: the HTML element
1442 *
1443 * The HTML DTD allows a tag to implicitly close other tags.
1444 * The list is kept in htmlStartClose array. This function checks
1445 * if the element or one of it's children would autoclose the
1446 * given tag.
1447 *
1448 * Returns 1 if autoclose, 0 otherwise
1449 */
1450int
1451htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1452 htmlNodePtr child;
1453
1454 if (elem == NULL) return(1);
1455 if (xmlStrEqual(name, elem->name)) return(0);
1456 if (htmlCheckAutoClose(elem->name, name)) return(1);
1457 child = elem->children;
1458 while (child != NULL) {
1459 if (htmlAutoCloseTag(doc, name, child)) return(1);
1460 child = child->next;
1461 }
1462 return(0);
1463}
1464
1465/**
1466 * htmlIsAutoClosed:
1467 * @doc: the HTML document
1468 * @elem: the HTML element
1469 *
1470 * The HTML DTD allows a tag to implicitly close other tags.
1471 * The list is kept in htmlStartClose array. This function checks
1472 * if a tag is autoclosed by one of it's child
1473 *
1474 * Returns 1 if autoclosed, 0 otherwise
1475 */
1476int
1477htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1478 htmlNodePtr child;
1479
1480 if (elem == NULL) return(1);
1481 child = elem->children;
1482 while (child != NULL) {
1483 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1484 child = child->next;
1485 }
1486 return(0);
1487}
1488
1489/**
1490 * htmlCheckImplied:
1491 * @ctxt: an HTML parser context
1492 * @newtag: The new tag name
1493 *
1494 * The HTML DTD allows a tag to exists only implicitly
1495 * called when a new tag has been detected and generates the
1496 * appropriates implicit tags if missing
1497 */
1498static void
1499htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1500 int i;
1501
1502 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1503 return;
1504 if (!htmlOmittedDefaultValue)
1505 return;
1506 if (xmlStrEqual(newtag, BAD_CAST"html"))
1507 return;
1508 if (ctxt->nameNr <= 0) {
1509 htmlnamePush(ctxt, BAD_CAST"html");
1510 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1511 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1512 }
1513 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1514 return;
1515 if ((ctxt->nameNr <= 1) &&
1516 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1517 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1518 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1519 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1520 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1521 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1522 if (ctxt->html >= 3) {
1523 /* we already saw or generated an <head> before */
1524 return;
1525 }
1526 /*
1527 * dropped OBJECT ... i you put it first BODY will be
1528 * assumed !
1529 */
1530 htmlnamePush(ctxt, BAD_CAST"head");
1531 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1532 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1533 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1534 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1535 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1536 if (ctxt->html >= 10) {
1537 /* we already saw or generated a <body> before */
1538 return;
1539 }
1540 for (i = 0;i < ctxt->nameNr;i++) {
1541 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1542 return;
1543 }
1544 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1545 return;
1546 }
1547 }
1548
1549 htmlnamePush(ctxt, BAD_CAST"body");
1550 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1551 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1552 }
1553}
1554
1555/**
1556 * htmlCheckParagraph
1557 * @ctxt: an HTML parser context
1558 *
1559 * Check whether a p element need to be implied before inserting
1560 * characters in the current element.
1561 *
1562 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1563 * in case of error.
1564 */
1565
1566static int
1567htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1568 const xmlChar *tag;
1569 int i;
1570
1571 if (ctxt == NULL)
1572 return(-1);
1573 tag = ctxt->name;
1574 if (tag == NULL) {
1575 htmlAutoClose(ctxt, BAD_CAST"p");
1576 htmlCheckImplied(ctxt, BAD_CAST"p");
1577 htmlnamePush(ctxt, BAD_CAST"p");
1578 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1579 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1580 return(1);
1581 }
1582 if (!htmlOmittedDefaultValue)
1583 return(0);
1584 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1585 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1586 htmlAutoClose(ctxt, BAD_CAST"p");
1587 htmlCheckImplied(ctxt, BAD_CAST"p");
1588 htmlnamePush(ctxt, BAD_CAST"p");
1589 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1590 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1591 return(1);
1592 }
1593 }
1594 return(0);
1595}
1596
1597/**
1598 * htmlIsScriptAttribute:
1599 * @name: an attribute name
1600 *
1601 * Check if an attribute is of content type Script
1602 *
1603 * Returns 1 is the attribute is a script 0 otherwise
1604 */
1605int
1606htmlIsScriptAttribute(const xmlChar *name) {
1607 unsigned int i;
1608
1609 if (name == NULL)
1610 return(0);
1611 /*
1612 * all script attributes start with 'on'
1613 */
1614 if ((name[0] != 'o') || (name[1] != 'n'))
1615 return(0);
1616 for (i = 0;
1617 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1618 i++) {
1619 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1620 return(1);
1621 }
1622 return(0);
1623}
1624
1625/************************************************************************
1626 * *
1627 * The list of HTML predefined entities *
1628 * *
1629 ************************************************************************/
1630
1631
1632static const htmlEntityDesc html40EntitiesTable[] = {
1633/*
1634 * the 4 absolute ones, plus apostrophe.
1635 */
1636{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1637{ 38, "amp", "ampersand, U+0026 ISOnum" },
1638{ 39, "apos", "single quote" },
1639{ 60, "lt", "less-than sign, U+003C ISOnum" },
1640{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1641
1642/*
1643 * A bunch still in the 128-255 range
1644 * Replacing them depend really on the charset used.
1645 */
1646{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1647{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1648{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1649{ 163, "pound","pound sign, U+00A3 ISOnum" },
1650{ 164, "curren","currency sign, U+00A4 ISOnum" },
1651{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1652{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1653{ 167, "sect", "section sign, U+00A7 ISOnum" },
1654{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1655{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1656{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1657{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1658{ 172, "not", "not sign, U+00AC ISOnum" },
1659{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1660{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1661{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1662{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1663{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1664{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1665{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1666{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1667{ 181, "micro","micro sign, U+00B5 ISOnum" },
1668{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1669{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1670{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1671{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1672{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1673{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1674{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1675{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1676{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1677{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1678{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1679{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1680{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1681{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1682{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1683{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1684{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1685{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1686{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1687{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1688{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1689{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1690{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1691{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1692{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1693{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1694{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1695{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1696{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1697{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1698{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1699{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1700{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1701{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1702{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1703{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1704{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1705{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1706{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1707{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1708{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1709{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1710{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1711{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1712{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1713{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1714{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1715{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1716{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1717{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1718{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1719{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1720{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1721{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1722{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1723{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1724{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1725{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1726{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1727{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1728{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1729{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1730{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1731{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1732{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1733{ 247, "divide","division sign, U+00F7 ISOnum" },
1734{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1735{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1736{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1737{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1738{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1739{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1740{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1741{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1742
1743{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1744{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1745{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1746{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1747{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1748
1749/*
1750 * Anything below should really be kept as entities references
1751 */
1752{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1753
1754{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1755{ 732, "tilde","small tilde, U+02DC ISOdia" },
1756
1757{ 913, "Alpha","greek capital letter alpha, U+0391" },
1758{ 914, "Beta", "greek capital letter beta, U+0392" },
1759{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1760{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1761{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1762{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1763{ 919, "Eta", "greek capital letter eta, U+0397" },
1764{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1765{ 921, "Iota", "greek capital letter iota, U+0399" },
1766{ 922, "Kappa","greek capital letter kappa, U+039A" },
1767{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1768{ 924, "Mu", "greek capital letter mu, U+039C" },
1769{ 925, "Nu", "greek capital letter nu, U+039D" },
1770{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1771{ 927, "Omicron","greek capital letter omicron, U+039F" },
1772{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1773{ 929, "Rho", "greek capital letter rho, U+03A1" },
1774{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1775{ 932, "Tau", "greek capital letter tau, U+03A4" },
1776{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1777{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1778{ 935, "Chi", "greek capital letter chi, U+03A7" },
1779{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1780{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1781
1782{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1783{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1784{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1785{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1786{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1787{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1788{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1789{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1790{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1791{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1792{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1793{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1794{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1795{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1796{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1797{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1798{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1799{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1800{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1801{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1802{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1803{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1804{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1805{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1806{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1807{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1808{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1809{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1810
1811{ 8194, "ensp", "en space, U+2002 ISOpub" },
1812{ 8195, "emsp", "em space, U+2003 ISOpub" },
1813{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1814{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1815{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1816{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1817{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1818{ 8211, "ndash","en dash, U+2013 ISOpub" },
1819{ 8212, "mdash","em dash, U+2014 ISOpub" },
1820{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1821{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1822{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1823{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1824{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1825{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1826{ 8224, "dagger","dagger, U+2020 ISOpub" },
1827{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1828
1829{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1830{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1831
1832{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1833
1834{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1835{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1836
1837{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1838{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1839
1840{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1841{ 8260, "frasl","fraction slash, U+2044 NEW" },
1842
1843{ 8364, "euro", "euro sign, U+20AC NEW" },
1844
1845{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1846{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1847{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1848{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1849{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1850{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1851{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1852{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1853{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1854{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1855{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1856{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1857{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1858{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1859{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1860{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1861
1862{ 8704, "forall","for all, U+2200 ISOtech" },
1863{ 8706, "part", "partial differential, U+2202 ISOtech" },
1864{ 8707, "exist","there exists, U+2203 ISOtech" },
1865{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1866{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1867{ 8712, "isin", "element of, U+2208 ISOtech" },
1868{ 8713, "notin","not an element of, U+2209 ISOtech" },
1869{ 8715, "ni", "contains as member, U+220B ISOtech" },
1870{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1871{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1872{ 8722, "minus","minus sign, U+2212 ISOtech" },
1873{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1874{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1875{ 8733, "prop", "proportional to, U+221D ISOtech" },
1876{ 8734, "infin","infinity, U+221E ISOtech" },
1877{ 8736, "ang", "angle, U+2220 ISOamso" },
1878{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1879{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1880{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1881{ 8746, "cup", "union = cup, U+222A ISOtech" },
1882{ 8747, "int", "integral, U+222B ISOtech" },
1883{ 8756, "there4","therefore, U+2234 ISOtech" },
1884{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1885{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1886{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1887{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1888{ 8801, "equiv","identical to, U+2261 ISOtech" },
1889{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1890{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1891{ 8834, "sub", "subset of, U+2282 ISOtech" },
1892{ 8835, "sup", "superset of, U+2283 ISOtech" },
1893{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1894{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1895{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1896{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1897{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1898{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1899{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1900{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1901{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1902{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1903{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1904{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1905{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1906{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1907
1908{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1909{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1910{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1911{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1912
1913};
1914
1915/************************************************************************
1916 * *
1917 * Commodity functions to handle entities *
1918 * *
1919 ************************************************************************/
1920
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable to enlarge; its capacity is
 *           read from and written to the variable named <buffer>_size
 *
 * Double the capacity of a heap allocated buffer.
 *
 * The macro expects a variable called ctxt to be in scope, it is handed
 * to htmlErrMemory(). When the reallocation fails the error is reported,
 * the old buffer is released and the *enclosing function* returns NULL.
 * The buffer may move, so the caller has to recompute any pointer it
 * keeps into it.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer,				\
                                   buffer##_size * sizeof(xmlChar));	\
    if (grown == NULL) {						\
        htmlErrMemory(ctxt, "growing buffer\n");			\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = grown;							\
}
1935
1936/**
1937 * htmlEntityLookup:
1938 * @name: the entity name
1939 *
1940 * Lookup the given entity in EntitiesTable
1941 *
1942 * TODO: the linear scan is really ugly, an hash table is really needed.
1943 *
1944 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1945 */
1946const htmlEntityDesc *
1947htmlEntityLookup(const xmlChar *name) {
1948 unsigned int i;
1949
1950 for (i = 0;i < (sizeof(html40EntitiesTable)/
1951 sizeof(html40EntitiesTable[0]));i++) {
1952 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1953 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1954 }
1955 }
1956 return(NULL);
1957}
1958
1959/**
1960 * htmlEntityValueLookup:
1961 * @value: the entity's unicode value
1962 *
1963 * Lookup the given entity in EntitiesTable
1964 *
1965 * TODO: the linear scan is really ugly, an hash table is really needed.
1966 *
1967 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1968 */
1969const htmlEntityDesc *
1970htmlEntityValueLookup(unsigned int value) {
1971 unsigned int i;
1972
1973 for (i = 0;i < (sizeof(html40EntitiesTable)/
1974 sizeof(html40EntitiesTable[0]));i++) {
1975 if (html40EntitiesTable[i].value >= value) {
1976 if (html40EntitiesTable[i].value > value)
1977 break;
1978 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1979 }
1980 }
1981 return(NULL);
1982}
1983
1984/**
1985 * UTF8ToHtml:
1986 * @out: a pointer to an array of bytes to store the result
1987 * @outlen: the length of @out
1988 * @in: a pointer to an array of UTF-8 chars
1989 * @inlen: the length of @in
1990 *
1991 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1992 * plus HTML entities block of chars out.
1993 *
1994 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1995 * The value of @inlen after return is the number of octets consumed
1996 * as the return value is positive, else unpredictable.
1997 * The value of @outlen after return is the number of octets consumed.
1998 */
1999int
2000UTF8ToHtml(unsigned char* out, int *outlen,
2001 const unsigned char* in, int *inlen) {
2002 const unsigned char* processed = in;
2003 const unsigned char* outend;
2004 const unsigned char* outstart = out;
2005 const unsigned char* instart = in;
2006 const unsigned char* inend;
2007 unsigned int c, d;
2008 int trailing;
2009
2010 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2011 if (in == NULL) {
2012 /*
2013 * initialization nothing to do
2014 */
2015 *outlen = 0;
2016 *inlen = 0;
2017 return(0);
2018 }
2019 inend = in + (*inlen);
2020 outend = out + (*outlen);
2021 while (in < inend) {
2022 d = *in++;
2023 if (d < 0x80) { c= d; trailing= 0; }
2024 else if (d < 0xC0) {
2025 /* trailing byte in leading position */
2026 *outlen = out - outstart;
2027 *inlen = processed - instart;
2028 return(-2);
2029 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2030 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2031 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2032 else {
2033 /* no chance for this in Ascii */
2034 *outlen = out - outstart;
2035 *inlen = processed - instart;
2036 return(-2);
2037 }
2038
2039 if (inend - in < trailing) {
2040 break;
2041 }
2042
2043 for ( ; trailing; trailing--) {
2044 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2045 break;
2046 c <<= 6;
2047 c |= d & 0x3F;
2048 }
2049
2050 /* assertion: c is a single UTF-4 value */
2051 if (c < 0x80) {
2052 if (out + 1 >= outend)
2053 break;
2054 *out++ = c;
2055 } else {
2056 int len;
2057 const htmlEntityDesc * ent;
2058 const char *cp;
2059 char nbuf[16];
2060
2061 /*
2062 * Try to lookup a predefined HTML entity for it
2063 */
2064
2065 ent = htmlEntityValueLookup(c);
2066 if (ent == NULL) {
2067 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2068 cp = nbuf;
2069 }
2070 else
2071 cp = ent->name;
2072 len = strlen(cp);
2073 if (out + 2 + len >= outend)
2074 break;
2075 *out++ = '&';
2076 memcpy(out, cp, len);
2077 out += len;
2078 *out++ = ';';
2079 }
2080 processed = in;
2081 }
2082 *outlen = out - outstart;
2083 *inlen = processed - instart;
2084 return(0);
2085}
2086
2087/**
2088 * htmlEncodeEntities:
2089 * @out: a pointer to an array of bytes to store the result
2090 * @outlen: the length of @out
2091 * @in: a pointer to an array of UTF-8 chars
2092 * @inlen: the length of @in
2093 * @quoteChar: the quote character to escape (' or ") or zero.
2094 *
2095 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2096 * plus HTML entities block of chars out.
2097 *
2098 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2099 * The value of @inlen after return is the number of octets consumed
2100 * as the return value is positive, else unpredictable.
2101 * The value of @outlen after return is the number of octets consumed.
2102 */
2103int
2104htmlEncodeEntities(unsigned char* out, int *outlen,
2105 const unsigned char* in, int *inlen, int quoteChar) {
2106 const unsigned char* processed = in;
2107 const unsigned char* outend;
2108 const unsigned char* outstart = out;
2109 const unsigned char* instart = in;
2110 const unsigned char* inend;
2111 unsigned int c, d;
2112 int trailing;
2113
2114 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2115 return(-1);
2116 outend = out + (*outlen);
2117 inend = in + (*inlen);
2118 while (in < inend) {
2119 d = *in++;
2120 if (d < 0x80) { c= d; trailing= 0; }
2121 else if (d < 0xC0) {
2122 /* trailing byte in leading position */
2123 *outlen = out - outstart;
2124 *inlen = processed - instart;
2125 return(-2);
2126 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2127 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2128 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2129 else {
2130 /* no chance for this in Ascii */
2131 *outlen = out - outstart;
2132 *inlen = processed - instart;
2133 return(-2);
2134 }
2135
2136 if (inend - in < trailing)
2137 break;
2138
2139 while (trailing--) {
2140 if (((d= *in++) & 0xC0) != 0x80) {
2141 *outlen = out - outstart;
2142 *inlen = processed - instart;
2143 return(-2);
2144 }
2145 c <<= 6;
2146 c |= d & 0x3F;
2147 }
2148
2149 /* assertion: c is a single UTF-4 value */
2150 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2151 (c != '&') && (c != '<') && (c != '>')) {
2152 if (out >= outend)
2153 break;
2154 *out++ = c;
2155 } else {
2156 const htmlEntityDesc * ent;
2157 const char *cp;
2158 char nbuf[16];
2159 int len;
2160
2161 /*
2162 * Try to lookup a predefined HTML entity for it
2163 */
2164 ent = htmlEntityValueLookup(c);
2165 if (ent == NULL) {
2166 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2167 cp = nbuf;
2168 }
2169 else
2170 cp = ent->name;
2171 len = strlen(cp);
2172 if (out + 2 + len > outend)
2173 break;
2174 *out++ = '&';
2175 memcpy(out, cp, len);
2176 out += len;
2177 *out++ = ';';
2178 }
2179 processed = in;
2180 }
2181 *outlen = out - outstart;
2182 *inlen = processed - instart;
2183 return(0);
2184}
2185
2186/************************************************************************
2187 * *
2188 * Commodity functions to handle streams *
2189 * *
2190 ************************************************************************/
2191
#ifdef LIBXML_PUSH_ENABLED
/**
 * htmlNewInputStream:
 * @ctxt:  an HTML parser context
 *
 * Allocate an input stream structure and reset it to an empty state:
 * all members cleared, position set to line 1, column 1. @ctxt is only
 * used to report an allocation failure.
 *
 * Returns the new input stream or NULL
 */
static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
    htmlParserInputPtr stream;

    stream = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
    if (stream == NULL) {
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
        return(NULL);
    }
    memset(stream, 0, sizeof(htmlParserInput));

    /* no buffer or origin information attached yet */
    stream->buf = NULL;
    stream->base = NULL;
    stream->cur = NULL;
    stream->filename = NULL;
    stream->directory = NULL;
    stream->version = NULL;
    stream->free = NULL;

    /* nothing consumed so far, positions are 1-based */
    stream->length = 0;
    stream->consumed = 0;
    stream->line = 1;
    stream->col = 1;

    return(stream);
}
#endif
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002225
2226
2227/************************************************************************
2228 * *
2229 * Commodity functions, cleanup needed ? *
2230 * *
2231 ************************************************************************/
/*
 * Names of all the tags allowing PCDATA in the HTML 4.01 loose DTD,
 * listed in alphabetical order (one line per initial letter).
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2246
2247/**
2248 * areBlanks:
2249 * @ctxt: an HTML parser context
2250 * @str: a xmlChar *
2251 * @len: the size of @str
2252 *
2253 * Is this a sequence of blank chars that one can ignore ?
2254 *
2255 * Returns 1 if ignorable 0 otherwise.
2256 */
2257
2258static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2259 unsigned int i;
2260 int j;
2261 xmlNodePtr lastChild;
2262 xmlDtdPtr dtd;
2263
2264 for (j = 0;j < len;j++)
2265 if (!(IS_BLANK_CH(str[j]))) return(0);
2266
2267 if (CUR == 0) return(1);
2268 if (CUR != '<') return(0);
2269 if (ctxt->name == NULL)
2270 return(1);
2271 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2272 return(1);
2273 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2274 return(1);
2275
2276 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2277 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2278 dtd = xmlGetIntSubset(ctxt->myDoc);
2279 if (dtd != NULL && dtd->ExternalID != NULL) {
2280 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2281 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2282 return(1);
2283 }
2284 }
2285
2286 if (ctxt->node == NULL) return(0);
2287 lastChild = xmlGetLastChild(ctxt->node);
2288 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2289 lastChild = lastChild->prev;
2290 if (lastChild == NULL) {
2291 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2292 (ctxt->node->content != NULL)) return(0);
2293 /* keep ws in constructs like ...<b> </b>...
2294 for all tags "b" allowing PCDATA */
2295 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2296 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2297 return(0);
2298 }
2299 }
2300 } else if (xmlNodeIsText(lastChild)) {
2301 return(0);
2302 } else {
2303 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2304 for all tags "p" allowing PCDATA */
2305 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2306 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2307 return(0);
2308 }
2309 }
2310 }
2311 return(1);
2312}
2313
2314/**
2315 * htmlNewDocNoDtD:
2316 * @URI: URI for the dtd, or NULL
2317 * @ExternalID: the external ID of the DTD, or NULL
2318 *
2319 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2320 * are NULL
2321 *
2322 * Returns a new document, do not initialize the DTD if not provided
2323 */
2324htmlDocPtr
2325htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2326 xmlDocPtr cur;
2327
2328 /*
2329 * Allocate a new document and fill the fields.
2330 */
2331 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2332 if (cur == NULL) {
2333 htmlErrMemory(NULL, "HTML document creation failed\n");
2334 return(NULL);
2335 }
2336 memset(cur, 0, sizeof(xmlDoc));
2337
2338 cur->type = XML_HTML_DOCUMENT_NODE;
2339 cur->version = NULL;
2340 cur->intSubset = NULL;
2341 cur->doc = cur;
2342 cur->name = NULL;
2343 cur->children = NULL;
2344 cur->extSubset = NULL;
2345 cur->oldNs = NULL;
2346 cur->encoding = NULL;
2347 cur->standalone = 1;
2348 cur->compression = 0;
2349 cur->ids = NULL;
2350 cur->refs = NULL;
2351 cur->_private = NULL;
2352 cur->charset = XML_CHAR_ENCODING_UTF8;
2353 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2354 if ((ExternalID != NULL) ||
2355 (URI != NULL))
2356 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2357 return(cur);
2358}
2359
2360/**
2361 * htmlNewDoc:
2362 * @URI: URI for the dtd, or NULL
2363 * @ExternalID: the external ID of the DTD, or NULL
2364 *
2365 * Creates a new HTML document
2366 *
2367 * Returns a new document
2368 */
2369htmlDocPtr
2370htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2371 if ((URI == NULL) && (ExternalID == NULL))
2372 return(htmlNewDocNoDtD(
2373 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2374 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2375
2376 return(htmlNewDocNoDtD(URI, ExternalID));
2377}
2378
2379
2380/************************************************************************
2381 * *
2382 * The parser itself *
2383 * Relates to http://www.w3.org/TR/html40 *
2384 * *
2385 ************************************************************************/
2386
2387/************************************************************************
2388 * *
2389 * The parser itself *
2390 * *
2391 ************************************************************************/
2392
2393static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2394
2395/**
2396 * htmlParseHTMLName:
2397 * @ctxt: an HTML parser context
2398 *
2399 * parse an HTML tag or attribute name, note that we convert it to lowercase
2400 * since HTML names are not case-sensitive.
2401 *
2402 * Returns the Tag Name parsed or NULL
2403 */
2404
2405static const xmlChar *
2406htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2407 int i = 0;
2408 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2409
2410 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2411 (CUR != ':') && (CUR != '.')) return(NULL);
2412
2413 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2414 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2415 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2416 (CUR == '.'))) {
2417 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2418 else loc[i] = CUR;
2419 i++;
2420
2421 NEXT;
2422 }
2423
2424 return(xmlDictLookup(ctxt->dict, loc, i));
2425}
2426
2427
2428/**
2429 * htmlParseHTMLName_nonInvasive:
2430 * @ctxt: an HTML parser context
2431 *
2432 * parse an HTML tag or attribute name, note that we convert it to lowercase
2433 * since HTML names are not case-sensitive, this doesn't consume the data
2434 * from the stream, it's a look-ahead
2435 *
2436 * Returns the Tag Name parsed or NULL
2437 */
2438
2439static const xmlChar *
2440htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2441 int i = 0;
2442 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2443
2444 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2445 (NXT(1) != ':')) return(NULL);
2446
2447 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2448 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2449 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2450 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2451 else loc[i] = NXT(1+i);
2452 i++;
2453 }
2454
2455 return(xmlDictLookup(ctxt->dict, loc, i));
2456}
2457
2458
2459/**
2460 * htmlParseName:
2461 * @ctxt: an HTML parser context
2462 *
2463 * parse an HTML name, this routine is case sensitive.
2464 *
2465 * Returns the Name parsed or NULL
2466 */
2467
2468static const xmlChar *
2469htmlParseName(htmlParserCtxtPtr ctxt) {
2470 const xmlChar *in;
2471 const xmlChar *ret;
2472 int count = 0;
2473
2474 GROW;
2475
2476 /*
2477 * Accelerator for simple ASCII names
2478 */
2479 in = ctxt->input->cur;
2480 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2481 ((*in >= 0x41) && (*in <= 0x5A)) ||
2482 (*in == '_') || (*in == ':')) {
2483 in++;
2484 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2485 ((*in >= 0x41) && (*in <= 0x5A)) ||
2486 ((*in >= 0x30) && (*in <= 0x39)) ||
2487 (*in == '_') || (*in == '-') ||
2488 (*in == ':') || (*in == '.'))
2489 in++;
2490
2491 if (in == ctxt->input->end)
2492 return(NULL);
2493
2494 if ((*in > 0) && (*in < 0x80)) {
2495 count = in - ctxt->input->cur;
2496 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2497 ctxt->input->cur = in;
2498 ctxt->nbChars += count;
2499 ctxt->input->col += count;
2500 return(ret);
2501 }
2502 }
2503 return(htmlParseNameComplex(ctxt));
2504}
2505
2506static const xmlChar *
2507htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2508 int len = 0, l;
2509 int c;
2510 int count = 0;
2511 const xmlChar *base = ctxt->input->base;
2512
2513 /*
2514 * Handler for more complex cases
2515 */
2516 GROW;
2517 c = CUR_CHAR(l);
2518 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2519 (!IS_LETTER(c) && (c != '_') &&
2520 (c != ':'))) {
2521 return(NULL);
2522 }
2523
2524 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2525 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2526 (c == '.') || (c == '-') ||
2527 (c == '_') || (c == ':') ||
2528 (IS_COMBINING(c)) ||
2529 (IS_EXTENDER(c)))) {
2530 if (count++ > 100) {
2531 count = 0;
2532 GROW;
2533 }
2534 len += l;
2535 NEXTL(l);
2536 c = CUR_CHAR(l);
2537 if (ctxt->input->base != base) {
2538 /*
2539 * We changed encoding from an unknown encoding
2540 * Input buffer changed location, so we better start again
2541 */
2542 return(htmlParseNameComplex(ctxt));
2543 }
2544 }
2545
2546 if (ctxt->input->cur - ctxt->input->base < len) {
2547 /* Sanity check */
2548 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2549 "unexpected change of input buffer", NULL, NULL);
2550 return (NULL);
2551 }
2552
2553 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2554}
2555
2556
2557/**
2558 * htmlParseHTMLAttribute:
2559 * @ctxt: an HTML parser context
2560 * @stop: a char stop value
2561 *
2562 * parse an HTML attribute value till the stop (quote), if
2563 * stop is 0 then it stops at the first space
2564 *
2565 * Returns the attribute parsed or NULL
2566 */
2567
2568static xmlChar *
2569htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2570 xmlChar *buffer = NULL;
2571 int buffer_size = 0;
2572 xmlChar *out = NULL;
2573 const xmlChar *name = NULL;
2574 const xmlChar *cur = NULL;
2575 const htmlEntityDesc * ent;
2576
2577 /*
2578 * allocate a translation buffer.
2579 */
2580 buffer_size = HTML_PARSER_BUFFER_SIZE;
2581 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2582 if (buffer == NULL) {
2583 htmlErrMemory(ctxt, "buffer allocation failed\n");
2584 return(NULL);
2585 }
2586 out = buffer;
2587
2588 /*
2589 * Ok loop until we reach one of the ending chars
2590 */
2591 while ((CUR != 0) && (CUR != stop)) {
2592 if ((stop == 0) && (CUR == '>')) break;
2593 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2594 if (CUR == '&') {
2595 if (NXT(1) == '#') {
2596 unsigned int c;
2597 int bits;
2598
2599 c = htmlParseCharRef(ctxt);
2600 if (c < 0x80)
2601 { *out++ = c; bits= -6; }
2602 else if (c < 0x800)
2603 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2604 else if (c < 0x10000)
2605 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2606 else
2607 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2608
2609 for ( ; bits >= 0; bits-= 6) {
2610 *out++ = ((c >> bits) & 0x3F) | 0x80;
2611 }
2612
2613 if (out - buffer > buffer_size - 100) {
2614 int indx = out - buffer;
2615
2616 growBuffer(buffer);
2617 out = &buffer[indx];
2618 }
2619 } else {
2620 ent = htmlParseEntityRef(ctxt, &name);
2621 if (name == NULL) {
2622 *out++ = '&';
2623 if (out - buffer > buffer_size - 100) {
2624 int indx = out - buffer;
2625
2626 growBuffer(buffer);
2627 out = &buffer[indx];
2628 }
2629 } else if (ent == NULL) {
2630 *out++ = '&';
2631 cur = name;
2632 while (*cur != 0) {
2633 if (out - buffer > buffer_size - 100) {
2634 int indx = out - buffer;
2635
2636 growBuffer(buffer);
2637 out = &buffer[indx];
2638 }
2639 *out++ = *cur++;
2640 }
2641 } else {
2642 unsigned int c;
2643 int bits;
2644
2645 if (out - buffer > buffer_size - 100) {
2646 int indx = out - buffer;
2647
2648 growBuffer(buffer);
2649 out = &buffer[indx];
2650 }
2651 c = ent->value;
2652 if (c < 0x80)
2653 { *out++ = c; bits= -6; }
2654 else if (c < 0x800)
2655 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2656 else if (c < 0x10000)
2657 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2658 else
2659 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2660
2661 for ( ; bits >= 0; bits-= 6) {
2662 *out++ = ((c >> bits) & 0x3F) | 0x80;
2663 }
2664 }
2665 }
2666 } else {
2667 unsigned int c;
2668 int bits, l;
2669
2670 if (out - buffer > buffer_size - 100) {
2671 int indx = out - buffer;
2672
2673 growBuffer(buffer);
2674 out = &buffer[indx];
2675 }
2676 c = CUR_CHAR(l);
2677 if (c < 0x80)
2678 { *out++ = c; bits= -6; }
2679 else if (c < 0x800)
2680 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2681 else if (c < 0x10000)
2682 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2683 else
2684 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2685
2686 for ( ; bits >= 0; bits-= 6) {
2687 *out++ = ((c >> bits) & 0x3F) | 0x80;
2688 }
2689 NEXT;
2690 }
2691 }
2692 *out = 0;
2693 return(buffer);
2694}
2695
2696/**
2697 * htmlParseEntityRef:
2698 * @ctxt: an HTML parser context
2699 * @str: location to store the entity name
2700 *
2701 * parse an HTML ENTITY references
2702 *
2703 * [68] EntityRef ::= '&' Name ';'
2704 *
2705 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2706 * if non-NULL *str will have to be freed by the caller.
2707 */
2708const htmlEntityDesc *
2709htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2710 const xmlChar *name;
2711 const htmlEntityDesc * ent = NULL;
2712
2713 if (str != NULL) *str = NULL;
2714 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2715
2716 if (CUR == '&') {
2717 NEXT;
2718 name = htmlParseName(ctxt);
2719 if (name == NULL) {
2720 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2721 "htmlParseEntityRef: no name\n", NULL, NULL);
2722 } else {
2723 GROW;
2724 if (CUR == ';') {
2725 if (str != NULL)
2726 *str = name;
2727
2728 /*
2729 * Lookup the entity in the table.
2730 */
2731 ent = htmlEntityLookup(name);
2732 if (ent != NULL) /* OK that's ugly !!! */
2733 NEXT;
2734 } else {
2735 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2736 "htmlParseEntityRef: expecting ';'\n",
2737 NULL, NULL);
2738 if (str != NULL)
2739 *str = name;
2740 }
2741 }
2742 }
2743 return(ent);
2744}
2745
2746/**
2747 * htmlParseAttValue:
2748 * @ctxt: an HTML parser context
2749 *
2750 * parse a value for an attribute
2751 * Note: the parser won't do substitution of entities here, this
2752 * will be handled later in xmlStringGetNodeList, unless it was
2753 * asked for ctxt->replaceEntities != 0
2754 *
2755 * Returns the AttValue parsed or NULL.
2756 */
2757
2758static xmlChar *
2759htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2760 xmlChar *ret = NULL;
2761
2762 if (CUR == '"') {
2763 NEXT;
2764 ret = htmlParseHTMLAttribute(ctxt, '"');
2765 if (CUR != '"') {
2766 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2767 "AttValue: \" expected\n", NULL, NULL);
2768 } else
2769 NEXT;
2770 } else if (CUR == '\'') {
2771 NEXT;
2772 ret = htmlParseHTMLAttribute(ctxt, '\'');
2773 if (CUR != '\'') {
2774 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2775 "AttValue: ' expected\n", NULL, NULL);
2776 } else
2777 NEXT;
2778 } else {
2779 /*
2780 * That's an HTMLism, the attribute value may not be quoted
2781 */
2782 ret = htmlParseHTMLAttribute(ctxt, 0);
2783 if (ret == NULL) {
2784 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2785 "AttValue: no value found\n", NULL, NULL);
2786 }
2787 }
2788 return(ret);
2789}
2790
2791/**
2792 * htmlParseSystemLiteral:
2793 * @ctxt: an HTML parser context
2794 *
2795 * parse an HTML Literal
2796 *
2797 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2798 *
2799 * Returns the SystemLiteral parsed or NULL
2800 */
2801
2802static xmlChar *
2803htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2804 size_t len = 0, startPosition = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002805 int err = 0;
2806 int quote;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002807 xmlChar *ret = NULL;
2808
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002809 if ((CUR != '"') && (CUR != '\'')) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002810 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002811 "SystemLiteral \" or ' expected\n", NULL, NULL);
2812 return(NULL);
2813 }
2814 quote = CUR;
2815 NEXT;
2816
2817 if (CUR_PTR < BASE_PTR)
2818 return(ret);
2819 startPosition = CUR_PTR - BASE_PTR;
2820
2821 while ((CUR != 0) && (CUR != quote)) {
2822 /* TODO: Handle UTF-8 */
2823 if (!IS_CHAR_CH(CUR)) {
2824 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2825 "Invalid char in SystemLiteral 0x%X\n", CUR);
2826 err = 1;
2827 }
2828 NEXT;
2829 len++;
2830 }
2831 if (CUR != quote) {
2832 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2833 "Unfinished SystemLiteral\n", NULL, NULL);
2834 } else {
2835 NEXT;
2836 if (err == 0)
2837 ret = xmlStrndup((BASE_PTR+startPosition), len);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002838 }
2839
2840 return(ret);
2841}
2842
2843/**
2844 * htmlParsePubidLiteral:
2845 * @ctxt: an HTML parser context
2846 *
2847 * parse an HTML public literal
2848 *
2849 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2850 *
2851 * Returns the PubidLiteral parsed or NULL.
2852 */
2853
2854static xmlChar *
2855htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2856 size_t len = 0, startPosition = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002857 int err = 0;
2858 int quote;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002859 xmlChar *ret = NULL;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002860
2861 if ((CUR != '"') && (CUR != '\'')) {
2862 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2863 "PubidLiteral \" or ' expected\n", NULL, NULL);
2864 return(NULL);
2865 }
2866 quote = CUR;
2867 NEXT;
2868
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002869 /*
2870 * Name ::= (Letter | '_') (NameChar)*
2871 */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002872 if (CUR_PTR < BASE_PTR)
2873 return(ret);
2874 startPosition = CUR_PTR - BASE_PTR;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002875
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002876 while ((CUR != 0) && (CUR != quote)) {
2877 if (!IS_PUBIDCHAR_CH(CUR)) {
2878 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2879 "Invalid char in PubidLiteral 0x%X\n", CUR);
2880 err = 1;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002881 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002882 len++;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002883 NEXT;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002884 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002885
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002886 if (CUR != '"') {
2887 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2888 "Unfinished PubidLiteral\n", NULL, NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002889 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002890 NEXT;
2891 if (err == 0)
2892 ret = xmlStrndup((BASE_PTR + startPosition), len);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002893 }
2894
2895 return(ret);
2896}
2897
2898/**
2899 * htmlParseScript:
2900 * @ctxt: an HTML parser context
2901 *
2902 * parse the content of an HTML SCRIPT or STYLE element
2903 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2904 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2905 * http://www.w3.org/TR/html4/types.html#type-script
2906 * http://www.w3.org/TR/html4/types.html#h-6.15
2907 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2908 *
2909 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2910 * element and the value of intrinsic event attributes. User agents must
2911 * not evaluate script data as HTML markup but instead must pass it on as
2912 * data to a script engine.
2913 * NOTES:
2914 * - The content is passed like CDATA
2915 * - the attributes for style and scripting "onXXX" are also described
2916 * as CDATA but SGML allows entities references in attributes so their
2917 * processing is identical as other attributes
2918 */
2919static void
2920htmlParseScript(htmlParserCtxtPtr ctxt) {
2921 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2922 int nbchar = 0;
2923 int cur,l;
2924
2925 SHRINK;
2926 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002927 while (cur != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002928 if ((cur == '<') && (NXT(1) == '/')) {
2929 /*
2930 * One should break here, the specification is clear:
2931 * Authors should therefore escape "</" within the content.
2932 * Escape mechanisms are specific to each scripting or
2933 * style sheet language.
2934 *
2935 * In recovery mode, only break if end tag match the
2936 * current tag, effectively ignoring all tags inside the
2937 * script/style block and treating the entire block as
2938 * CDATA.
2939 */
2940 if (ctxt->recovery) {
2941 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2942 xmlStrlen(ctxt->name)) == 0)
2943 {
2944 break; /* while */
2945 } else {
2946 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2947 "Element %s embeds close tag\n",
2948 ctxt->name, NULL);
2949 }
2950 } else {
2951 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2952 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2953 {
2954 break; /* while */
2955 }
2956 }
2957 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002958 if (IS_CHAR(cur)) {
2959 COPY_BUF(l,buf,nbchar,cur);
2960 } else {
2961 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2962 "Invalid char in CDATA 0x%X\n", cur);
2963 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002964 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002965 buf[nbchar] = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002966 if (ctxt->sax->cdataBlock!= NULL) {
2967 /*
2968 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2969 */
2970 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2971 } else if (ctxt->sax->characters != NULL) {
2972 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2973 }
2974 nbchar = 0;
2975 }
2976 GROW;
2977 NEXTL(l);
2978 cur = CUR_CHAR(l);
2979 }
2980
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002981 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002982 buf[nbchar] = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08002983 if (ctxt->sax->cdataBlock!= NULL) {
2984 /*
2985 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2986 */
2987 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2988 } else if (ctxt->sax->characters != NULL) {
2989 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2990 }
2991 }
2992}
2993
2994
2995/**
2996 * htmlParseCharDataInternal:
2997 * @ctxt: an HTML parser context
2998 * @readahead: optional read ahead character in ascii range
2999 *
3000 * parse a CharData section.
3001 * if we are within a CDATA section ']]>' marks an end of section.
3002 *
3003 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3004 */
3005
3006static void
3007htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3008 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3009 int nbchar = 0;
3010 int cur, l;
3011 int chunk = 0;
3012
3013 if (readahead)
3014 buf[nbchar++] = readahead;
3015
3016 SHRINK;
3017 cur = CUR_CHAR(l);
3018 while (((cur != '<') || (ctxt->token == '<')) &&
3019 ((cur != '&') || (ctxt->token == '&')) &&
3020 (cur != 0)) {
3021 if (!(IS_CHAR(cur))) {
3022 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3023 "Invalid char in CDATA 0x%X\n", cur);
3024 } else {
3025 COPY_BUF(l,buf,nbchar,cur);
3026 }
3027 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003028 buf[nbchar] = 0;
3029
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003030 /*
3031 * Ok the segment is to be consumed as chars.
3032 */
3033 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3034 if (areBlanks(ctxt, buf, nbchar)) {
3035 if (ctxt->keepBlanks) {
3036 if (ctxt->sax->characters != NULL)
3037 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3038 } else {
3039 if (ctxt->sax->ignorableWhitespace != NULL)
3040 ctxt->sax->ignorableWhitespace(ctxt->userData,
3041 buf, nbchar);
3042 }
3043 } else {
3044 htmlCheckParagraph(ctxt);
3045 if (ctxt->sax->characters != NULL)
3046 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3047 }
3048 }
3049 nbchar = 0;
3050 }
3051 NEXTL(l);
3052 chunk++;
3053 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3054 chunk = 0;
3055 SHRINK;
3056 GROW;
3057 }
3058 cur = CUR_CHAR(l);
3059 if (cur == 0) {
3060 SHRINK;
3061 GROW;
3062 cur = CUR_CHAR(l);
3063 }
3064 }
3065 if (nbchar != 0) {
3066 buf[nbchar] = 0;
3067
3068 /*
3069 * Ok the segment is to be consumed as chars.
3070 */
3071 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3072 if (areBlanks(ctxt, buf, nbchar)) {
3073 if (ctxt->keepBlanks) {
3074 if (ctxt->sax->characters != NULL)
3075 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3076 } else {
3077 if (ctxt->sax->ignorableWhitespace != NULL)
3078 ctxt->sax->ignorableWhitespace(ctxt->userData,
3079 buf, nbchar);
3080 }
3081 } else {
3082 htmlCheckParagraph(ctxt);
3083 if (ctxt->sax->characters != NULL)
3084 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3085 }
3086 }
3087 } else {
3088 /*
3089 * Loop detection
3090 */
3091 if (cur == 0)
3092 ctxt->instate = XML_PARSER_EOF;
3093 }
3094}
3095
3096/**
3097 * htmlParseCharData:
3098 * @ctxt: an HTML parser context
3099 *
3100 * parse a CharData section.
3101 * if we are within a CDATA section ']]>' marks an end of section.
3102 *
3103 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3104 */
3105
3106static void
3107htmlParseCharData(htmlParserCtxtPtr ctxt) {
3108 htmlParseCharDataInternal(ctxt, 0);
3109}
3110
3111/**
3112 * htmlParseExternalID:
3113 * @ctxt: an HTML parser context
3114 * @publicID: a xmlChar** receiving PubidLiteral
3115 *
3116 * Parse an External ID or a Public ID
3117 *
3118 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3119 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3120 *
3121 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3122 *
3123 * Returns the function returns SystemLiteral and in the second
3124 * case publicID receives PubidLiteral, is strict is off
3125 * it is possible to return NULL and have publicID set.
3126 */
3127
3128static xmlChar *
3129htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3130 xmlChar *URI = NULL;
3131
3132 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3133 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3134 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3135 SKIP(6);
3136 if (!IS_BLANK_CH(CUR)) {
3137 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3138 "Space required after 'SYSTEM'\n", NULL, NULL);
3139 }
3140 SKIP_BLANKS;
3141 URI = htmlParseSystemLiteral(ctxt);
3142 if (URI == NULL) {
3143 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3144 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3145 }
3146 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3147 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3148 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3149 SKIP(6);
3150 if (!IS_BLANK_CH(CUR)) {
3151 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3152 "Space required after 'PUBLIC'\n", NULL, NULL);
3153 }
3154 SKIP_BLANKS;
3155 *publicID = htmlParsePubidLiteral(ctxt);
3156 if (*publicID == NULL) {
3157 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3158 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3159 NULL, NULL);
3160 }
3161 SKIP_BLANKS;
3162 if ((CUR == '"') || (CUR == '\'')) {
3163 URI = htmlParseSystemLiteral(ctxt);
3164 }
3165 }
3166 return(URI);
3167}
3168
3169/**
3170 * xmlParsePI:
3171 * @ctxt: an XML parser context
3172 *
3173 * parse an XML Processing Instruction.
3174 *
3175 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3176 */
3177static void
3178htmlParsePI(htmlParserCtxtPtr ctxt) {
3179 xmlChar *buf = NULL;
3180 int len = 0;
3181 int size = HTML_PARSER_BUFFER_SIZE;
3182 int cur, l;
3183 const xmlChar *target;
3184 xmlParserInputState state;
3185 int count = 0;
3186
3187 if ((RAW == '<') && (NXT(1) == '?')) {
3188 state = ctxt->instate;
3189 ctxt->instate = XML_PARSER_PI;
3190 /*
3191 * this is a Processing Instruction.
3192 */
3193 SKIP(2);
3194 SHRINK;
3195
3196 /*
3197 * Parse the target name and check for special support like
3198 * namespace.
3199 */
3200 target = htmlParseName(ctxt);
3201 if (target != NULL) {
3202 if (RAW == '>') {
3203 SKIP(1);
3204
3205 /*
3206 * SAX: PI detected.
3207 */
3208 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3209 (ctxt->sax->processingInstruction != NULL))
3210 ctxt->sax->processingInstruction(ctxt->userData,
3211 target, NULL);
3212 ctxt->instate = state;
3213 return;
3214 }
3215 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3216 if (buf == NULL) {
3217 htmlErrMemory(ctxt, NULL);
3218 ctxt->instate = state;
3219 return;
3220 }
3221 cur = CUR;
3222 if (!IS_BLANK(cur)) {
3223 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3224 "ParsePI: PI %s space expected\n", target, NULL);
3225 }
3226 SKIP_BLANKS;
3227 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003228 while ((cur != 0) && (cur != '>')) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003229 if (len + 5 >= size) {
3230 xmlChar *tmp;
3231
3232 size *= 2;
3233 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3234 if (tmp == NULL) {
3235 htmlErrMemory(ctxt, NULL);
3236 xmlFree(buf);
3237 ctxt->instate = state;
3238 return;
3239 }
3240 buf = tmp;
3241 }
3242 count++;
3243 if (count > 50) {
3244 GROW;
3245 count = 0;
3246 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003247 if (IS_CHAR(cur)) {
3248 COPY_BUF(l,buf,len,cur);
3249 } else {
3250 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3251 "Invalid char in processing instruction "
3252 "0x%X\n", cur);
3253 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003254 NEXTL(l);
3255 cur = CUR_CHAR(l);
3256 if (cur == 0) {
3257 SHRINK;
3258 GROW;
3259 cur = CUR_CHAR(l);
3260 }
3261 }
3262 buf[len] = 0;
3263 if (cur != '>') {
3264 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3265 "ParsePI: PI %s never end ...\n", target, NULL);
3266 } else {
3267 SKIP(1);
3268
3269 /*
3270 * SAX: PI detected.
3271 */
3272 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3273 (ctxt->sax->processingInstruction != NULL))
3274 ctxt->sax->processingInstruction(ctxt->userData,
3275 target, buf);
3276 }
3277 xmlFree(buf);
3278 } else {
3279 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3280 "PI is not started correctly", NULL, NULL);
3281 }
3282 ctxt->instate = state;
3283 }
3284}
3285
3286/**
3287 * htmlParseComment:
3288 * @ctxt: an HTML parser context
3289 *
3290 * Parse an XML (SGML) comment <!-- .... -->
3291 *
3292 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3293 */
3294static void
3295htmlParseComment(htmlParserCtxtPtr ctxt) {
3296 xmlChar *buf = NULL;
3297 int len;
3298 int size = HTML_PARSER_BUFFER_SIZE;
3299 int q, ql;
3300 int r, rl;
3301 int cur, l;
3302 xmlParserInputState state;
3303
3304 /*
3305 * Check that there is a comment right here.
3306 */
3307 if ((RAW != '<') || (NXT(1) != '!') ||
3308 (NXT(2) != '-') || (NXT(3) != '-')) return;
3309
3310 state = ctxt->instate;
3311 ctxt->instate = XML_PARSER_COMMENT;
3312 SHRINK;
3313 SKIP(4);
3314 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3315 if (buf == NULL) {
3316 htmlErrMemory(ctxt, "buffer allocation failed\n");
3317 ctxt->instate = state;
3318 return;
3319 }
3320 len = 0;
3321 buf[len] = 0;
3322 q = CUR_CHAR(ql);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003323 if (q == 0)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003324 goto unfinished;
3325 NEXTL(ql);
3326 r = CUR_CHAR(rl);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003327 if (r == 0)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003328 goto unfinished;
3329 NEXTL(rl);
3330 cur = CUR_CHAR(l);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003331 while ((cur != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003332 ((cur != '>') ||
3333 (r != '-') || (q != '-'))) {
3334 if (len + 5 >= size) {
3335 xmlChar *tmp;
3336
3337 size *= 2;
3338 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3339 if (tmp == NULL) {
3340 xmlFree(buf);
3341 htmlErrMemory(ctxt, "growing buffer failed\n");
3342 ctxt->instate = state;
3343 return;
3344 }
3345 buf = tmp;
3346 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003347 if (IS_CHAR(q)) {
3348 COPY_BUF(ql,buf,len,q);
3349 } else {
3350 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3351 "Invalid char in comment 0x%X\n", q);
3352 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003353 q = r;
3354 ql = rl;
3355 r = cur;
3356 rl = l;
3357 NEXTL(l);
3358 cur = CUR_CHAR(l);
3359 if (cur == 0) {
3360 SHRINK;
3361 GROW;
3362 cur = CUR_CHAR(l);
3363 }
3364 }
3365 buf[len] = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003366 if (cur == '>') {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003367 NEXT;
3368 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3369 (!ctxt->disableSAX))
3370 ctxt->sax->comment(ctxt->userData, buf);
3371 xmlFree(buf);
3372 ctxt->instate = state;
3373 return;
3374 }
3375
3376unfinished:
3377 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3378 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3379 xmlFree(buf);
3380}
3381
3382/**
3383 * htmlParseCharRef:
3384 * @ctxt: an HTML parser context
3385 *
3386 * parse Reference declarations
3387 *
3388 * [66] CharRef ::= '&#' [0-9]+ ';' |
3389 * '&#x' [0-9a-fA-F]+ ';'
3390 *
3391 * Returns the value parsed (as an int)
3392 */
3393int
3394htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3395 int val = 0;
3396
3397 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3398 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3399 "htmlParseCharRef: context error\n",
3400 NULL, NULL);
3401 return(0);
3402 }
3403 if ((CUR == '&') && (NXT(1) == '#') &&
3404 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3405 SKIP(3);
3406 while (CUR != ';') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003407 if ((CUR >= '0') && (CUR <= '9')) {
3408 if (val < 0x110000)
3409 val = val * 16 + (CUR - '0');
3410 } else if ((CUR >= 'a') && (CUR <= 'f')) {
3411 if (val < 0x110000)
3412 val = val * 16 + (CUR - 'a') + 10;
3413 } else if ((CUR >= 'A') && (CUR <= 'F')) {
3414 if (val < 0x110000)
3415 val = val * 16 + (CUR - 'A') + 10;
3416 } else {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003417 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3418 "htmlParseCharRef: missing semicolon\n",
3419 NULL, NULL);
3420 break;
3421 }
3422 NEXT;
3423 }
3424 if (CUR == ';')
3425 NEXT;
3426 } else if ((CUR == '&') && (NXT(1) == '#')) {
3427 SKIP(2);
3428 while (CUR != ';') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003429 if ((CUR >= '0') && (CUR <= '9')) {
3430 if (val < 0x110000)
3431 val = val * 10 + (CUR - '0');
3432 } else {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003433 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3434 "htmlParseCharRef: missing semicolon\n",
3435 NULL, NULL);
3436 break;
3437 }
3438 NEXT;
3439 }
3440 if (CUR == ';')
3441 NEXT;
3442 } else {
3443 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3444 "htmlParseCharRef: invalid value\n", NULL, NULL);
3445 }
3446 /*
3447 * Check the value IS_CHAR ...
3448 */
3449 if (IS_CHAR(val)) {
3450 return(val);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003451 } else if (val >= 0x110000) {
3452 htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3453 "htmlParseCharRef: value too large\n", NULL, NULL);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003454 } else {
3455 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3456 "htmlParseCharRef: invalid xmlChar value %d\n",
3457 val);
3458 }
3459 return(0);
3460}
3461
3462
3463/**
3464 * htmlParseDocTypeDecl:
3465 * @ctxt: an HTML parser context
3466 *
3467 * parse a DOCTYPE declaration
3468 *
3469 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3470 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3471 */
3472
3473static void
3474htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3475 const xmlChar *name;
3476 xmlChar *ExternalID = NULL;
3477 xmlChar *URI = NULL;
3478
3479 /*
3480 * We know that '<!DOCTYPE' has been detected.
3481 */
3482 SKIP(9);
3483
3484 SKIP_BLANKS;
3485
3486 /*
3487 * Parse the DOCTYPE name.
3488 */
3489 name = htmlParseName(ctxt);
3490 if (name == NULL) {
3491 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3492 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3493 NULL, NULL);
3494 }
3495 /*
3496 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3497 */
3498
3499 SKIP_BLANKS;
3500
3501 /*
3502 * Check for SystemID and ExternalID
3503 */
3504 URI = htmlParseExternalID(ctxt, &ExternalID);
3505 SKIP_BLANKS;
3506
3507 /*
3508 * We should be at the end of the DOCTYPE declaration.
3509 */
3510 if (CUR != '>') {
3511 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3512 "DOCTYPE improperly terminated\n", NULL, NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003513 /* Ignore bogus content */
3514 while ((CUR != 0) && (CUR != '>'))
3515 NEXT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003516 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003517 if (CUR == '>')
3518 NEXT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003519
3520 /*
3521 * Create or update the document accordingly to the DOCTYPE
3522 */
3523 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3524 (!ctxt->disableSAX))
3525 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3526
3527 /*
3528 * Cleanup, since we don't use all those identifiers
3529 */
3530 if (URI != NULL) xmlFree(URI);
3531 if (ExternalID != NULL) xmlFree(ExternalID);
3532}
3533
3534/**
3535 * htmlParseAttribute:
3536 * @ctxt: an HTML parser context
3537 * @value: a xmlChar ** used to store the value of the attribute
3538 *
3539 * parse an attribute
3540 *
3541 * [41] Attribute ::= Name Eq AttValue
3542 *
3543 * [25] Eq ::= S? '=' S?
3544 *
3545 * With namespace:
3546 *
3547 * [NS 11] Attribute ::= QName Eq AttValue
3548 *
3549 * Also the case QName == xmlns:??? is handled independently as a namespace
3550 * definition.
3551 *
3552 * Returns the attribute name, and the value in *value.
3553 */
3554
3555static const xmlChar *
3556htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3557 const xmlChar *name;
3558 xmlChar *val = NULL;
3559
3560 *value = NULL;
3561 name = htmlParseHTMLName(ctxt);
3562 if (name == NULL) {
3563 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3564 "error parsing attribute name\n", NULL, NULL);
3565 return(NULL);
3566 }
3567
3568 /*
3569 * read the value
3570 */
3571 SKIP_BLANKS;
3572 if (CUR == '=') {
3573 NEXT;
3574 SKIP_BLANKS;
3575 val = htmlParseAttValue(ctxt);
3576 }
3577
3578 *value = val;
3579 return(name);
3580}
3581
3582/**
3583 * htmlCheckEncodingDirect:
3584 * @ctxt: an HTML parser context
3585 * @attvalue: the attribute value
3586 *
3587 * Checks an attribute value to detect
3588 * the encoding
3589 * If a new encoding is detected the parser is switched to decode
3590 * it and pass UTF8
3591 */
3592static void
3593htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3594
3595 if ((ctxt == NULL) || (encoding == NULL) ||
3596 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3597 return;
3598
3599 /* do not change encoding */
3600 if (ctxt->input->encoding != NULL)
3601 return;
3602
3603 if (encoding != NULL) {
3604 xmlCharEncoding enc;
3605 xmlCharEncodingHandlerPtr handler;
3606
3607 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3608
3609 if (ctxt->input->encoding != NULL)
3610 xmlFree((xmlChar *) ctxt->input->encoding);
3611 ctxt->input->encoding = xmlStrdup(encoding);
3612
3613 enc = xmlParseCharEncoding((const char *) encoding);
3614 /*
3615 * registered set of known encodings
3616 */
3617 if (enc != XML_CHAR_ENCODING_ERROR) {
3618 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3619 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3620 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3621 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3622 (ctxt->input->buf != NULL) &&
3623 (ctxt->input->buf->encoder == NULL)) {
3624 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3625 "htmlCheckEncoding: wrong encoding meta\n",
3626 NULL, NULL);
3627 } else {
3628 xmlSwitchEncoding(ctxt, enc);
3629 }
3630 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3631 } else {
3632 /*
3633 * fallback for unknown encodings
3634 */
3635 handler = xmlFindCharEncodingHandler((const char *) encoding);
3636 if (handler != NULL) {
3637 xmlSwitchToEncoding(ctxt, handler);
3638 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3639 } else {
3640 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3641 "htmlCheckEncoding: unknown encoding %s\n",
3642 encoding, NULL);
3643 }
3644 }
3645
3646 if ((ctxt->input->buf != NULL) &&
3647 (ctxt->input->buf->encoder != NULL) &&
3648 (ctxt->input->buf->raw != NULL) &&
3649 (ctxt->input->buf->buffer != NULL)) {
3650 int nbchars;
3651 int processed;
3652
3653 /*
3654 * convert as much as possible to the parser reading buffer.
3655 */
3656 processed = ctxt->input->cur - ctxt->input->base;
3657 xmlBufShrink(ctxt->input->buf->buffer, processed);
3658 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3659 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3660 if (nbchars < 0) {
3661 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3662 "htmlCheckEncoding: encoder error\n",
3663 NULL, NULL);
3664 }
3665 }
3666 }
3667}
3668
3669/**
3670 * htmlCheckEncoding:
3671 * @ctxt: an HTML parser context
3672 * @attvalue: the attribute value
3673 *
3674 * Checks an http-equiv attribute from a Meta tag to detect
3675 * the encoding
3676 * If a new encoding is detected the parser is switched to decode
3677 * it and pass UTF8
3678 */
3679static void
3680htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3681 const xmlChar *encoding;
3682
3683 if (!attvalue)
3684 return;
3685
3686 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3687 if (encoding != NULL) {
3688 encoding += 7;
3689 }
3690 /*
3691 * skip blank
3692 */
3693 if (encoding && IS_BLANK_CH(*encoding))
3694 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3695 if (encoding && *encoding == '=') {
3696 encoding ++;
3697 htmlCheckEncodingDirect(ctxt, encoding);
3698 }
3699}
3700
3701/**
3702 * htmlCheckMeta:
3703 * @ctxt: an HTML parser context
3704 * @atts: the attributes values
3705 *
3706 * Checks an attributes from a Meta tag
3707 */
3708static void
3709htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3710 int i;
3711 const xmlChar *att, *value;
3712 int http = 0;
3713 const xmlChar *content = NULL;
3714
3715 if ((ctxt == NULL) || (atts == NULL))
3716 return;
3717
3718 i = 0;
3719 att = atts[i++];
3720 while (att != NULL) {
3721 value = atts[i++];
3722 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3723 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3724 http = 1;
3725 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3726 htmlCheckEncodingDirect(ctxt, value);
3727 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3728 content = value;
3729 att = atts[i++];
3730 }
3731 if ((http) && (content != NULL))
3732 htmlCheckEncoding(ctxt, content);
3733
3734}
3735
3736/**
3737 * htmlParseStartTag:
3738 * @ctxt: an HTML parser context
3739 *
3740 * parse a start of tag either for rule element or
3741 * EmptyElement. In both case we don't parse the tag closing chars.
3742 *
3743 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3744 *
3745 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3746 *
3747 * With namespace:
3748 *
3749 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3750 *
3751 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3752 *
3753 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3754 */
3755
3756static int
3757htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3758 const xmlChar *name;
3759 const xmlChar *attname;
3760 xmlChar *attvalue;
3761 const xmlChar **atts;
3762 int nbatts = 0;
3763 int maxatts;
3764 int meta = 0;
3765 int i;
3766 int discardtag = 0;
3767
3768 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3769 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3770 "htmlParseStartTag: context error\n", NULL, NULL);
3771 return -1;
3772 }
3773 if (ctxt->instate == XML_PARSER_EOF)
3774 return(-1);
3775 if (CUR != '<') return -1;
3776 NEXT;
3777
3778 atts = ctxt->atts;
3779 maxatts = ctxt->maxatts;
3780
3781 GROW;
3782 name = htmlParseHTMLName(ctxt);
3783 if (name == NULL) {
3784 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3785 "htmlParseStartTag: invalid element name\n",
3786 NULL, NULL);
3787 /* if recover preserve text on classic misconstructs */
3788 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3789 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3790 htmlParseCharDataInternal(ctxt, '<');
3791 return(-1);
3792 }
3793
3794
3795 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003796 while ((CUR != 0) && (CUR != '>') &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003797 (ctxt->instate != XML_PARSER_EOF))
3798 NEXT;
3799 return -1;
3800 }
3801 if (xmlStrEqual(name, BAD_CAST"meta"))
3802 meta = 1;
3803
3804 /*
3805 * Check for auto-closure of HTML elements.
3806 */
3807 htmlAutoClose(ctxt, name);
3808
3809 /*
3810 * Check for implied HTML elements.
3811 */
3812 htmlCheckImplied(ctxt, name);
3813
3814 /*
3815 * Avoid html at any level > 0, head at any level != 1
3816 * or any attempt to recurse body
3817 */
3818 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3819 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3820 "htmlParseStartTag: misplaced <html> tag\n",
3821 name, NULL);
3822 discardtag = 1;
3823 ctxt->depth++;
3824 }
3825 if ((ctxt->nameNr != 1) &&
3826 (xmlStrEqual(name, BAD_CAST"head"))) {
3827 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3828 "htmlParseStartTag: misplaced <head> tag\n",
3829 name, NULL);
3830 discardtag = 1;
3831 ctxt->depth++;
3832 }
3833 if (xmlStrEqual(name, BAD_CAST"body")) {
3834 int indx;
3835 for (indx = 0;indx < ctxt->nameNr;indx++) {
3836 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3837 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3838 "htmlParseStartTag: misplaced <body> tag\n",
3839 name, NULL);
3840 discardtag = 1;
3841 ctxt->depth++;
3842 }
3843 }
3844 }
3845
3846 /*
3847 * Now parse the attributes, it ends up with the ending
3848 *
3849 * (S Attribute)* S?
3850 */
3851 SKIP_BLANKS;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003852 while ((CUR != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003853 (CUR != '>') &&
3854 ((CUR != '/') || (NXT(1) != '>'))) {
3855 long cons = ctxt->nbChars;
3856
3857 GROW;
3858 attname = htmlParseAttribute(ctxt, &attvalue);
3859 if (attname != NULL) {
3860
3861 /*
3862 * Well formedness requires at most one declaration of an attribute
3863 */
3864 for (i = 0; i < nbatts;i += 2) {
3865 if (xmlStrEqual(atts[i], attname)) {
3866 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3867 "Attribute %s redefined\n", attname, NULL);
3868 if (attvalue != NULL)
3869 xmlFree(attvalue);
3870 goto failed;
3871 }
3872 }
3873
3874 /*
3875 * Add the pair to atts
3876 */
3877 if (atts == NULL) {
3878 maxatts = 22; /* allow for 10 attrs by default */
3879 atts = (const xmlChar **)
3880 xmlMalloc(maxatts * sizeof(xmlChar *));
3881 if (atts == NULL) {
3882 htmlErrMemory(ctxt, NULL);
3883 if (attvalue != NULL)
3884 xmlFree(attvalue);
3885 goto failed;
3886 }
3887 ctxt->atts = atts;
3888 ctxt->maxatts = maxatts;
3889 } else if (nbatts + 4 > maxatts) {
3890 const xmlChar **n;
3891
3892 maxatts *= 2;
3893 n = (const xmlChar **) xmlRealloc((void *) atts,
3894 maxatts * sizeof(const xmlChar *));
3895 if (n == NULL) {
3896 htmlErrMemory(ctxt, NULL);
3897 if (attvalue != NULL)
3898 xmlFree(attvalue);
3899 goto failed;
3900 }
3901 atts = n;
3902 ctxt->atts = atts;
3903 ctxt->maxatts = maxatts;
3904 }
3905 atts[nbatts++] = attname;
3906 atts[nbatts++] = attvalue;
3907 atts[nbatts] = NULL;
3908 atts[nbatts + 1] = NULL;
3909 }
3910 else {
3911 if (attvalue != NULL)
3912 xmlFree(attvalue);
3913 /* Dump the bogus attribute string up to the next blank or
3914 * the end of the tag. */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003915 while ((CUR != 0) &&
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003916 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3917 ((CUR != '/') || (NXT(1) != '>')))
3918 NEXT;
3919 }
3920
3921failed:
3922 SKIP_BLANKS;
3923 if (cons == ctxt->nbChars) {
3924 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3925 "htmlParseStartTag: problem parsing attributes\n",
3926 NULL, NULL);
3927 break;
3928 }
3929 }
3930
3931 /*
3932 * Handle specific association to the META tag
3933 */
3934 if (meta && (nbatts != 0))
3935 htmlCheckMeta(ctxt, atts);
3936
3937 /*
3938 * SAX: Start of Element !
3939 */
3940 if (!discardtag) {
3941 htmlnamePush(ctxt, name);
3942 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3943 if (nbatts != 0)
3944 ctxt->sax->startElement(ctxt->userData, name, atts);
3945 else
3946 ctxt->sax->startElement(ctxt->userData, name, NULL);
3947 }
3948 }
3949
3950 if (atts != NULL) {
3951 for (i = 1;i < nbatts;i += 2) {
3952 if (atts[i] != NULL)
3953 xmlFree((xmlChar *) atts[i]);
3954 }
3955 }
3956
3957 return(discardtag);
3958}
3959
3960/**
3961 * htmlParseEndTag:
3962 * @ctxt: an HTML parser context
3963 *
3964 * parse an end of tag
3965 *
3966 * [42] ETag ::= '</' Name S? '>'
3967 *
3968 * With namespace
3969 *
3970 * [NS 9] ETag ::= '</' QName S? '>'
3971 *
3972 * Returns 1 if the current level should be closed.
3973 */
3974
3975static int
3976htmlParseEndTag(htmlParserCtxtPtr ctxt)
3977{
3978 const xmlChar *name;
3979 const xmlChar *oldname;
3980 int i, ret;
3981
3982 if ((CUR != '<') || (NXT(1) != '/')) {
3983 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3984 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3985 return (0);
3986 }
3987 SKIP(2);
3988
3989 name = htmlParseHTMLName(ctxt);
3990 if (name == NULL)
3991 return (0);
3992 /*
3993 * We should definitely be at the ending "S? '>'" part
3994 */
3995 SKIP_BLANKS;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003996 if (CUR != '>') {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08003997 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3998 "End tag : expected '>'\n", NULL, NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003999 /* Skip to next '>' */
4000 while ((CUR != 0) && (CUR != '>'))
4001 NEXT;
4002 }
4003 if (CUR == '>')
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004004 NEXT;
4005
4006 /*
4007 * if we ignored misplaced tags in htmlParseStartTag don't pop them
4008 * out now.
4009 */
4010 if ((ctxt->depth > 0) &&
4011 (xmlStrEqual(name, BAD_CAST "html") ||
4012 xmlStrEqual(name, BAD_CAST "body") ||
4013 xmlStrEqual(name, BAD_CAST "head"))) {
4014 ctxt->depth--;
4015 return (0);
4016 }
4017
4018 /*
4019 * If the name read is not one of the element in the parsing stack
4020 * then return, it's just an error.
4021 */
4022 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4023 if (xmlStrEqual(name, ctxt->nameTab[i]))
4024 break;
4025 }
4026 if (i < 0) {
4027 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4028 "Unexpected end tag : %s\n", name, NULL);
4029 return (0);
4030 }
4031
4032
4033 /*
4034 * Check for auto-closure of HTML elements.
4035 */
4036
4037 htmlAutoCloseOnClose(ctxt, name);
4038
4039 /*
4040 * Well formedness constraints, opening and closing must match.
4041 * With the exception that the autoclose may have popped stuff out
4042 * of the stack.
4043 */
4044 if (!xmlStrEqual(name, ctxt->name)) {
4045 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4046 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4047 "Opening and ending tag mismatch: %s and %s\n",
4048 name, ctxt->name);
4049 }
4050 }
4051
4052 /*
4053 * SAX: End of Tag
4054 */
4055 oldname = ctxt->name;
4056 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4057 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4058 ctxt->sax->endElement(ctxt->userData, name);
4059 htmlNodeInfoPop(ctxt);
4060 htmlnamePop(ctxt);
4061 ret = 1;
4062 } else {
4063 ret = 0;
4064 }
4065
4066 return (ret);
4067}
4068
4069
4070/**
4071 * htmlParseReference:
4072 * @ctxt: an HTML parser context
4073 *
4074 * parse and handle entity references in content,
4075 * this will end-up in a call to character() since this is either a
4076 * CharRef, or a predefined entity.
4077 */
4078static void
4079htmlParseReference(htmlParserCtxtPtr ctxt) {
4080 const htmlEntityDesc * ent;
4081 xmlChar out[6];
4082 const xmlChar *name;
4083 if (CUR != '&') return;
4084
4085 if (NXT(1) == '#') {
4086 unsigned int c;
4087 int bits, i = 0;
4088
4089 c = htmlParseCharRef(ctxt);
4090 if (c == 0)
4091 return;
4092
4093 if (c < 0x80) { out[i++]= c; bits= -6; }
4094 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4095 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4096 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4097
4098 for ( ; bits >= 0; bits-= 6) {
4099 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4100 }
4101 out[i] = 0;
4102
4103 htmlCheckParagraph(ctxt);
4104 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4105 ctxt->sax->characters(ctxt->userData, out, i);
4106 } else {
4107 ent = htmlParseEntityRef(ctxt, &name);
4108 if (name == NULL) {
4109 htmlCheckParagraph(ctxt);
4110 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4111 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4112 return;
4113 }
4114 if ((ent == NULL) || !(ent->value > 0)) {
4115 htmlCheckParagraph(ctxt);
4116 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4117 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4118 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4119 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4120 }
4121 } else {
4122 unsigned int c;
4123 int bits, i = 0;
4124
4125 c = ent->value;
4126 if (c < 0x80)
4127 { out[i++]= c; bits= -6; }
4128 else if (c < 0x800)
4129 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4130 else if (c < 0x10000)
4131 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4132 else
4133 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4134
4135 for ( ; bits >= 0; bits-= 6) {
4136 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4137 }
4138 out[i] = 0;
4139
4140 htmlCheckParagraph(ctxt);
4141 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4142 ctxt->sax->characters(ctxt->userData, out, i);
4143 }
4144 }
4145}
4146
4147/**
4148 * htmlParseContent:
4149 * @ctxt: an HTML parser context
4150 *
4151 * Parse a content: comment, sub-element, reference or text.
4152 * Kept for compatibility with old code
4153 */
4154
4155static void
4156htmlParseContent(htmlParserCtxtPtr ctxt) {
4157 xmlChar *currentNode;
4158 int depth;
4159 const xmlChar *name;
4160
4161 currentNode = xmlStrdup(ctxt->name);
4162 depth = ctxt->nameNr;
4163 while (1) {
4164 long cons = ctxt->nbChars;
4165
4166 GROW;
4167
4168 if (ctxt->instate == XML_PARSER_EOF)
4169 break;
4170
4171 /*
4172 * Our tag or one of it's parent or children is ending.
4173 */
4174 if ((CUR == '<') && (NXT(1) == '/')) {
4175 if (htmlParseEndTag(ctxt) &&
4176 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4177 if (currentNode != NULL)
4178 xmlFree(currentNode);
4179 return;
4180 }
4181 continue; /* while */
4182 }
4183
4184 else if ((CUR == '<') &&
4185 ((IS_ASCII_LETTER(NXT(1))) ||
4186 (NXT(1) == '_') || (NXT(1) == ':'))) {
4187 name = htmlParseHTMLName_nonInvasive(ctxt);
4188 if (name == NULL) {
4189 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4190 "htmlParseStartTag: invalid element name\n",
4191 NULL, NULL);
4192 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004193 while ((CUR != 0) && (CUR != '>'))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004194 NEXT;
4195
4196 if (currentNode != NULL)
4197 xmlFree(currentNode);
4198 return;
4199 }
4200
4201 if (ctxt->name != NULL) {
4202 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4203 htmlAutoClose(ctxt, name);
4204 continue;
4205 }
4206 }
4207 }
4208
4209 /*
4210 * Has this node been popped out during parsing of
4211 * the next element
4212 */
4213 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4214 (!xmlStrEqual(currentNode, ctxt->name)))
4215 {
4216 if (currentNode != NULL) xmlFree(currentNode);
4217 return;
4218 }
4219
4220 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4221 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4222 /*
4223 * Handle SCRIPT/STYLE separately
4224 */
4225 htmlParseScript(ctxt);
4226 } else {
4227 /*
4228 * Sometimes DOCTYPE arrives in the middle of the document
4229 */
4230 if ((CUR == '<') && (NXT(1) == '!') &&
4231 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4232 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4233 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4234 (UPP(8) == 'E')) {
4235 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4236 "Misplaced DOCTYPE declaration\n",
4237 BAD_CAST "DOCTYPE" , NULL);
4238 htmlParseDocTypeDecl(ctxt);
4239 }
4240
4241 /*
4242 * First case : a comment
4243 */
4244 if ((CUR == '<') && (NXT(1) == '!') &&
4245 (NXT(2) == '-') && (NXT(3) == '-')) {
4246 htmlParseComment(ctxt);
4247 }
4248
4249 /*
4250 * Second case : a Processing Instruction.
4251 */
4252 else if ((CUR == '<') && (NXT(1) == '?')) {
4253 htmlParsePI(ctxt);
4254 }
4255
4256 /*
4257 * Third case : a sub-element.
4258 */
4259 else if (CUR == '<') {
4260 htmlParseElement(ctxt);
4261 }
4262
4263 /*
4264 * Fourth case : a reference. If if has not been resolved,
4265 * parsing returns it's Name, create the node
4266 */
4267 else if (CUR == '&') {
4268 htmlParseReference(ctxt);
4269 }
4270
4271 /*
4272 * Fifth case : end of the resource
4273 */
4274 else if (CUR == 0) {
4275 htmlAutoCloseOnEnd(ctxt);
4276 break;
4277 }
4278
4279 /*
4280 * Last case, text. Note that References are handled directly.
4281 */
4282 else {
4283 htmlParseCharData(ctxt);
4284 }
4285
4286 if (cons == ctxt->nbChars) {
4287 if (ctxt->node != NULL) {
4288 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4289 "detected an error in element content\n",
4290 NULL, NULL);
4291 }
4292 break;
4293 }
4294 }
4295 GROW;
4296 }
4297 if (currentNode != NULL) xmlFree(currentNode);
4298}
4299
4300/**
4301 * htmlParseElement:
4302 * @ctxt: an HTML parser context
4303 *
4304 * parse an HTML element, this is highly recursive
4305 * this is kept for compatibility with previous code versions
4306 *
4307 * [39] element ::= EmptyElemTag | STag content ETag
4308 *
4309 * [41] Attribute ::= Name Eq AttValue
4310 */
4311
4312void
4313htmlParseElement(htmlParserCtxtPtr ctxt) {
4314 const xmlChar *name;
4315 xmlChar *currentNode = NULL;
4316 const htmlElemDesc * info;
4317 htmlParserNodeInfo node_info;
4318 int failed;
4319 int depth;
4320 const xmlChar *oldptr;
4321
4322 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4323 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4324 "htmlParseElement: context error\n", NULL, NULL);
4325 return;
4326 }
4327
4328 if (ctxt->instate == XML_PARSER_EOF)
4329 return;
4330
4331 /* Capture start position */
4332 if (ctxt->record_info) {
4333 node_info.begin_pos = ctxt->input->consumed +
4334 (CUR_PTR - ctxt->input->base);
4335 node_info.begin_line = ctxt->input->line;
4336 }
4337
4338 failed = htmlParseStartTag(ctxt);
4339 name = ctxt->name;
4340 if ((failed == -1) || (name == NULL)) {
4341 if (CUR == '>')
4342 NEXT;
4343 return;
4344 }
4345
4346 /*
4347 * Lookup the info for that element.
4348 */
4349 info = htmlTagLookup(name);
4350 if (info == NULL) {
4351 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4352 "Tag %s invalid\n", name, NULL);
4353 }
4354
4355 /*
4356 * Check for an Empty Element labeled the XML/SGML way
4357 */
4358 if ((CUR == '/') && (NXT(1) == '>')) {
4359 SKIP(2);
4360 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4361 ctxt->sax->endElement(ctxt->userData, name);
4362 htmlnamePop(ctxt);
4363 return;
4364 }
4365
4366 if (CUR == '>') {
4367 NEXT;
4368 } else {
4369 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4370 "Couldn't find end of Start Tag %s\n", name, NULL);
4371
4372 /*
4373 * end of parsing of this node.
4374 */
4375 if (xmlStrEqual(name, ctxt->name)) {
4376 nodePop(ctxt);
4377 htmlnamePop(ctxt);
4378 }
4379
4380 /*
4381 * Capture end position and add node
4382 */
4383 if (ctxt->record_info) {
4384 node_info.end_pos = ctxt->input->consumed +
4385 (CUR_PTR - ctxt->input->base);
4386 node_info.end_line = ctxt->input->line;
4387 node_info.node = ctxt->node;
4388 xmlParserAddNodeInfo(ctxt, &node_info);
4389 }
4390 return;
4391 }
4392
4393 /*
4394 * Check for an Empty Element from DTD definition
4395 */
4396 if ((info != NULL) && (info->empty)) {
4397 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4398 ctxt->sax->endElement(ctxt->userData, name);
4399 htmlnamePop(ctxt);
4400 return;
4401 }
4402
4403 /*
4404 * Parse the content of the element:
4405 */
4406 currentNode = xmlStrdup(ctxt->name);
4407 depth = ctxt->nameNr;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004408 while (CUR != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004409 oldptr = ctxt->input->cur;
4410 htmlParseContent(ctxt);
4411 if (oldptr==ctxt->input->cur) break;
4412 if (ctxt->nameNr < depth) break;
4413 }
4414
4415 /*
4416 * Capture end position and add node
4417 */
4418 if ( currentNode != NULL && ctxt->record_info ) {
4419 node_info.end_pos = ctxt->input->consumed +
4420 (CUR_PTR - ctxt->input->base);
4421 node_info.end_line = ctxt->input->line;
4422 node_info.node = ctxt->node;
4423 xmlParserAddNodeInfo(ctxt, &node_info);
4424 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004425 if (CUR == 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004426 htmlAutoCloseOnEnd(ctxt);
4427 }
4428
4429 if (currentNode != NULL)
4430 xmlFree(currentNode);
4431}
4432
4433static void
4434htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4435 /*
4436 * Capture end position and add node
4437 */
4438 if ( ctxt->node != NULL && ctxt->record_info ) {
4439 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4440 (CUR_PTR - ctxt->input->base);
4441 ctxt->nodeInfo->end_line = ctxt->input->line;
4442 ctxt->nodeInfo->node = ctxt->node;
4443 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4444 htmlNodeInfoPop(ctxt);
4445 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004446 if (CUR == 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004447 htmlAutoCloseOnEnd(ctxt);
4448 }
4449}
4450
4451/**
4452 * htmlParseElementInternal:
4453 * @ctxt: an HTML parser context
4454 *
4455 * parse an HTML element, new version, non recursive
4456 *
4457 * [39] element ::= EmptyElemTag | STag content ETag
4458 *
4459 * [41] Attribute ::= Name Eq AttValue
4460 */
4461
4462static void
4463htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4464 const xmlChar *name;
4465 const htmlElemDesc * info;
4466 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4467 int failed;
4468
4469 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4470 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4471 "htmlParseElementInternal: context error\n", NULL, NULL);
4472 return;
4473 }
4474
4475 if (ctxt->instate == XML_PARSER_EOF)
4476 return;
4477
4478 /* Capture start position */
4479 if (ctxt->record_info) {
4480 node_info.begin_pos = ctxt->input->consumed +
4481 (CUR_PTR - ctxt->input->base);
4482 node_info.begin_line = ctxt->input->line;
4483 }
4484
4485 failed = htmlParseStartTag(ctxt);
4486 name = ctxt->name;
4487 if ((failed == -1) || (name == NULL)) {
4488 if (CUR == '>')
4489 NEXT;
4490 return;
4491 }
4492
4493 /*
4494 * Lookup the info for that element.
4495 */
4496 info = htmlTagLookup(name);
4497 if (info == NULL) {
4498 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4499 "Tag %s invalid\n", name, NULL);
4500 }
4501
4502 /*
4503 * Check for an Empty Element labeled the XML/SGML way
4504 */
4505 if ((CUR == '/') && (NXT(1) == '>')) {
4506 SKIP(2);
4507 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4508 ctxt->sax->endElement(ctxt->userData, name);
4509 htmlnamePop(ctxt);
4510 return;
4511 }
4512
4513 if (CUR == '>') {
4514 NEXT;
4515 } else {
4516 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4517 "Couldn't find end of Start Tag %s\n", name, NULL);
4518
4519 /*
4520 * end of parsing of this node.
4521 */
4522 if (xmlStrEqual(name, ctxt->name)) {
4523 nodePop(ctxt);
4524 htmlnamePop(ctxt);
4525 }
4526
4527 if (ctxt->record_info)
4528 htmlNodeInfoPush(ctxt, &node_info);
4529 htmlParserFinishElementParsing(ctxt);
4530 return;
4531 }
4532
4533 /*
4534 * Check for an Empty Element from DTD definition
4535 */
4536 if ((info != NULL) && (info->empty)) {
4537 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4538 ctxt->sax->endElement(ctxt->userData, name);
4539 htmlnamePop(ctxt);
4540 return;
4541 }
4542
4543 if (ctxt->record_info)
4544 htmlNodeInfoPush(ctxt, &node_info);
4545}
4546
4547/**
4548 * htmlParseContentInternal:
4549 * @ctxt: an HTML parser context
4550 *
4551 * Parse a content: comment, sub-element, reference or text.
4552 * New version for non recursive htmlParseElementInternal
4553 */
4554
4555static void
4556htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4557 xmlChar *currentNode;
4558 int depth;
4559 const xmlChar *name;
4560
4561 currentNode = xmlStrdup(ctxt->name);
4562 depth = ctxt->nameNr;
4563 while (1) {
4564 long cons = ctxt->nbChars;
4565
4566 GROW;
4567
4568 if (ctxt->instate == XML_PARSER_EOF)
4569 break;
4570
4571 /*
4572 * Our tag or one of it's parent or children is ending.
4573 */
4574 if ((CUR == '<') && (NXT(1) == '/')) {
4575 if (htmlParseEndTag(ctxt) &&
4576 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4577 if (currentNode != NULL)
4578 xmlFree(currentNode);
4579
4580 currentNode = xmlStrdup(ctxt->name);
4581 depth = ctxt->nameNr;
4582 }
4583 continue; /* while */
4584 }
4585
4586 else if ((CUR == '<') &&
4587 ((IS_ASCII_LETTER(NXT(1))) ||
4588 (NXT(1) == '_') || (NXT(1) == ':'))) {
4589 name = htmlParseHTMLName_nonInvasive(ctxt);
4590 if (name == NULL) {
4591 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4592 "htmlParseStartTag: invalid element name\n",
4593 NULL, NULL);
4594 /* Dump the bogus tag like browsers do */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004595 while ((CUR == 0) && (CUR != '>'))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08004596 NEXT;
4597
4598 htmlParserFinishElementParsing(ctxt);
4599 if (currentNode != NULL)
4600 xmlFree(currentNode);
4601
4602 currentNode = xmlStrdup(ctxt->name);
4603 depth = ctxt->nameNr;
4604 continue;
4605 }
4606
4607 if (ctxt->name != NULL) {
4608 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4609 htmlAutoClose(ctxt, name);
4610 continue;
4611 }
4612 }
4613 }
4614
4615 /*
4616 * Has this node been popped out during parsing of
4617 * the next element
4618 */
4619 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4620 (!xmlStrEqual(currentNode, ctxt->name)))
4621 {
4622 htmlParserFinishElementParsing(ctxt);
4623 if (currentNode != NULL) xmlFree(currentNode);
4624
4625 currentNode = xmlStrdup(ctxt->name);
4626 depth = ctxt->nameNr;
4627 continue;
4628 }
4629
4630 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4631 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4632 /*
4633 * Handle SCRIPT/STYLE separately
4634 */
4635 htmlParseScript(ctxt);
4636 } else {
4637 /*
4638 * Sometimes DOCTYPE arrives in the middle of the document
4639 */
4640 if ((CUR == '<') && (NXT(1) == '!') &&
4641 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4642 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4643 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4644 (UPP(8) == 'E')) {
4645 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4646 "Misplaced DOCTYPE declaration\n",
4647 BAD_CAST "DOCTYPE" , NULL);
4648 htmlParseDocTypeDecl(ctxt);
4649 }
4650
4651 /*
4652 * First case : a comment
4653 */
4654 if ((CUR == '<') && (NXT(1) == '!') &&
4655 (NXT(2) == '-') && (NXT(3) == '-')) {
4656 htmlParseComment(ctxt);
4657 }
4658
4659 /*
4660 * Second case : a Processing Instruction.
4661 */
4662 else if ((CUR == '<') && (NXT(1) == '?')) {
4663 htmlParsePI(ctxt);
4664 }
4665
4666 /*
4667 * Third case : a sub-element.
4668 */
4669 else if (CUR == '<') {
4670 htmlParseElementInternal(ctxt);
4671 if (currentNode != NULL) xmlFree(currentNode);
4672
4673 currentNode = xmlStrdup(ctxt->name);
4674 depth = ctxt->nameNr;
4675 }
4676
4677 /*
4678 * Fourth case : a reference. If if has not been resolved,
4679 * parsing returns it's Name, create the node
4680 */
4681 else if (CUR == '&') {
4682 htmlParseReference(ctxt);
4683 }
4684
4685 /*
4686 * Fifth case : end of the resource
4687 */
4688 else if (CUR == 0) {
4689 htmlAutoCloseOnEnd(ctxt);
4690 break;
4691 }
4692
4693 /*
4694 * Last case, text. Note that References are handled directly.
4695 */
4696 else {
4697 htmlParseCharData(ctxt);
4698 }
4699
4700 if (cons == ctxt->nbChars) {
4701 if (ctxt->node != NULL) {
4702 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4703 "detected an error in element content\n",
4704 NULL, NULL);
4705 }
4706 break;
4707 }
4708 }
4709 GROW;
4710 }
4711 if (currentNode != NULL) xmlFree(currentNode);
4712}
4713
4714/**
4715 * htmlParseContent:
4716 * @ctxt: an HTML parser context
4717 *
4718 * Parse a content: comment, sub-element, reference or text.
4719 * This is the entry point when called from parser.c
4720 */
4721
4722void
4723__htmlParseContent(void *ctxt) {
4724 if (ctxt != NULL)
4725 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4726}
4727
4728/**
4729 * htmlParseDocument:
4730 * @ctxt: an HTML parser context
4731 *
4732 * parse an HTML document (and build a tree if using the standard SAX
4733 * interface).
4734 *
4735 * Returns 0, -1 in case of error. the parser context is augmented
4736 * as a result of the parsing.
4737 */
4738
4739int
4740htmlParseDocument(htmlParserCtxtPtr ctxt) {
4741 xmlChar start[4];
4742 xmlCharEncoding enc;
4743 xmlDtdPtr dtd;
4744
4745 xmlInitParser();
4746
4747 htmlDefaultSAXHandlerInit();
4748
4749 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4750 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4751 "htmlParseDocument: context error\n", NULL, NULL);
4752 return(XML_ERR_INTERNAL_ERROR);
4753 }
4754 ctxt->html = 1;
4755 ctxt->linenumbers = 1;
4756 GROW;
4757 /*
4758 * SAX: beginning of the document processing.
4759 */
4760 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4761 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4762
4763 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4764 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4765 /*
4766 * Get the 4 first bytes and decode the charset
4767 * if enc != XML_CHAR_ENCODING_NONE
4768 * plug some encoding conversion routines.
4769 */
4770 start[0] = RAW;
4771 start[1] = NXT(1);
4772 start[2] = NXT(2);
4773 start[3] = NXT(3);
4774 enc = xmlDetectCharEncoding(&start[0], 4);
4775 if (enc != XML_CHAR_ENCODING_NONE) {
4776 xmlSwitchEncoding(ctxt, enc);
4777 }
4778 }
4779
4780 /*
4781 * Wipe out everything which is before the first '<'
4782 */
4783 SKIP_BLANKS;
4784 if (CUR == 0) {
4785 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4786 "Document is empty\n", NULL, NULL);
4787 }
4788
4789 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4790 ctxt->sax->startDocument(ctxt->userData);
4791
4792
4793 /*
4794 * Parse possible comments and PIs before any content
4795 */
4796 while (((CUR == '<') && (NXT(1) == '!') &&
4797 (NXT(2) == '-') && (NXT(3) == '-')) ||
4798 ((CUR == '<') && (NXT(1) == '?'))) {
4799 htmlParseComment(ctxt);
4800 htmlParsePI(ctxt);
4801 SKIP_BLANKS;
4802 }
4803
4804
4805 /*
4806 * Then possibly doc type declaration(s) and more Misc
4807 * (doctypedecl Misc*)?
4808 */
4809 if ((CUR == '<') && (NXT(1) == '!') &&
4810 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4811 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4812 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4813 (UPP(8) == 'E')) {
4814 htmlParseDocTypeDecl(ctxt);
4815 }
4816 SKIP_BLANKS;
4817
4818 /*
4819 * Parse possible comments and PIs before any content
4820 */
4821 while (((CUR == '<') && (NXT(1) == '!') &&
4822 (NXT(2) == '-') && (NXT(3) == '-')) ||
4823 ((CUR == '<') && (NXT(1) == '?'))) {
4824 htmlParseComment(ctxt);
4825 htmlParsePI(ctxt);
4826 SKIP_BLANKS;
4827 }
4828
4829 /*
4830 * Time to start parsing the tree itself
4831 */
4832 htmlParseContentInternal(ctxt);
4833
4834 /*
4835 * autoclose
4836 */
4837 if (CUR == 0)
4838 htmlAutoCloseOnEnd(ctxt);
4839
4840
4841 /*
4842 * SAX: end of the document processing.
4843 */
4844 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4845 ctxt->sax->endDocument(ctxt->userData);
4846
4847 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4848 dtd = xmlGetIntSubset(ctxt->myDoc);
4849 if (dtd == NULL)
4850 ctxt->myDoc->intSubset =
4851 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4852 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4853 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4854 }
4855 if (! ctxt->wellFormed) return(-1);
4856 return(0);
4857}
4858
4859
4860/************************************************************************
4861 * *
4862 * Parser contexts handling *
4863 * *
4864 ************************************************************************/
4865
4866/**
4867 * htmlInitParserCtxt:
4868 * @ctxt: an HTML parser context
4869 *
4870 * Initialize a parser context
4871 *
4872 * Returns 0 in case of success and -1 in case of error
4873 */
4874
4875static int
4876htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4877{
4878 htmlSAXHandler *sax;
4879
4880 if (ctxt == NULL) return(-1);
4881 memset(ctxt, 0, sizeof(htmlParserCtxt));
4882
4883 ctxt->dict = xmlDictCreate();
4884 if (ctxt->dict == NULL) {
4885 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4886 return(-1);
4887 }
4888 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4889 if (sax == NULL) {
4890 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4891 return(-1);
4892 }
4893 else
4894 memset(sax, 0, sizeof(htmlSAXHandler));
4895
4896 /* Allocate the Input stack */
4897 ctxt->inputTab = (htmlParserInputPtr *)
4898 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4899 if (ctxt->inputTab == NULL) {
4900 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4901 ctxt->inputNr = 0;
4902 ctxt->inputMax = 0;
4903 ctxt->input = NULL;
4904 return(-1);
4905 }
4906 ctxt->inputNr = 0;
4907 ctxt->inputMax = 5;
4908 ctxt->input = NULL;
4909 ctxt->version = NULL;
4910 ctxt->encoding = NULL;
4911 ctxt->standalone = -1;
4912 ctxt->instate = XML_PARSER_START;
4913
4914 /* Allocate the Node stack */
4915 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4916 if (ctxt->nodeTab == NULL) {
4917 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4918 ctxt->nodeNr = 0;
4919 ctxt->nodeMax = 0;
4920 ctxt->node = NULL;
4921 ctxt->inputNr = 0;
4922 ctxt->inputMax = 0;
4923 ctxt->input = NULL;
4924 return(-1);
4925 }
4926 ctxt->nodeNr = 0;
4927 ctxt->nodeMax = 10;
4928 ctxt->node = NULL;
4929
4930 /* Allocate the Name stack */
4931 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4932 if (ctxt->nameTab == NULL) {
4933 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4934 ctxt->nameNr = 0;
4935 ctxt->nameMax = 0;
4936 ctxt->name = NULL;
4937 ctxt->nodeNr = 0;
4938 ctxt->nodeMax = 0;
4939 ctxt->node = NULL;
4940 ctxt->inputNr = 0;
4941 ctxt->inputMax = 0;
4942 ctxt->input = NULL;
4943 return(-1);
4944 }
4945 ctxt->nameNr = 0;
4946 ctxt->nameMax = 10;
4947 ctxt->name = NULL;
4948
4949 ctxt->nodeInfoTab = NULL;
4950 ctxt->nodeInfoNr = 0;
4951 ctxt->nodeInfoMax = 0;
4952
4953 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4954 else {
4955 ctxt->sax = sax;
4956 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4957 }
4958 ctxt->userData = ctxt;
4959 ctxt->myDoc = NULL;
4960 ctxt->wellFormed = 1;
4961 ctxt->replaceEntities = 0;
4962 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4963 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4964 ctxt->html = 1;
4965 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4966 ctxt->vctxt.userData = ctxt;
4967 ctxt->vctxt.error = xmlParserValidityError;
4968 ctxt->vctxt.warning = xmlParserValidityWarning;
4969 ctxt->record_info = 0;
4970 ctxt->validate = 0;
4971 ctxt->nbChars = 0;
4972 ctxt->checkIndex = 0;
4973 ctxt->catalogs = NULL;
4974 xmlInitNodeInfoSeq(&ctxt->node_seq);
4975 return(0);
4976}
4977
4978/**
4979 * htmlFreeParserCtxt:
4980 * @ctxt: an HTML parser context
4981 *
4982 * Free all the memory used by a parser context. However the parsed
4983 * document in ctxt->myDoc is not freed.
4984 */
4985
4986void
4987htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4988{
4989 xmlFreeParserCtxt(ctxt);
4990}
4991
4992/**
4993 * htmlNewParserCtxt:
4994 *
4995 * Allocate and initialize a new parser context.
4996 *
4997 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4998 */
4999
5000htmlParserCtxtPtr
5001htmlNewParserCtxt(void)
5002{
5003 xmlParserCtxtPtr ctxt;
5004
5005 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5006 if (ctxt == NULL) {
5007 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5008 return(NULL);
5009 }
5010 memset(ctxt, 0, sizeof(xmlParserCtxt));
5011 if (htmlInitParserCtxt(ctxt) < 0) {
5012 htmlFreeParserCtxt(ctxt);
5013 return(NULL);
5014 }
5015 return(ctxt);
5016}
5017
5018/**
5019 * htmlCreateMemoryParserCtxt:
5020 * @buffer: a pointer to a char array
5021 * @size: the size of the array
5022 *
5023 * Create a parser context for an HTML in-memory document.
5024 *
5025 * Returns the new parser context or NULL
5026 */
5027htmlParserCtxtPtr
5028htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5029 xmlParserCtxtPtr ctxt;
5030 xmlParserInputPtr input;
5031 xmlParserInputBufferPtr buf;
5032
5033 if (buffer == NULL)
5034 return(NULL);
5035 if (size <= 0)
5036 return(NULL);
5037
5038 ctxt = htmlNewParserCtxt();
5039 if (ctxt == NULL)
5040 return(NULL);
5041
5042 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5043 if (buf == NULL) return(NULL);
5044
5045 input = xmlNewInputStream(ctxt);
5046 if (input == NULL) {
5047 xmlFreeParserCtxt(ctxt);
5048 return(NULL);
5049 }
5050
5051 input->filename = NULL;
5052 input->buf = buf;
5053 xmlBufResetInput(buf->buffer, input);
5054
5055 inputPush(ctxt, input);
5056 return(ctxt);
5057}
5058
5059/**
5060 * htmlCreateDocParserCtxt:
5061 * @cur: a pointer to an array of xmlChar
5062 * @encoding: a free form C string describing the HTML document encoding, or NULL
5063 *
5064 * Create a parser context for an HTML document.
5065 *
5066 * TODO: check the need to add encoding handling there
5067 *
5068 * Returns the new parser context or NULL
5069 */
5070static htmlParserCtxtPtr
5071htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5072 int len;
5073 htmlParserCtxtPtr ctxt;
5074
5075 if (cur == NULL)
5076 return(NULL);
5077 len = xmlStrlen(cur);
5078 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5079 if (ctxt == NULL)
5080 return(NULL);
5081
5082 if (encoding != NULL) {
5083 xmlCharEncoding enc;
5084 xmlCharEncodingHandlerPtr handler;
5085
5086 if (ctxt->input->encoding != NULL)
5087 xmlFree((xmlChar *) ctxt->input->encoding);
5088 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5089
5090 enc = xmlParseCharEncoding(encoding);
5091 /*
5092 * registered set of known encodings
5093 */
5094 if (enc != XML_CHAR_ENCODING_ERROR) {
5095 xmlSwitchEncoding(ctxt, enc);
5096 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5097 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5098 "Unsupported encoding %s\n",
5099 (const xmlChar *) encoding, NULL);
5100 }
5101 } else {
5102 /*
5103 * fallback for unknown encodings
5104 */
5105 handler = xmlFindCharEncodingHandler((const char *) encoding);
5106 if (handler != NULL) {
5107 xmlSwitchToEncoding(ctxt, handler);
5108 } else {
5109 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5110 "Unsupported encoding %s\n",
5111 (const xmlChar *) encoding, NULL);
5112 }
5113 }
5114 }
5115 return(ctxt);
5116}
5117
5118#ifdef LIBXML_PUSH_ENABLED
5119/************************************************************************
5120 * *
5121 * Progressive parsing interfaces *
5122 * *
5123 ************************************************************************/
5124
5125/**
5126 * htmlParseLookupSequence:
5127 * @ctxt: an HTML parser context
5128 * @first: the first char to lookup
5129 * @next: the next char to lookup or zero
5130 * @third: the next char to lookup or zero
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005131 * @ignoreattrval: skip over attribute values
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005132 *
5133 * Try to find if a sequence (first, next, third) or just (first next) or
5134 * (first) is available in the input stream.
5135 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5136 * to avoid rescanning sequences of bytes, it DOES change the state of the
5137 * parser, do not use liberally.
5138 * This is basically similar to xmlParseLookupSequence()
5139 *
5140 * Returns the index to the current parsing point if the full sequence
5141 * is available, -1 otherwise.
5142 */
5143static int
5144htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005145 xmlChar next, xmlChar third, int ignoreattrval)
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005146{
5147 int base, len;
5148 htmlParserInputPtr in;
5149 const xmlChar *buf;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005150 int invalue = 0;
5151 char valdellim = 0x0;
5152
5153 in = ctxt->input;
5154 if (in == NULL)
5155 return (-1);
5156
5157 base = in->cur - in->base;
5158 if (base < 0)
5159 return (-1);
5160
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005161 if (ctxt->checkIndex > base) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005162 base = ctxt->checkIndex;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005163 /* Abuse hasPErefs member to restore current state. */
5164 invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5165 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005166
5167 if (in->buf == NULL) {
5168 buf = in->base;
5169 len = in->length;
5170 } else {
5171 buf = xmlBufContent(in->buf->buffer);
5172 len = xmlBufUse(in->buf->buffer);
5173 }
5174
5175 /* take into account the sequence length */
5176 if (third)
5177 len -= 2;
5178 else if (next)
5179 len--;
5180 for (; base < len; base++) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005181 if (ignoreattrval) {
5182 if (buf[base] == '"' || buf[base] == '\'') {
5183 if (invalue) {
5184 if (buf[base] == valdellim) {
5185 invalue = 0;
5186 continue;
5187 }
5188 } else {
5189 valdellim = buf[base];
5190 invalue = 1;
5191 continue;
5192 }
5193 } else if (invalue) {
5194 continue;
5195 }
5196 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005197 if (buf[base] == first) {
5198 if (third != 0) {
5199 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5200 continue;
5201 } else if (next != 0) {
5202 if (buf[base + 1] != next)
5203 continue;
5204 }
5205 ctxt->checkIndex = 0;
5206#ifdef DEBUG_PUSH
5207 if (next == 0)
5208 xmlGenericError(xmlGenericErrorContext,
5209 "HPP: lookup '%c' found at %d\n",
5210 first, base);
5211 else if (third == 0)
5212 xmlGenericError(xmlGenericErrorContext,
5213 "HPP: lookup '%c%c' found at %d\n",
5214 first, next, base);
5215 else
5216 xmlGenericError(xmlGenericErrorContext,
5217 "HPP: lookup '%c%c%c' found at %d\n",
5218 first, next, third, base);
5219#endif
5220 return (base - (in->cur - in->base));
5221 }
5222 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005223 ctxt->checkIndex = base;
5224 /* Abuse hasPErefs member to track current state. */
5225 if (invalue)
5226 ctxt->hasPErefs |= 1;
5227 else
5228 ctxt->hasPErefs &= ~1;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005229#ifdef DEBUG_PUSH
5230 if (next == 0)
5231 xmlGenericError(xmlGenericErrorContext,
5232 "HPP: lookup '%c' failed\n", first);
5233 else if (third == 0)
5234 xmlGenericError(xmlGenericErrorContext,
5235 "HPP: lookup '%c%c' failed\n", first, next);
5236 else
5237 xmlGenericError(xmlGenericErrorContext,
5238 "HPP: lookup '%c%c%c' failed\n", first, next,
5239 third);
5240#endif
5241 return (-1);
5242}
5243
5244/**
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005245 * htmlParseTryOrFinish:
5246 * @ctxt: an HTML parser context
5247 * @terminate: last chunk indicator
5248 *
5249 * Try to progress on parsing
5250 *
5251 * Returns zero if no parsing was possible
5252 */
5253static int
5254htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5255 int ret = 0;
5256 htmlParserInputPtr in;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005257 ptrdiff_t avail = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005258 xmlChar cur, next;
5259
5260 htmlParserNodeInfo node_info;
5261
5262#ifdef DEBUG_PUSH
5263 switch (ctxt->instate) {
5264 case XML_PARSER_EOF:
5265 xmlGenericError(xmlGenericErrorContext,
5266 "HPP: try EOF\n"); break;
5267 case XML_PARSER_START:
5268 xmlGenericError(xmlGenericErrorContext,
5269 "HPP: try START\n"); break;
5270 case XML_PARSER_MISC:
5271 xmlGenericError(xmlGenericErrorContext,
5272 "HPP: try MISC\n");break;
5273 case XML_PARSER_COMMENT:
5274 xmlGenericError(xmlGenericErrorContext,
5275 "HPP: try COMMENT\n");break;
5276 case XML_PARSER_PROLOG:
5277 xmlGenericError(xmlGenericErrorContext,
5278 "HPP: try PROLOG\n");break;
5279 case XML_PARSER_START_TAG:
5280 xmlGenericError(xmlGenericErrorContext,
5281 "HPP: try START_TAG\n");break;
5282 case XML_PARSER_CONTENT:
5283 xmlGenericError(xmlGenericErrorContext,
5284 "HPP: try CONTENT\n");break;
5285 case XML_PARSER_CDATA_SECTION:
5286 xmlGenericError(xmlGenericErrorContext,
5287 "HPP: try CDATA_SECTION\n");break;
5288 case XML_PARSER_END_TAG:
5289 xmlGenericError(xmlGenericErrorContext,
5290 "HPP: try END_TAG\n");break;
5291 case XML_PARSER_ENTITY_DECL:
5292 xmlGenericError(xmlGenericErrorContext,
5293 "HPP: try ENTITY_DECL\n");break;
5294 case XML_PARSER_ENTITY_VALUE:
5295 xmlGenericError(xmlGenericErrorContext,
5296 "HPP: try ENTITY_VALUE\n");break;
5297 case XML_PARSER_ATTRIBUTE_VALUE:
5298 xmlGenericError(xmlGenericErrorContext,
5299 "HPP: try ATTRIBUTE_VALUE\n");break;
5300 case XML_PARSER_DTD:
5301 xmlGenericError(xmlGenericErrorContext,
5302 "HPP: try DTD\n");break;
5303 case XML_PARSER_EPILOG:
5304 xmlGenericError(xmlGenericErrorContext,
5305 "HPP: try EPILOG\n");break;
5306 case XML_PARSER_PI:
5307 xmlGenericError(xmlGenericErrorContext,
5308 "HPP: try PI\n");break;
5309 case XML_PARSER_SYSTEM_LITERAL:
5310 xmlGenericError(xmlGenericErrorContext,
5311 "HPP: try SYSTEM_LITERAL\n");break;
5312 }
5313#endif
5314
5315 while (1) {
5316
5317 in = ctxt->input;
5318 if (in == NULL) break;
5319 if (in->buf == NULL)
5320 avail = in->length - (in->cur - in->base);
5321 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005322 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5323 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005324 if ((avail == 0) && (terminate)) {
5325 htmlAutoCloseOnEnd(ctxt);
5326 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5327 /*
5328 * SAX: end of the document processing.
5329 */
5330 ctxt->instate = XML_PARSER_EOF;
5331 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5332 ctxt->sax->endDocument(ctxt->userData);
5333 }
5334 }
5335 if (avail < 1)
5336 goto done;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005337 /*
5338 * This is done to make progress and avoid an infinite loop
5339 * if a parsing attempt was aborted by hitting a NUL byte. After
5340 * changing htmlCurrentChar, this probably isn't necessary anymore.
5341 * We should consider removing this check.
5342 */
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005343 cur = in->cur[0];
5344 if (cur == 0) {
5345 SKIP(1);
5346 continue;
5347 }
5348
5349 switch (ctxt->instate) {
5350 case XML_PARSER_EOF:
5351 /*
5352 * Document parsing is done !
5353 */
5354 goto done;
5355 case XML_PARSER_START:
5356 /*
5357 * Very first chars read from the document flow.
5358 */
5359 cur = in->cur[0];
5360 if (IS_BLANK_CH(cur)) {
5361 SKIP_BLANKS;
5362 if (in->buf == NULL)
5363 avail = in->length - (in->cur - in->base);
5364 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005365 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5366 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005367 }
5368 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5369 ctxt->sax->setDocumentLocator(ctxt->userData,
5370 &xmlDefaultSAXLocator);
5371 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5372 (!ctxt->disableSAX))
5373 ctxt->sax->startDocument(ctxt->userData);
5374
5375 cur = in->cur[0];
5376 next = in->cur[1];
5377 if ((cur == '<') && (next == '!') &&
5378 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5379 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5380 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5381 (UPP(8) == 'E')) {
5382 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005383 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005384 goto done;
5385#ifdef DEBUG_PUSH
5386 xmlGenericError(xmlGenericErrorContext,
5387 "HPP: Parsing internal subset\n");
5388#endif
5389 htmlParseDocTypeDecl(ctxt);
5390 ctxt->instate = XML_PARSER_PROLOG;
5391#ifdef DEBUG_PUSH
5392 xmlGenericError(xmlGenericErrorContext,
5393 "HPP: entering PROLOG\n");
5394#endif
5395 } else {
5396 ctxt->instate = XML_PARSER_MISC;
5397#ifdef DEBUG_PUSH
5398 xmlGenericError(xmlGenericErrorContext,
5399 "HPP: entering MISC\n");
5400#endif
5401 }
5402 break;
5403 case XML_PARSER_MISC:
5404 SKIP_BLANKS;
5405 if (in->buf == NULL)
5406 avail = in->length - (in->cur - in->base);
5407 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005408 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5409 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005410 /*
5411 * no chars in buffer
5412 */
5413 if (avail < 1)
5414 goto done;
5415 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005416 * not enough chars in buffer
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005417 */
5418 if (avail < 2) {
5419 if (!terminate)
5420 goto done;
5421 else
5422 next = ' ';
5423 } else {
5424 next = in->cur[1];
5425 }
5426 cur = in->cur[0];
5427 if ((cur == '<') && (next == '!') &&
5428 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5429 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005430 (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005431 goto done;
5432#ifdef DEBUG_PUSH
5433 xmlGenericError(xmlGenericErrorContext,
5434 "HPP: Parsing Comment\n");
5435#endif
5436 htmlParseComment(ctxt);
5437 ctxt->instate = XML_PARSER_MISC;
5438 } else if ((cur == '<') && (next == '?')) {
5439 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005440 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005441 goto done;
5442#ifdef DEBUG_PUSH
5443 xmlGenericError(xmlGenericErrorContext,
5444 "HPP: Parsing PI\n");
5445#endif
5446 htmlParsePI(ctxt);
5447 ctxt->instate = XML_PARSER_MISC;
5448 } else if ((cur == '<') && (next == '!') &&
5449 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5450 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5451 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5452 (UPP(8) == 'E')) {
5453 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005454 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005455 goto done;
5456#ifdef DEBUG_PUSH
5457 xmlGenericError(xmlGenericErrorContext,
5458 "HPP: Parsing internal subset\n");
5459#endif
5460 htmlParseDocTypeDecl(ctxt);
5461 ctxt->instate = XML_PARSER_PROLOG;
5462#ifdef DEBUG_PUSH
5463 xmlGenericError(xmlGenericErrorContext,
5464 "HPP: entering PROLOG\n");
5465#endif
5466 } else if ((cur == '<') && (next == '!') &&
5467 (avail < 9)) {
5468 goto done;
5469 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005470 ctxt->instate = XML_PARSER_CONTENT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005471#ifdef DEBUG_PUSH
5472 xmlGenericError(xmlGenericErrorContext,
5473 "HPP: entering START_TAG\n");
5474#endif
5475 }
5476 break;
5477 case XML_PARSER_PROLOG:
5478 SKIP_BLANKS;
5479 if (in->buf == NULL)
5480 avail = in->length - (in->cur - in->base);
5481 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005482 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5483 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005484 if (avail < 2)
5485 goto done;
5486 cur = in->cur[0];
5487 next = in->cur[1];
5488 if ((cur == '<') && (next == '!') &&
5489 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5490 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005491 (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005492 goto done;
5493#ifdef DEBUG_PUSH
5494 xmlGenericError(xmlGenericErrorContext,
5495 "HPP: Parsing Comment\n");
5496#endif
5497 htmlParseComment(ctxt);
5498 ctxt->instate = XML_PARSER_PROLOG;
5499 } else if ((cur == '<') && (next == '?')) {
5500 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005501 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005502 goto done;
5503#ifdef DEBUG_PUSH
5504 xmlGenericError(xmlGenericErrorContext,
5505 "HPP: Parsing PI\n");
5506#endif
5507 htmlParsePI(ctxt);
5508 ctxt->instate = XML_PARSER_PROLOG;
5509 } else if ((cur == '<') && (next == '!') &&
5510 (avail < 4)) {
5511 goto done;
5512 } else {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005513 ctxt->instate = XML_PARSER_CONTENT;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005514#ifdef DEBUG_PUSH
5515 xmlGenericError(xmlGenericErrorContext,
5516 "HPP: entering START_TAG\n");
5517#endif
5518 }
5519 break;
5520 case XML_PARSER_EPILOG:
5521 if (in->buf == NULL)
5522 avail = in->length - (in->cur - in->base);
5523 else
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005524 avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5525 (in->cur - in->base);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005526 if (avail < 1)
5527 goto done;
5528 cur = in->cur[0];
5529 if (IS_BLANK_CH(cur)) {
5530 htmlParseCharData(ctxt);
5531 goto done;
5532 }
5533 if (avail < 2)
5534 goto done;
5535 next = in->cur[1];
5536 if ((cur == '<') && (next == '!') &&
5537 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5538 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005539 (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005540 goto done;
5541#ifdef DEBUG_PUSH
5542 xmlGenericError(xmlGenericErrorContext,
5543 "HPP: Parsing Comment\n");
5544#endif
5545 htmlParseComment(ctxt);
5546 ctxt->instate = XML_PARSER_EPILOG;
5547 } else if ((cur == '<') && (next == '?')) {
5548 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005549 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005550 goto done;
5551#ifdef DEBUG_PUSH
5552 xmlGenericError(xmlGenericErrorContext,
5553 "HPP: Parsing PI\n");
5554#endif
5555 htmlParsePI(ctxt);
5556 ctxt->instate = XML_PARSER_EPILOG;
5557 } else if ((cur == '<') && (next == '!') &&
5558 (avail < 4)) {
5559 goto done;
5560 } else {
5561 ctxt->errNo = XML_ERR_DOCUMENT_END;
5562 ctxt->wellFormed = 0;
5563 ctxt->instate = XML_PARSER_EOF;
5564#ifdef DEBUG_PUSH
5565 xmlGenericError(xmlGenericErrorContext,
5566 "HPP: entering EOF\n");
5567#endif
5568 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5569 ctxt->sax->endDocument(ctxt->userData);
5570 goto done;
5571 }
5572 break;
5573 case XML_PARSER_START_TAG: {
5574 const xmlChar *name;
5575 int failed;
5576 const htmlElemDesc * info;
5577
5578 /*
5579 * no chars in buffer
5580 */
5581 if (avail < 1)
5582 goto done;
5583 /*
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005584 * not enough chars in buffer
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005585 */
5586 if (avail < 2) {
5587 if (!terminate)
5588 goto done;
5589 else
5590 next = ' ';
5591 } else {
5592 next = in->cur[1];
5593 }
5594 cur = in->cur[0];
5595 if (cur != '<') {
5596 ctxt->instate = XML_PARSER_CONTENT;
5597#ifdef DEBUG_PUSH
5598 xmlGenericError(xmlGenericErrorContext,
5599 "HPP: entering CONTENT\n");
5600#endif
5601 break;
5602 }
5603 if (next == '/') {
5604 ctxt->instate = XML_PARSER_END_TAG;
5605 ctxt->checkIndex = 0;
5606#ifdef DEBUG_PUSH
5607 xmlGenericError(xmlGenericErrorContext,
5608 "HPP: entering END_TAG\n");
5609#endif
5610 break;
5611 }
5612 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005613 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005614 goto done;
5615
5616 /* Capture start position */
5617 if (ctxt->record_info) {
5618 node_info.begin_pos = ctxt->input->consumed +
5619 (CUR_PTR - ctxt->input->base);
5620 node_info.begin_line = ctxt->input->line;
5621 }
5622
5623
5624 failed = htmlParseStartTag(ctxt);
5625 name = ctxt->name;
5626 if ((failed == -1) ||
5627 (name == NULL)) {
5628 if (CUR == '>')
5629 NEXT;
5630 break;
5631 }
5632
5633 /*
5634 * Lookup the info for that element.
5635 */
5636 info = htmlTagLookup(name);
5637 if (info == NULL) {
5638 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5639 "Tag %s invalid\n", name, NULL);
5640 }
5641
5642 /*
5643 * Check for an Empty Element labeled the XML/SGML way
5644 */
5645 if ((CUR == '/') && (NXT(1) == '>')) {
5646 SKIP(2);
5647 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5648 ctxt->sax->endElement(ctxt->userData, name);
5649 htmlnamePop(ctxt);
5650 ctxt->instate = XML_PARSER_CONTENT;
5651#ifdef DEBUG_PUSH
5652 xmlGenericError(xmlGenericErrorContext,
5653 "HPP: entering CONTENT\n");
5654#endif
5655 break;
5656 }
5657
5658 if (CUR == '>') {
5659 NEXT;
5660 } else {
5661 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5662 "Couldn't find end of Start Tag %s\n",
5663 name, NULL);
5664
5665 /*
5666 * end of parsing of this node.
5667 */
5668 if (xmlStrEqual(name, ctxt->name)) {
5669 nodePop(ctxt);
5670 htmlnamePop(ctxt);
5671 }
5672
5673 if (ctxt->record_info)
5674 htmlNodeInfoPush(ctxt, &node_info);
5675
5676 ctxt->instate = XML_PARSER_CONTENT;
5677#ifdef DEBUG_PUSH
5678 xmlGenericError(xmlGenericErrorContext,
5679 "HPP: entering CONTENT\n");
5680#endif
5681 break;
5682 }
5683
5684 /*
5685 * Check for an Empty Element from DTD definition
5686 */
5687 if ((info != NULL) && (info->empty)) {
5688 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5689 ctxt->sax->endElement(ctxt->userData, name);
5690 htmlnamePop(ctxt);
5691 }
5692
5693 if (ctxt->record_info)
5694 htmlNodeInfoPush(ctxt, &node_info);
5695
5696 ctxt->instate = XML_PARSER_CONTENT;
5697#ifdef DEBUG_PUSH
5698 xmlGenericError(xmlGenericErrorContext,
5699 "HPP: entering CONTENT\n");
5700#endif
5701 break;
5702 }
5703 case XML_PARSER_CONTENT: {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005704 xmlChar chr[2] = { 0, 0 };
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005705 long cons;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005706
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005707 /*
5708 * Handle preparsed entities and charRef
5709 */
5710 if (ctxt->token != 0) {
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005711 chr[0] = (xmlChar) ctxt->token;
5712 htmlCheckParagraph(ctxt);
5713 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5714 ctxt->sax->characters(ctxt->userData, chr, 1);
5715 ctxt->token = 0;
5716 ctxt->checkIndex = 0;
5717 }
5718 if ((avail == 1) && (terminate)) {
5719 cur = in->cur[0];
5720 if ((cur != '<') && (cur != '&')) {
5721 if (ctxt->sax != NULL) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005722 chr[0] = cur;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005723 if (IS_BLANK_CH(cur)) {
5724 if (ctxt->keepBlanks) {
5725 if (ctxt->sax->characters != NULL)
5726 ctxt->sax->characters(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005727 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005728 } else {
5729 if (ctxt->sax->ignorableWhitespace != NULL)
5730 ctxt->sax->ignorableWhitespace(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005731 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005732 }
5733 } else {
5734 htmlCheckParagraph(ctxt);
5735 if (ctxt->sax->characters != NULL)
5736 ctxt->sax->characters(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005737 ctxt->userData, chr, 1);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005738 }
5739 }
5740 ctxt->token = 0;
5741 ctxt->checkIndex = 0;
5742 in->cur++;
5743 break;
5744 }
5745 }
5746 if (avail < 2)
5747 goto done;
5748 cur = in->cur[0];
5749 next = in->cur[1];
5750 cons = ctxt->nbChars;
5751 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5752 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5753 /*
5754 * Handle SCRIPT/STYLE separately
5755 */
5756 if (!terminate) {
5757 int idx;
5758 xmlChar val;
5759
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005760 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005761 if (idx < 0)
5762 goto done;
5763 val = in->cur[idx + 2];
5764 if (val == 0) /* bad cut of input */
5765 goto done;
5766 }
5767 htmlParseScript(ctxt);
5768 if ((cur == '<') && (next == '/')) {
5769 ctxt->instate = XML_PARSER_END_TAG;
5770 ctxt->checkIndex = 0;
5771#ifdef DEBUG_PUSH
5772 xmlGenericError(xmlGenericErrorContext,
5773 "HPP: entering END_TAG\n");
5774#endif
5775 break;
5776 }
5777 } else {
5778 /*
5779 * Sometimes DOCTYPE arrives in the middle of the document
5780 */
5781 if ((cur == '<') && (next == '!') &&
5782 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5783 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5784 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5785 (UPP(8) == 'E')) {
5786 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005787 (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005788 goto done;
5789 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5790 "Misplaced DOCTYPE declaration\n",
5791 BAD_CAST "DOCTYPE" , NULL);
5792 htmlParseDocTypeDecl(ctxt);
5793 } else if ((cur == '<') && (next == '!') &&
5794 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5795 if ((!terminate) &&
5796 (htmlParseLookupSequence(
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005797 ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005798 goto done;
5799#ifdef DEBUG_PUSH
5800 xmlGenericError(xmlGenericErrorContext,
5801 "HPP: Parsing Comment\n");
5802#endif
5803 htmlParseComment(ctxt);
5804 ctxt->instate = XML_PARSER_CONTENT;
5805 } else if ((cur == '<') && (next == '?')) {
5806 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005807 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005808 goto done;
5809#ifdef DEBUG_PUSH
5810 xmlGenericError(xmlGenericErrorContext,
5811 "HPP: Parsing PI\n");
5812#endif
5813 htmlParsePI(ctxt);
5814 ctxt->instate = XML_PARSER_CONTENT;
5815 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5816 goto done;
5817 } else if ((cur == '<') && (next == '/')) {
5818 ctxt->instate = XML_PARSER_END_TAG;
5819 ctxt->checkIndex = 0;
5820#ifdef DEBUG_PUSH
5821 xmlGenericError(xmlGenericErrorContext,
5822 "HPP: entering END_TAG\n");
5823#endif
5824 break;
5825 } else if (cur == '<') {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005826 if ((!terminate) && (next == 0))
5827 goto done;
5828 /*
5829 * Only switch to START_TAG if the next character
5830 * starts a valid name. Otherwise, htmlParseStartTag
5831 * might return without consuming all characters
5832 * up to the final '>'.
5833 */
5834 if ((IS_ASCII_LETTER(next)) ||
5835 (next == '_') || (next == ':') || (next == '.')) {
5836 ctxt->instate = XML_PARSER_START_TAG;
5837 ctxt->checkIndex = 0;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005838#ifdef DEBUG_PUSH
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005839 xmlGenericError(xmlGenericErrorContext,
5840 "HPP: entering START_TAG\n");
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005841#endif
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005842 } else {
5843 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
5844 "htmlParseTryOrFinish: "
5845 "invalid element name\n",
5846 NULL, NULL);
5847 htmlCheckParagraph(ctxt);
5848 if ((ctxt->sax != NULL) &&
5849 (ctxt->sax->characters != NULL))
5850 ctxt->sax->characters(ctxt->userData,
5851 in->cur, 1);
5852 NEXT;
5853 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005854 break;
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005855 } else {
5856 /*
5857 * check that the text sequence is complete
5858 * before handing out the data to the parser
5859 * to avoid problems with erroneous end of
5860 * data detection.
5861 */
5862 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005863 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005864 goto done;
5865 ctxt->checkIndex = 0;
5866#ifdef DEBUG_PUSH
5867 xmlGenericError(xmlGenericErrorContext,
5868 "HPP: Parsing char data\n");
5869#endif
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005870 while ((cur != '<') && (cur != 0)) {
5871 if (cur == '&') {
5872 htmlParseReference(ctxt);
5873 } else {
5874 htmlParseCharData(ctxt);
5875 }
5876 cur = in->cur[0];
5877 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005878 }
5879 }
5880 if (cons == ctxt->nbChars) {
5881 if (ctxt->node != NULL) {
5882 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5883 "detected an error in element content\n",
5884 NULL, NULL);
5885 }
5886 NEXT;
5887 break;
5888 }
5889
5890 break;
5891 }
5892 case XML_PARSER_END_TAG:
5893 if (avail < 2)
5894 goto done;
5895 if ((!terminate) &&
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005896 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes7fbecab2019-01-10 16:42:03 -08005897 goto done;
5898 htmlParseEndTag(ctxt);
5899 if (ctxt->nameNr == 0) {
5900 ctxt->instate = XML_PARSER_EPILOG;
5901 } else {
5902 ctxt->instate = XML_PARSER_CONTENT;
5903 }
5904 ctxt->checkIndex = 0;
5905#ifdef DEBUG_PUSH
5906 xmlGenericError(xmlGenericErrorContext,
5907 "HPP: entering CONTENT\n");
5908#endif
5909 break;
5910 case XML_PARSER_CDATA_SECTION:
5911 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5912 "HPP: internal error, state == CDATA\n",
5913 NULL, NULL);
5914 ctxt->instate = XML_PARSER_CONTENT;
5915 ctxt->checkIndex = 0;
5916#ifdef DEBUG_PUSH
5917 xmlGenericError(xmlGenericErrorContext,
5918 "HPP: entering CONTENT\n");
5919#endif
5920 break;
5921 case XML_PARSER_DTD:
5922 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5923 "HPP: internal error, state == DTD\n",
5924 NULL, NULL);
5925 ctxt->instate = XML_PARSER_CONTENT;
5926 ctxt->checkIndex = 0;
5927#ifdef DEBUG_PUSH
5928 xmlGenericError(xmlGenericErrorContext,
5929 "HPP: entering CONTENT\n");
5930#endif
5931 break;
5932 case XML_PARSER_COMMENT:
5933 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5934 "HPP: internal error, state == COMMENT\n",
5935 NULL, NULL);
5936 ctxt->instate = XML_PARSER_CONTENT;
5937 ctxt->checkIndex = 0;
5938#ifdef DEBUG_PUSH
5939 xmlGenericError(xmlGenericErrorContext,
5940 "HPP: entering CONTENT\n");
5941#endif
5942 break;
5943 case XML_PARSER_PI:
5944 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5945 "HPP: internal error, state == PI\n",
5946 NULL, NULL);
5947 ctxt->instate = XML_PARSER_CONTENT;
5948 ctxt->checkIndex = 0;
5949#ifdef DEBUG_PUSH
5950 xmlGenericError(xmlGenericErrorContext,
5951 "HPP: entering CONTENT\n");
5952#endif
5953 break;
5954 case XML_PARSER_ENTITY_DECL:
5955 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5956 "HPP: internal error, state == ENTITY_DECL\n",
5957 NULL, NULL);
5958 ctxt->instate = XML_PARSER_CONTENT;
5959 ctxt->checkIndex = 0;
5960#ifdef DEBUG_PUSH
5961 xmlGenericError(xmlGenericErrorContext,
5962 "HPP: entering CONTENT\n");
5963#endif
5964 break;
5965 case XML_PARSER_ENTITY_VALUE:
5966 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5967 "HPP: internal error, state == ENTITY_VALUE\n",
5968 NULL, NULL);
5969 ctxt->instate = XML_PARSER_CONTENT;
5970 ctxt->checkIndex = 0;
5971#ifdef DEBUG_PUSH
5972 xmlGenericError(xmlGenericErrorContext,
5973 "HPP: entering DTD\n");
5974#endif
5975 break;
5976 case XML_PARSER_ATTRIBUTE_VALUE:
5977 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5978 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5979 NULL, NULL);
5980 ctxt->instate = XML_PARSER_START_TAG;
5981 ctxt->checkIndex = 0;
5982#ifdef DEBUG_PUSH
5983 xmlGenericError(xmlGenericErrorContext,
5984 "HPP: entering START_TAG\n");
5985#endif
5986 break;
5987 case XML_PARSER_SYSTEM_LITERAL:
5988 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5989 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5990 NULL, NULL);
5991 ctxt->instate = XML_PARSER_CONTENT;
5992 ctxt->checkIndex = 0;
5993#ifdef DEBUG_PUSH
5994 xmlGenericError(xmlGenericErrorContext,
5995 "HPP: entering CONTENT\n");
5996#endif
5997 break;
5998 case XML_PARSER_IGNORE:
5999 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6000 "HPP: internal error, state == XML_PARSER_IGNORE\n",
6001 NULL, NULL);
6002 ctxt->instate = XML_PARSER_CONTENT;
6003 ctxt->checkIndex = 0;
6004#ifdef DEBUG_PUSH
6005 xmlGenericError(xmlGenericErrorContext,
6006 "HPP: entering CONTENT\n");
6007#endif
6008 break;
6009 case XML_PARSER_PUBLIC_LITERAL:
6010 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6011 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6012 NULL, NULL);
6013 ctxt->instate = XML_PARSER_CONTENT;
6014 ctxt->checkIndex = 0;
6015#ifdef DEBUG_PUSH
6016 xmlGenericError(xmlGenericErrorContext,
6017 "HPP: entering CONTENT\n");
6018#endif
6019 break;
6020
6021 }
6022 }
6023done:
6024 if ((avail == 0) && (terminate)) {
6025 htmlAutoCloseOnEnd(ctxt);
6026 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6027 /*
6028 * SAX: end of the document processing.
6029 */
6030 ctxt->instate = XML_PARSER_EOF;
6031 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6032 ctxt->sax->endDocument(ctxt->userData);
6033 }
6034 }
6035 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6036 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6037 (ctxt->instate == XML_PARSER_EPILOG))) {
6038 xmlDtdPtr dtd;
6039 dtd = xmlGetIntSubset(ctxt->myDoc);
6040 if (dtd == NULL)
6041 ctxt->myDoc->intSubset =
6042 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6043 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6044 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6045 }
6046#ifdef DEBUG_PUSH
6047 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6048#endif
6049 return(ret);
6050}
6051
6052/**
6053 * htmlParseChunk:
6054 * @ctxt: an HTML parser context
6055 * @chunk: an char array
6056 * @size: the size in byte of the chunk
6057 * @terminate: last chunk indicator
6058 *
6059 * Parse a Chunk of memory
6060 *
6061 * Returns zero if no error, the xmlParserErrors otherwise.
6062 */
6063int
6064htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6065 int terminate) {
6066 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6067 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6068 "htmlParseChunk: context error\n", NULL, NULL);
6069 return(XML_ERR_INTERNAL_ERROR);
6070 }
6071 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6072 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6073 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6074 size_t cur = ctxt->input->cur - ctxt->input->base;
6075 int res;
6076
6077 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006078 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006079 if (res < 0) {
6080 ctxt->errNo = XML_PARSER_EOF;
6081 ctxt->disableSAX = 1;
6082 return (XML_PARSER_EOF);
6083 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006084#ifdef DEBUG_PUSH
6085 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6086#endif
6087
6088#if 0
6089 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6090 htmlParseTryOrFinish(ctxt, terminate);
6091#endif
6092 } else if (ctxt->instate != XML_PARSER_EOF) {
6093 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6094 xmlParserInputBufferPtr in = ctxt->input->buf;
6095 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6096 (in->raw != NULL)) {
6097 int nbchars;
6098 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6099 size_t current = ctxt->input->cur - ctxt->input->base;
6100
6101 nbchars = xmlCharEncInput(in, terminate);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006102 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006103 if (nbchars < 0) {
6104 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6105 "encoder error\n", NULL, NULL);
6106 return(XML_ERR_INVALID_ENCODING);
6107 }
Elliott Hughes7fbecab2019-01-10 16:42:03 -08006108 }
6109 }
6110 }
6111 htmlParseTryOrFinish(ctxt, terminate);
6112 if (terminate) {
6113 if ((ctxt->instate != XML_PARSER_EOF) &&
6114 (ctxt->instate != XML_PARSER_EPILOG) &&
6115 (ctxt->instate != XML_PARSER_MISC)) {
6116 ctxt->errNo = XML_ERR_DOCUMENT_END;
6117 ctxt->wellFormed = 0;
6118 }
6119 if (ctxt->instate != XML_PARSER_EOF) {
6120 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6121 ctxt->sax->endDocument(ctxt->userData);
6122 }
6123 ctxt->instate = XML_PARSER_EOF;
6124 }
6125 return((xmlParserErrors) ctxt->errNo);
6126}
6127
6128/************************************************************************
6129 * *
6130 * User entry points *
6131 * *
6132 ************************************************************************/
6133
6134/**
6135 * htmlCreatePushParserCtxt:
6136 * @sax: a SAX handler
6137 * @user_data: The user data returned on SAX callbacks
6138 * @chunk: a pointer to an array of chars
6139 * @size: number of chars in the array
6140 * @filename: an optional file name or URI
6141 * @enc: an optional encoding
6142 *
6143 * Create a parser context for using the HTML parser in push mode
6144 * The value of @filename is used for fetching external entities
6145 * and error/warning reports.
6146 *
6147 * Returns the new parser context or NULL
6148 */
6149htmlParserCtxtPtr
6150htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6151 const char *chunk, int size, const char *filename,
6152 xmlCharEncoding enc) {
6153 htmlParserCtxtPtr ctxt;
6154 htmlParserInputPtr inputStream;
6155 xmlParserInputBufferPtr buf;
6156
6157 xmlInitParser();
6158
6159 buf = xmlAllocParserInputBuffer(enc);
6160 if (buf == NULL) return(NULL);
6161
6162 ctxt = htmlNewParserCtxt();
6163 if (ctxt == NULL) {
6164 xmlFreeParserInputBuffer(buf);
6165 return(NULL);
6166 }
6167 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6168 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6169 if (sax != NULL) {
6170 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6171 xmlFree(ctxt->sax);
6172 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6173 if (ctxt->sax == NULL) {
6174 xmlFree(buf);
6175 xmlFree(ctxt);
6176 return(NULL);
6177 }
6178 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6179 if (user_data != NULL)
6180 ctxt->userData = user_data;
6181 }
6182 if (filename == NULL) {
6183 ctxt->directory = NULL;
6184 } else {
6185 ctxt->directory = xmlParserGetDirectory(filename);
6186 }
6187
6188 inputStream = htmlNewInputStream(ctxt);
6189 if (inputStream == NULL) {
6190 xmlFreeParserCtxt(ctxt);
6191 xmlFree(buf);
6192 return(NULL);
6193 }
6194
6195 if (filename == NULL)
6196 inputStream->filename = NULL;
6197 else
6198 inputStream->filename = (char *)
6199 xmlCanonicPath((const xmlChar *) filename);
6200 inputStream->buf = buf;
6201 xmlBufResetInput(buf->buffer, inputStream);
6202
6203 inputPush(ctxt, inputStream);
6204
6205 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6206 (ctxt->input->buf != NULL)) {
6207 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6208 size_t cur = ctxt->input->cur - ctxt->input->base;
6209
6210 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6211
6212 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6213#ifdef DEBUG_PUSH
6214 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6215#endif
6216 }
6217 ctxt->progressive = 1;
6218
6219 return(ctxt);
6220}
6221#endif /* LIBXML_PUSH_ENABLED */
6222
6223/**
6224 * htmlSAXParseDoc:
6225 * @cur: a pointer to an array of xmlChar
6226 * @encoding: a free form C string describing the HTML document encoding, or NULL
6227 * @sax: the SAX handler block
6228 * @userData: if using SAX, this pointer will be provided on callbacks.
6229 *
6230 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6231 * to handle parse events. If sax is NULL, fallback to the default DOM
6232 * behavior and return a tree.
6233 *
6234 * Returns the resulting document tree unless SAX is NULL or the document is
6235 * not well formed.
6236 */
6237
6238htmlDocPtr
6239htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6240 htmlSAXHandlerPtr sax, void *userData) {
6241 htmlDocPtr ret;
6242 htmlParserCtxtPtr ctxt;
6243
6244 xmlInitParser();
6245
6246 if (cur == NULL) return(NULL);
6247
6248
6249 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6250 if (ctxt == NULL) return(NULL);
6251 if (sax != NULL) {
6252 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6253 ctxt->sax = sax;
6254 ctxt->userData = userData;
6255 }
6256
6257 htmlParseDocument(ctxt);
6258 ret = ctxt->myDoc;
6259 if (sax != NULL) {
6260 ctxt->sax = NULL;
6261 ctxt->userData = NULL;
6262 }
6263 htmlFreeParserCtxt(ctxt);
6264
6265 return(ret);
6266}
6267
6268/**
6269 * htmlParseDoc:
6270 * @cur: a pointer to an array of xmlChar
6271 * @encoding: a free form C string describing the HTML document encoding, or NULL
6272 *
6273 * parse an HTML in-memory document and build a tree.
6274 *
6275 * Returns the resulting document tree
6276 */
6277
6278htmlDocPtr
6279htmlParseDoc(const xmlChar *cur, const char *encoding) {
6280 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6281}
6282
6283
6284/**
6285 * htmlCreateFileParserCtxt:
6286 * @filename: the filename
6287 * @encoding: a free form C string describing the HTML document encoding, or NULL
6288 *
6289 * Create a parser context for a file content.
6290 * Automatic support for ZLIB/Compress compressed document is provided
6291 * by default if found at compile-time.
6292 *
6293 * Returns the new parser context or NULL
6294 */
6295htmlParserCtxtPtr
6296htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6297{
6298 htmlParserCtxtPtr ctxt;
6299 htmlParserInputPtr inputStream;
6300 char *canonicFilename;
6301 /* htmlCharEncoding enc; */
6302 xmlChar *content, *content_line = (xmlChar *) "charset=";
6303
6304 if (filename == NULL)
6305 return(NULL);
6306
6307 ctxt = htmlNewParserCtxt();
6308 if (ctxt == NULL) {
6309 return(NULL);
6310 }
6311 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6312 if (canonicFilename == NULL) {
6313#ifdef LIBXML_SAX1_ENABLED
6314 if (xmlDefaultSAXHandler.error != NULL) {
6315 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6316 }
6317#endif
6318 xmlFreeParserCtxt(ctxt);
6319 return(NULL);
6320 }
6321
6322 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6323 xmlFree(canonicFilename);
6324 if (inputStream == NULL) {
6325 xmlFreeParserCtxt(ctxt);
6326 return(NULL);
6327 }
6328
6329 inputPush(ctxt, inputStream);
6330
6331 /* set encoding */
6332 if (encoding) {
6333 size_t l = strlen(encoding);
6334
6335 if (l < 1000) {
6336 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6337 if (content) {
6338 strcpy ((char *)content, (char *)content_line);
6339 strcat ((char *)content, (char *)encoding);
6340 htmlCheckEncoding (ctxt, content);
6341 xmlFree (content);
6342 }
6343 }
6344 }
6345
6346 return(ctxt);
6347}
6348
6349/**
6350 * htmlSAXParseFile:
6351 * @filename: the filename
6352 * @encoding: a free form C string describing the HTML document encoding, or NULL
6353 * @sax: the SAX handler block
6354 * @userData: if using SAX, this pointer will be provided on callbacks.
6355 *
6356 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6357 * compressed document is provided by default if found at compile-time.
6358 * It use the given SAX function block to handle the parsing callback.
6359 * If sax is NULL, fallback to the default DOM tree building routines.
6360 *
6361 * Returns the resulting document tree unless SAX is NULL or the document is
6362 * not well formed.
6363 */
6364
6365htmlDocPtr
6366htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6367 void *userData) {
6368 htmlDocPtr ret;
6369 htmlParserCtxtPtr ctxt;
6370 htmlSAXHandlerPtr oldsax = NULL;
6371
6372 xmlInitParser();
6373
6374 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6375 if (ctxt == NULL) return(NULL);
6376 if (sax != NULL) {
6377 oldsax = ctxt->sax;
6378 ctxt->sax = sax;
6379 ctxt->userData = userData;
6380 }
6381
6382 htmlParseDocument(ctxt);
6383
6384 ret = ctxt->myDoc;
6385 if (sax != NULL) {
6386 ctxt->sax = oldsax;
6387 ctxt->userData = NULL;
6388 }
6389 htmlFreeParserCtxt(ctxt);
6390
6391 return(ret);
6392}
6393
6394/**
6395 * htmlParseFile:
6396 * @filename: the filename
6397 * @encoding: a free form C string describing the HTML document encoding, or NULL
6398 *
6399 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6400 * compressed document is provided by default if found at compile-time.
6401 *
6402 * Returns the resulting document tree
6403 */
6404
6405htmlDocPtr
6406htmlParseFile(const char *filename, const char *encoding) {
6407 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6408}
6409
6410/**
6411 * htmlHandleOmittedElem:
6412 * @val: int 0 or 1
6413 *
6414 * Set and return the previous value for handling HTML omitted tags.
6415 *
6416 * Returns the last value for 0 for no handling, 1 for auto insertion.
6417 */
6418
6419int
6420htmlHandleOmittedElem(int val) {
6421 int old = htmlOmittedDefaultValue;
6422
6423 htmlOmittedDefaultValue = val;
6424 return(old);
6425}
6426
6427/**
6428 * htmlElementAllowedHere:
6429 * @parent: HTML parent element
6430 * @elt: HTML element
6431 *
6432 * Checks whether an HTML element may be a direct child of a parent element.
6433 * Note - doesn't check for deprecated elements
6434 *
6435 * Returns 1 if allowed; 0 otherwise.
6436 */
6437int
6438htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6439 const char** p ;
6440
6441 if ( ! elt || ! parent || ! parent->subelts )
6442 return 0 ;
6443
6444 for ( p = parent->subelts; *p; ++p )
6445 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6446 return 1 ;
6447
6448 return 0 ;
6449}
6450/**
6451 * htmlElementStatusHere:
6452 * @parent: HTML parent element
6453 * @elt: HTML element
6454 *
6455 * Checks whether an HTML element may be a direct child of a parent element.
6456 * and if so whether it is valid or deprecated.
6457 *
6458 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6459 */
6460htmlStatus
6461htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6462 if ( ! parent || ! elt )
6463 return HTML_INVALID ;
6464 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6465 return HTML_INVALID ;
6466
6467 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6468}
6469/**
6470 * htmlAttrAllowed:
6471 * @elt: HTML element
6472 * @attr: HTML attribute
6473 * @legacy: whether to allow deprecated attributes
6474 *
6475 * Checks whether an attribute is valid for an element
6476 * Has full knowledge of Required and Deprecated attributes
6477 *
6478 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6479 */
6480htmlStatus
6481htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6482 const char** p ;
6483
6484 if ( !elt || ! attr )
6485 return HTML_INVALID ;
6486
6487 if ( elt->attrs_req )
6488 for ( p = elt->attrs_req; *p; ++p)
6489 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6490 return HTML_REQUIRED ;
6491
6492 if ( elt->attrs_opt )
6493 for ( p = elt->attrs_opt; *p; ++p)
6494 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6495 return HTML_VALID ;
6496
6497 if ( legacy && elt->attrs_depr )
6498 for ( p = elt->attrs_depr; *p; ++p)
6499 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6500 return HTML_DEPRECATED ;
6501
6502 return HTML_INVALID ;
6503}
6504/**
6505 * htmlNodeStatus:
6506 * @node: an htmlNodePtr in a tree
6507 * @legacy: whether to allow deprecated elements (YES is faster here
6508 * for Element nodes)
6509 *
6510 * Checks whether the tree node is valid. Experimental (the author
6511 * only uses the HTML enhancements in a SAX parser)
6512 *
6513 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6514 * legacy allowed) or htmlElementStatusHere (otherwise).
6515 * for Attribute nodes, a return from htmlAttrAllowed
6516 * for other nodes, HTML_NA (no checks performed)
6517 */
6518htmlStatus
6519htmlNodeStatus(const htmlNodePtr node, int legacy) {
6520 if ( ! node )
6521 return HTML_INVALID ;
6522
6523 switch ( node->type ) {
6524 case XML_ELEMENT_NODE:
6525 return legacy
6526 ? ( htmlElementAllowedHere (
6527 htmlTagLookup(node->parent->name) , node->name
6528 ) ? HTML_VALID : HTML_INVALID )
6529 : htmlElementStatusHere(
6530 htmlTagLookup(node->parent->name) ,
6531 htmlTagLookup(node->name) )
6532 ;
6533 case XML_ATTRIBUTE_NODE:
6534 return htmlAttrAllowed(
6535 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6536 default: return HTML_NA ;
6537 }
6538}
6539/************************************************************************
6540 * *
6541 * New set (2.6.0) of simpler and more flexible APIs *
6542 * *
6543 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used; a NULL @str is ignored.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * a single statement (safe inside an unbraced if/else); the caller
 * supplies the terminating semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6555
6556/**
6557 * htmlCtxtReset:
6558 * @ctxt: an HTML parser context
6559 *
6560 * Reset a parser context
6561 */
6562void
6563htmlCtxtReset(htmlParserCtxtPtr ctxt)
6564{
6565 xmlParserInputPtr input;
6566 xmlDictPtr dict;
6567
6568 if (ctxt == NULL)
6569 return;
6570
6571 xmlInitParser();
6572 dict = ctxt->dict;
6573
6574 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6575 xmlFreeInputStream(input);
6576 }
6577 ctxt->inputNr = 0;
6578 ctxt->input = NULL;
6579
6580 ctxt->spaceNr = 0;
6581 if (ctxt->spaceTab != NULL) {
6582 ctxt->spaceTab[0] = -1;
6583 ctxt->space = &ctxt->spaceTab[0];
6584 } else {
6585 ctxt->space = NULL;
6586 }
6587
6588
6589 ctxt->nodeNr = 0;
6590 ctxt->node = NULL;
6591
6592 ctxt->nameNr = 0;
6593 ctxt->name = NULL;
6594
6595 DICT_FREE(ctxt->version);
6596 ctxt->version = NULL;
6597 DICT_FREE(ctxt->encoding);
6598 ctxt->encoding = NULL;
6599 DICT_FREE(ctxt->directory);
6600 ctxt->directory = NULL;
6601 DICT_FREE(ctxt->extSubURI);
6602 ctxt->extSubURI = NULL;
6603 DICT_FREE(ctxt->extSubSystem);
6604 ctxt->extSubSystem = NULL;
6605 if (ctxt->myDoc != NULL)
6606 xmlFreeDoc(ctxt->myDoc);
6607 ctxt->myDoc = NULL;
6608
6609 ctxt->standalone = -1;
6610 ctxt->hasExternalSubset = 0;
6611 ctxt->hasPErefs = 0;
6612 ctxt->html = 1;
6613 ctxt->external = 0;
6614 ctxt->instate = XML_PARSER_START;
6615 ctxt->token = 0;
6616
6617 ctxt->wellFormed = 1;
6618 ctxt->nsWellFormed = 1;
6619 ctxt->disableSAX = 0;
6620 ctxt->valid = 1;
6621 ctxt->vctxt.userData = ctxt;
6622 ctxt->vctxt.error = xmlParserValidityError;
6623 ctxt->vctxt.warning = xmlParserValidityWarning;
6624 ctxt->record_info = 0;
6625 ctxt->nbChars = 0;
6626 ctxt->checkIndex = 0;
6627 ctxt->inSubset = 0;
6628 ctxt->errNo = XML_ERR_OK;
6629 ctxt->depth = 0;
6630 ctxt->charset = XML_CHAR_ENCODING_NONE;
6631 ctxt->catalogs = NULL;
6632 xmlInitNodeInfoSeq(&ctxt->node_seq);
6633
6634 if (ctxt->attsDefault != NULL) {
6635 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6636 ctxt->attsDefault = NULL;
6637 }
6638 if (ctxt->attsSpecial != NULL) {
6639 xmlHashFree(ctxt->attsSpecial, NULL);
6640 ctxt->attsSpecial = NULL;
6641 }
6642}
6643
6644/**
6645 * htmlCtxtUseOptions:
6646 * @ctxt: an HTML parser context
6647 * @options: a combination of htmlParserOption(s)
6648 *
6649 * Applies the options to the parser context
6650 *
6651 * Returns 0 in case of success, the set of unknown or unimplemented options
6652 * in case of error.
6653 */
6654int
6655htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6656{
6657 if (ctxt == NULL)
6658 return(-1);
6659
6660 if (options & HTML_PARSE_NOWARNING) {
6661 ctxt->sax->warning = NULL;
6662 ctxt->vctxt.warning = NULL;
6663 options -= XML_PARSE_NOWARNING;
6664 ctxt->options |= XML_PARSE_NOWARNING;
6665 }
6666 if (options & HTML_PARSE_NOERROR) {
6667 ctxt->sax->error = NULL;
6668 ctxt->vctxt.error = NULL;
6669 ctxt->sax->fatalError = NULL;
6670 options -= XML_PARSE_NOERROR;
6671 ctxt->options |= XML_PARSE_NOERROR;
6672 }
6673 if (options & HTML_PARSE_PEDANTIC) {
6674 ctxt->pedantic = 1;
6675 options -= XML_PARSE_PEDANTIC;
6676 ctxt->options |= XML_PARSE_PEDANTIC;
6677 } else
6678 ctxt->pedantic = 0;
6679 if (options & XML_PARSE_NOBLANKS) {
6680 ctxt->keepBlanks = 0;
6681 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6682 options -= XML_PARSE_NOBLANKS;
6683 ctxt->options |= XML_PARSE_NOBLANKS;
6684 } else
6685 ctxt->keepBlanks = 1;
6686 if (options & HTML_PARSE_RECOVER) {
6687 ctxt->recovery = 1;
6688 options -= HTML_PARSE_RECOVER;
6689 } else
6690 ctxt->recovery = 0;
6691 if (options & HTML_PARSE_COMPACT) {
6692 ctxt->options |= HTML_PARSE_COMPACT;
6693 options -= HTML_PARSE_COMPACT;
6694 }
6695 if (options & XML_PARSE_HUGE) {
6696 ctxt->options |= XML_PARSE_HUGE;
6697 options -= XML_PARSE_HUGE;
6698 }
6699 if (options & HTML_PARSE_NODEFDTD) {
6700 ctxt->options |= HTML_PARSE_NODEFDTD;
6701 options -= HTML_PARSE_NODEFDTD;
6702 }
6703 if (options & HTML_PARSE_IGNORE_ENC) {
6704 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6705 options -= HTML_PARSE_IGNORE_ENC;
6706 }
6707 if (options & HTML_PARSE_NOIMPLIED) {
6708 ctxt->options |= HTML_PARSE_NOIMPLIED;
6709 options -= HTML_PARSE_NOIMPLIED;
6710 }
6711 ctxt->dictNames = 0;
6712 return (options);
6713}
6714
6715/**
6716 * htmlDoRead:
6717 * @ctxt: an HTML parser context
6718 * @URL: the base URL to use for the document
6719 * @encoding: the document encoding, or NULL
6720 * @options: a combination of htmlParserOption(s)
6721 * @reuse: keep the context for reuse
6722 *
6723 * Common front-end for the htmlRead functions
6724 *
6725 * Returns the resulting document tree or NULL
6726 */
6727static htmlDocPtr
6728htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6729 int options, int reuse)
6730{
6731 htmlDocPtr ret;
6732
6733 htmlCtxtUseOptions(ctxt, options);
6734 ctxt->html = 1;
6735 if (encoding != NULL) {
6736 xmlCharEncodingHandlerPtr hdlr;
6737
6738 hdlr = xmlFindCharEncodingHandler(encoding);
6739 if (hdlr != NULL) {
6740 xmlSwitchToEncoding(ctxt, hdlr);
6741 if (ctxt->input->encoding != NULL)
6742 xmlFree((xmlChar *) ctxt->input->encoding);
6743 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6744 }
6745 }
6746 if ((URL != NULL) && (ctxt->input != NULL) &&
6747 (ctxt->input->filename == NULL))
6748 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6749 htmlParseDocument(ctxt);
6750 ret = ctxt->myDoc;
6751 ctxt->myDoc = NULL;
6752 if (!reuse) {
6753 if ((ctxt->dictNames) &&
6754 (ret != NULL) &&
6755 (ret->dict == ctxt->dict))
6756 ctxt->dict = NULL;
6757 xmlFreeParserCtxt(ctxt);
6758 }
6759 return (ret);
6760}
6761
6762/**
6763 * htmlReadDoc:
6764 * @cur: a pointer to a zero terminated string
6765 * @URL: the base URL to use for the document
6766 * @encoding: the document encoding, or NULL
6767 * @options: a combination of htmlParserOption(s)
6768 *
6769 * parse an XML in-memory document and build a tree.
6770 *
6771 * Returns the resulting document tree
6772 */
6773htmlDocPtr
6774htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6775{
6776 htmlParserCtxtPtr ctxt;
6777
6778 if (cur == NULL)
6779 return (NULL);
6780
6781 xmlInitParser();
6782 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6783 if (ctxt == NULL)
6784 return (NULL);
6785 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6786}
6787
6788/**
6789 * htmlReadFile:
6790 * @filename: a file or URL
6791 * @encoding: the document encoding, or NULL
6792 * @options: a combination of htmlParserOption(s)
6793 *
6794 * parse an XML file from the filesystem or the network.
6795 *
6796 * Returns the resulting document tree
6797 */
6798htmlDocPtr
6799htmlReadFile(const char *filename, const char *encoding, int options)
6800{
6801 htmlParserCtxtPtr ctxt;
6802
6803 xmlInitParser();
6804 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6805 if (ctxt == NULL)
6806 return (NULL);
6807 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6808}
6809
6810/**
6811 * htmlReadMemory:
6812 * @buffer: a pointer to a char array
6813 * @size: the size of the array
6814 * @URL: the base URL to use for the document
6815 * @encoding: the document encoding, or NULL
6816 * @options: a combination of htmlParserOption(s)
6817 *
6818 * parse an XML in-memory document and build a tree.
6819 *
6820 * Returns the resulting document tree
6821 */
6822htmlDocPtr
6823htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6824{
6825 htmlParserCtxtPtr ctxt;
6826
6827 xmlInitParser();
6828 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6829 if (ctxt == NULL)
6830 return (NULL);
6831 htmlDefaultSAXHandlerInit();
6832 if (ctxt->sax != NULL)
6833 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6834 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6835}
6836
6837/**
6838 * htmlReadFd:
6839 * @fd: an open file descriptor
6840 * @URL: the base URL to use for the document
6841 * @encoding: the document encoding, or NULL
6842 * @options: a combination of htmlParserOption(s)
6843 *
6844 * parse an XML from a file descriptor and build a tree.
6845 *
6846 * Returns the resulting document tree
6847 */
6848htmlDocPtr
6849htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6850{
6851 htmlParserCtxtPtr ctxt;
6852 xmlParserInputBufferPtr input;
6853 xmlParserInputPtr stream;
6854
6855 if (fd < 0)
6856 return (NULL);
6857 xmlInitParser();
6858
6859 xmlInitParser();
6860 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6861 if (input == NULL)
6862 return (NULL);
6863 ctxt = xmlNewParserCtxt();
6864 if (ctxt == NULL) {
6865 xmlFreeParserInputBuffer(input);
6866 return (NULL);
6867 }
6868 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6869 if (stream == NULL) {
6870 xmlFreeParserInputBuffer(input);
6871 xmlFreeParserCtxt(ctxt);
6872 return (NULL);
6873 }
6874 inputPush(ctxt, stream);
6875 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6876}
6877
6878/**
6879 * htmlReadIO:
6880 * @ioread: an I/O read function
6881 * @ioclose: an I/O close function
6882 * @ioctx: an I/O handler
6883 * @URL: the base URL to use for the document
6884 * @encoding: the document encoding, or NULL
6885 * @options: a combination of htmlParserOption(s)
6886 *
6887 * parse an HTML document from I/O functions and source and build a tree.
6888 *
6889 * Returns the resulting document tree
6890 */
6891htmlDocPtr
6892htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6893 void *ioctx, const char *URL, const char *encoding, int options)
6894{
6895 htmlParserCtxtPtr ctxt;
6896 xmlParserInputBufferPtr input;
6897 xmlParserInputPtr stream;
6898
6899 if (ioread == NULL)
6900 return (NULL);
6901 xmlInitParser();
6902
6903 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6904 XML_CHAR_ENCODING_NONE);
6905 if (input == NULL) {
6906 if (ioclose != NULL)
6907 ioclose(ioctx);
6908 return (NULL);
6909 }
6910 ctxt = htmlNewParserCtxt();
6911 if (ctxt == NULL) {
6912 xmlFreeParserInputBuffer(input);
6913 return (NULL);
6914 }
6915 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6916 if (stream == NULL) {
6917 xmlFreeParserInputBuffer(input);
6918 xmlFreeParserCtxt(ctxt);
6919 return (NULL);
6920 }
6921 inputPush(ctxt, stream);
6922 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6923}
6924
6925/**
6926 * htmlCtxtReadDoc:
6927 * @ctxt: an HTML parser context
6928 * @cur: a pointer to a zero terminated string
6929 * @URL: the base URL to use for the document
6930 * @encoding: the document encoding, or NULL
6931 * @options: a combination of htmlParserOption(s)
6932 *
6933 * parse an XML in-memory document and build a tree.
6934 * This reuses the existing @ctxt parser context
6935 *
6936 * Returns the resulting document tree
6937 */
6938htmlDocPtr
6939htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6940 const char *URL, const char *encoding, int options)
6941{
6942 xmlParserInputPtr stream;
6943
6944 if (cur == NULL)
6945 return (NULL);
6946 if (ctxt == NULL)
6947 return (NULL);
6948 xmlInitParser();
6949
6950 htmlCtxtReset(ctxt);
6951
6952 stream = xmlNewStringInputStream(ctxt, cur);
6953 if (stream == NULL) {
6954 return (NULL);
6955 }
6956 inputPush(ctxt, stream);
6957 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6958}
6959
6960/**
6961 * htmlCtxtReadFile:
6962 * @ctxt: an HTML parser context
6963 * @filename: a file or URL
6964 * @encoding: the document encoding, or NULL
6965 * @options: a combination of htmlParserOption(s)
6966 *
6967 * parse an XML file from the filesystem or the network.
6968 * This reuses the existing @ctxt parser context
6969 *
6970 * Returns the resulting document tree
6971 */
6972htmlDocPtr
6973htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6974 const char *encoding, int options)
6975{
6976 xmlParserInputPtr stream;
6977
6978 if (filename == NULL)
6979 return (NULL);
6980 if (ctxt == NULL)
6981 return (NULL);
6982 xmlInitParser();
6983
6984 htmlCtxtReset(ctxt);
6985
6986 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6987 if (stream == NULL) {
6988 return (NULL);
6989 }
6990 inputPush(ctxt, stream);
6991 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6992}
6993
6994/**
6995 * htmlCtxtReadMemory:
6996 * @ctxt: an HTML parser context
6997 * @buffer: a pointer to a char array
6998 * @size: the size of the array
6999 * @URL: the base URL to use for the document
7000 * @encoding: the document encoding, or NULL
7001 * @options: a combination of htmlParserOption(s)
7002 *
7003 * parse an XML in-memory document and build a tree.
7004 * This reuses the existing @ctxt parser context
7005 *
7006 * Returns the resulting document tree
7007 */
7008htmlDocPtr
7009htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7010 const char *URL, const char *encoding, int options)
7011{
7012 xmlParserInputBufferPtr input;
7013 xmlParserInputPtr stream;
7014
7015 if (ctxt == NULL)
7016 return (NULL);
7017 if (buffer == NULL)
7018 return (NULL);
7019 xmlInitParser();
7020
7021 htmlCtxtReset(ctxt);
7022
7023 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7024 if (input == NULL) {
7025 return(NULL);
7026 }
7027
7028 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7029 if (stream == NULL) {
7030 xmlFreeParserInputBuffer(input);
7031 return(NULL);
7032 }
7033
7034 inputPush(ctxt, stream);
7035 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7036}
7037
7038/**
7039 * htmlCtxtReadFd:
7040 * @ctxt: an HTML parser context
7041 * @fd: an open file descriptor
7042 * @URL: the base URL to use for the document
7043 * @encoding: the document encoding, or NULL
7044 * @options: a combination of htmlParserOption(s)
7045 *
7046 * parse an XML from a file descriptor and build a tree.
7047 * This reuses the existing @ctxt parser context
7048 *
7049 * Returns the resulting document tree
7050 */
7051htmlDocPtr
7052htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7053 const char *URL, const char *encoding, int options)
7054{
7055 xmlParserInputBufferPtr input;
7056 xmlParserInputPtr stream;
7057
7058 if (fd < 0)
7059 return (NULL);
7060 if (ctxt == NULL)
7061 return (NULL);
7062 xmlInitParser();
7063
7064 htmlCtxtReset(ctxt);
7065
7066
7067 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7068 if (input == NULL)
7069 return (NULL);
7070 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7071 if (stream == NULL) {
7072 xmlFreeParserInputBuffer(input);
7073 return (NULL);
7074 }
7075 inputPush(ctxt, stream);
7076 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7077}
7078
7079/**
7080 * htmlCtxtReadIO:
7081 * @ctxt: an HTML parser context
7082 * @ioread: an I/O read function
7083 * @ioclose: an I/O close function
7084 * @ioctx: an I/O handler
7085 * @URL: the base URL to use for the document
7086 * @encoding: the document encoding, or NULL
7087 * @options: a combination of htmlParserOption(s)
7088 *
7089 * parse an HTML document from I/O functions and source and build a tree.
7090 * This reuses the existing @ctxt parser context
7091 *
7092 * Returns the resulting document tree
7093 */
7094htmlDocPtr
7095htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7096 xmlInputCloseCallback ioclose, void *ioctx,
7097 const char *URL,
7098 const char *encoding, int options)
7099{
7100 xmlParserInputBufferPtr input;
7101 xmlParserInputPtr stream;
7102
7103 if (ioread == NULL)
7104 return (NULL);
7105 if (ctxt == NULL)
7106 return (NULL);
7107 xmlInitParser();
7108
7109 htmlCtxtReset(ctxt);
7110
7111 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7112 XML_CHAR_ENCODING_NONE);
7113 if (input == NULL) {
7114 if (ioclose != NULL)
7115 ioclose(ioctx);
7116 return (NULL);
7117 }
7118 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7119 if (stream == NULL) {
7120 xmlFreeParserInputBuffer(input);
7121 return (NULL);
7122 }
7123 inputPush(ctxt, stream);
7124 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7125}
7126
7127#define bottom_HTMLparser
7128#include "elfgcchack.h"
7129#endif /* LIBXML_HTML_ENABLED */