Blame - HTMLparser.c - platform/external/libxml2

blob: c9a64c780d9f4fb24aaa56acf5bdaeebee2d10e6 [file] [log] [blame]

Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
				6	* daniel@veillard.com
				7	*/
				8
				9	#define IN_LIBXML
				10	#include "libxml.h"
				11	#ifdef LIBXML_HTML_ENABLED
				12
				13	#include <string.h>
				14	#ifdef HAVE_CTYPE_H
				15	#include <ctype.h>
				16	#endif
				17	#ifdef HAVE_STDLIB_H
				18	#include <stdlib.h>
				19	#endif
				20	#ifdef HAVE_SYS_STAT_H
				21	#include <sys/stat.h>
				22	#endif
				23	#ifdef HAVE_FCNTL_H
				24	#include <fcntl.h>
				25	#endif
				26	#ifdef HAVE_UNISTD_H
				27	#include <unistd.h>
				28	#endif
				29	#ifdef LIBXML_ZLIB_ENABLED
				30	#include <zlib.h>
				31	#endif
				32
				33	#include <libxml/xmlmemory.h>
				34	#include <libxml/tree.h>
				35	#include <libxml/parser.h>
				36	#include <libxml/parserInternals.h>
				37	#include <libxml/xmlerror.h>
				38	#include <libxml/HTMLparser.h>
				39	#include <libxml/HTMLtree.h>
				40	#include <libxml/entities.h>
				41	#include <libxml/encoding.h>
				42	#include <libxml/valid.h>
				43	#include <libxml/xmlIO.h>
				44	#include <libxml/globals.h>
				45	#include <libxml/uri.h>
				46
				47	#include "buf.h"
				48	#include "enc.h"
				49
				50	#define HTML_MAX_NAMELEN 1000
				51	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				52	#define HTML_PARSER_BUFFER_SIZE 100
				53
				54	/* #define DEBUG */
				55	/* #define DEBUG_PUSH */
				56
				57	static int htmlOmittedDefaultValue = 1;
				58
				59	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				60	xmlChar end, xmlChar end2, xmlChar end3);
				61	static void htmlParseComment(htmlParserCtxtPtr ctxt);
				62
				63	/************************************************************************
				64	* *
				65	* Some factorized error routines *
				66	* *
				67	************************************************************************/
				68
				69	/**
				70	* htmlErrMemory:
				71	* @ctxt: an HTML parser context
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	72	* @extra: extra information
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	73	*
				74	* Handle a redefinition of attribute error
				75	*/
				76	static void
				77	htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
				78	{
				79	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
				80	(ctxt->instate == XML_PARSER_EOF))
				81	return;
				82	if (ctxt != NULL) {
				83	ctxt->errNo = XML_ERR_NO_MEMORY;
				84	ctxt->instate = XML_PARSER_EOF;
				85	ctxt->disableSAX = 1;
				86	}
				87	if (extra)
				88	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
				89	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
				90	NULL, NULL, 0, 0,
				91	"Memory allocation failed : %s\n", extra);
				92	else
				93	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
				94	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
				95	NULL, NULL, 0, 0, "Memory allocation failed\n");
				96	}
				97
				98	/**
				99	* htmlParseErr:
				100	* @ctxt: an HTML parser context
				101	* @error: the error number
				102	* @msg: the error message
				103	* @str1: string infor
				104	* @str2: string infor
				105	*
				106	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
				107	*/
				108	static void LIBXML_ATTR_FORMAT(3,0)
				109	htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
				110	const char msg, const xmlChar str1, const xmlChar *str2)
				111	{
				112	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
				113	(ctxt->instate == XML_PARSER_EOF))
				114	return;
				115	if (ctxt != NULL)
				116	ctxt->errNo = error;
				117	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
				118	XML_ERR_ERROR, NULL, 0,
				119	(const char ) str1, (const char ) str2,
				120	NULL, 0, 0,
				121	msg, str1, str2);
				122	if (ctxt != NULL)
				123	ctxt->wellFormed = 0;
				124	}
				125
				126	/**
				127	* htmlParseErrInt:
				128	* @ctxt: an HTML parser context
				129	* @error: the error number
				130	* @msg: the error message
				131	* @val: integer info
				132	*
				133	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
				134	*/
				135	static void LIBXML_ATTR_FORMAT(3,0)
				136	htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
				137	const char *msg, int val)
				138	{
				139	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
				140	(ctxt->instate == XML_PARSER_EOF))
				141	return;
				142	if (ctxt != NULL)
				143	ctxt->errNo = error;
				144	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
				145	XML_ERR_ERROR, NULL, 0, NULL, NULL,
				146	NULL, val, 0, msg, val);
				147	if (ctxt != NULL)
				148	ctxt->wellFormed = 0;
				149	}
				150
				151	/************************************************************************
				152	* *
				153	* Parser stacks related functions and macros *
				154	* *
				155	************************************************************************/
				156
				157	/**
				158	* htmlnamePush:
				159	* @ctxt: an HTML parser context
				160	* @value: the element name
				161	*
				162	* Pushes a new element name on top of the name stack
				163	*
				164	* Returns 0 in case of error, the index in the stack otherwise
				165	*/
				166	static int
				167	htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
				168	{
				169	if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
				170	ctxt->html = 3;
				171	if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
				172	ctxt->html = 10;
				173	if (ctxt->nameNr >= ctxt->nameMax) {
				174	ctxt->nameMax *= 2;
				175	ctxt->nameTab = (const xmlChar * *)
				176	xmlRealloc((xmlChar * *)ctxt->nameTab,
				177	ctxt->nameMax *
				178	sizeof(ctxt->nameTab[0]));
				179	if (ctxt->nameTab == NULL) {
				180	htmlErrMemory(ctxt, NULL);
				181	return (0);
				182	}
				183	}
				184	ctxt->nameTab[ctxt->nameNr] = value;
				185	ctxt->name = value;
				186	return (ctxt->nameNr++);
				187	}
				188	/**
				189	* htmlnamePop:
				190	* @ctxt: an HTML parser context
				191	*
				192	* Pops the top element name from the name stack
				193	*
				194	* Returns the name just removed
				195	*/
				196	static const xmlChar *
				197	htmlnamePop(htmlParserCtxtPtr ctxt)
				198	{
				199	const xmlChar *ret;
				200
				201	if (ctxt->nameNr <= 0)
				202	return (NULL);
				203	ctxt->nameNr--;
				204	if (ctxt->nameNr < 0)
				205	return (NULL);
				206	if (ctxt->nameNr > 0)
				207	ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
				208	else
				209	ctxt->name = NULL;
				210	ret = ctxt->nameTab[ctxt->nameNr];
				211	ctxt->nameTab[ctxt->nameNr] = NULL;
				212	return (ret);
				213	}
				214
				215	/**
				216	* htmlNodeInfoPush:
				217	* @ctxt: an HTML parser context
				218	* @value: the node info
				219	*
				220	* Pushes a new element name on top of the node info stack
				221	*
				222	* Returns 0 in case of error, the index in the stack otherwise
				223	*/
				224	static int
				225	htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
				226	{
				227	if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
				228	if (ctxt->nodeInfoMax == 0)
				229	ctxt->nodeInfoMax = 5;
				230	ctxt->nodeInfoMax *= 2;
				231	ctxt->nodeInfoTab = (htmlParserNodeInfo *)
				232	xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
				233	ctxt->nodeInfoMax *
				234	sizeof(ctxt->nodeInfoTab[0]));
				235	if (ctxt->nodeInfoTab == NULL) {
				236	htmlErrMemory(ctxt, NULL);
				237	return (0);
				238	}
				239	}
				240	ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
				241	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
				242	return (ctxt->nodeInfoNr++);
				243	}
				244
				245	/**
				246	* htmlNodeInfoPop:
				247	* @ctxt: an HTML parser context
				248	*
				249	* Pops the top element name from the node info stack
				250	*
				251	* Returns 0 in case of error, the pointer to NodeInfo otherwise
				252	*/
				253	static htmlParserNodeInfo *
				254	htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
				255	{
				256	if (ctxt->nodeInfoNr <= 0)
				257	return (NULL);
				258	ctxt->nodeInfoNr--;
				259	if (ctxt->nodeInfoNr < 0)
				260	return (NULL);
				261	if (ctxt->nodeInfoNr > 0)
				262	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
				263	else
				264	ctxt->nodeInfo = NULL;
				265	return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
				266	}
				267
				/*
				 * Macros for accessing the content. Those should be used only by the parser,
				 * and not exported. All of them expect a variable named `ctxt` in scope.
				 *
				 * Dirty macros, i.e. one needs to make assumptions on the context to use them
				 *
				 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
				 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
				 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				 *           in UNICODE mode. This should be used internally by the parser
				 *           only to compare to ASCII values otherwise it would break when
				 *           running with UTF-8 encoding.
				 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
				 *           to compare on ASCII based substring.
				 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
				 *           it should be used only to compare on ASCII based substring.
				 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				 *           strings without newlines within the parser.
				 *
				 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				 *
				 *   CURRENT Returns the current char value, with the full decoding of
				 *           UTF-8 if we are using this mode. It returns an int.
				 *   NEXT    Skip to the next character, this does the proper decoding
				 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
				 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
				 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				 *
				 * NOTE: SHRINK, GROW and COPY_BUF expand to a bare `if` statement; do
				 * not use them as the unbraced body of an `if` that has an `else`.
				 */

				#define UPPER (toupper(*ctxt->input->cur))

				#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

				#define NXT(val) (ctxt->input->cur[(val)])

				#define UPP(val) (toupper(ctxt->input->cur[(val)]))

				#define CUR_PTR ctxt->input->cur
				#define BASE_PTR ctxt->input->base

				#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
				                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
				        xmlParserInputShrink(ctxt->input)

				#define GROW if ((ctxt->progressive == 0) && \
				                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
				        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

				#define CURRENT ((int) (*ctxt->input->cur))

				#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

				/* Imported from XML */

				/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
				#define CUR ((int) (*ctxt->input->cur))
				#define NEXT xmlNextChar(ctxt)

				#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


				/* Advance by l bytes, keeping line/col in sync and clearing ctxt->token. */
				#define NEXTL(l) do { \
				    if (*(ctxt->input->cur) == '\n') { \
				        ctxt->input->line++; ctxt->input->col = 1; \
				    } else ctxt->input->col++; \
				    ctxt->token = 0; ctxt->input->cur += (l); \
				  } while (0)

				/************
				    \
				    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				 ************/

				#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

				#define COPY_BUF(l,b,i,v) \
				    if (l == 1) b[i++] = (xmlChar) v; \
				    else i += xmlCopyChar(l,&b[i],v)
				348
				349	/**
				350	* htmlFindEncoding:
				351	* @the HTML parser context
				352	*
				353	* Ty to find and encoding in the current data available in the input
				354	* buffer this is needed to try to switch to the proper encoding when
				355	* one face a character error.
				356	* That's an heuristic, since it's operating outside of parsing it could
				357	* try to use a meta which had been commented out, that's the reason it
				358	* should only be used in case of error, not as a default.
				359	*
				360	* Returns an encoding string or NULL if not found, the string need to
				361	* be freed
				362	*/
				363	static xmlChar *
				364	htmlFindEncoding(xmlParserCtxtPtr ctxt) {
				365	const xmlChar start, cur, *end;
				366
				367	if ((ctxt == NULL) \|\| (ctxt->input == NULL) \|\|
				368	(ctxt->input->encoding != NULL) \|\| (ctxt->input->buf == NULL) \|\|
				369	(ctxt->input->buf->encoder != NULL))
				370	return(NULL);
				371	if ((ctxt->input->cur == NULL) \|\| (ctxt->input->end == NULL))
				372	return(NULL);
				373
				374	start = ctxt->input->cur;
				375	end = ctxt->input->end;
				376	/* we also expect the input buffer to be zero terminated */
				377	if (*end != 0)
				378	return(NULL);
				379
				380	cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
				381	if (cur == NULL)
				382	return(NULL);
				383	cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
				384	if (cur == NULL)
				385	return(NULL);
				386	cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
				387	if (cur == NULL)
				388	return(NULL);
				389	cur += 8;
				390	start = cur;
				391	while (((cur >= 'A') && (cur <= 'Z')) \|\|
				392	((cur >= 'a') && (cur <= 'z')) \|\|
				393	((cur >= '0') && (cur <= '9')) \|\|
				394	(cur == '-') \|\| (cur == '_') \|\| (cur == ':') \|\| (cur == '/'))
				395	cur++;
				396	if (cur == start)
				397	return(NULL);
				398	return(xmlStrndup(start, cur - start));
				399	}
				400
				401	/**
				402	* htmlCurrentChar:
				403	* @ctxt: the HTML parser context
				404	* @len: pointer to the length of the char read
				405	*
				406	* The current char value, if using UTF-8 this may actually span multiple
				407	* bytes in the input buffer. Implement the end of line normalization:
				408	* 2.11 End-of-Line Handling
				409	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				410	* char, then the encoding converter is plugged in automatically.
				411	*
				412	* Returns the current char value and its length
				413	*/
				414
				415	static int
				416	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	417	const unsigned char *cur;
				418	unsigned char c;
				419	unsigned int val;
				420
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	421	if (ctxt->instate == XML_PARSER_EOF)
				422	return(0);
				423
				424	if (ctxt->token != 0) {
				425	*len = 0;
				426	return(ctxt->token);
				427	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	428	if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	429	xmlChar * guess;
				430	xmlCharEncodingHandlerPtr handler;
				431
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	432	/*
				433	* Assume it's a fixed length encoding (1) with
				434	* a compatible encoding for the ASCII set, since
				435	* HTML constructs only use < 128 chars
				436	*/
				437	if ((int) *ctxt->input->cur < 0x80) {
				438	*len = 1;
				439	if ((*ctxt->input->cur == 0) &&
				440	(ctxt->input->cur < ctxt->input->end)) {
				441	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				442	"Char 0x%X out of allowed range\n", 0);
				443	return(' ');
				444	}
				445	return((int) *ctxt->input->cur);
				446	}
				447
				448	/*
				449	* Humm this is bad, do an automatic flow conversion
				450	*/
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	451	guess = htmlFindEncoding(ctxt);
				452	if (guess == NULL) {
				453	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				454	} else {
				455	if (ctxt->input->encoding != NULL)
				456	xmlFree((xmlChar *) ctxt->input->encoding);
				457	ctxt->input->encoding = guess;
				458	handler = xmlFindCharEncodingHandler((const char *) guess);
				459	if (handler != NULL) {
Haibo Huang	735158e	2021-02-23 17:48:08 -0800	[diff] [blame^]	460	/*
				461	* Don't use UTF-8 encoder which isn't required and
				462	* can produce invalid UTF-8.
				463	*/
				464	if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
				465	xmlSwitchToEncoding(ctxt, handler);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	466	} else {
				467	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				468	"Unsupported encoding %s", guess, NULL);
				469	}
				470	}
				471	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				472	}
				473
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	474	/*
				475	* We are supposed to handle UTF8, check it's valid
				476	* From rfc2044: encoding of the Unicode values on UTF-8:
				477	*
				478	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				479	* 0000 0000-0000 007F 0xxxxxxx
				480	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				481	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				482	*
				483	* Check for the 0x110000 limit too
				484	*/
				485	cur = ctxt->input->cur;
				486	c = *cur;
				487	if (c & 0x80) {
				488	if ((c & 0x40) == 0)
				489	goto encoding_error;
				490	if (cur[1] == 0) {
				491	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				492	cur = ctxt->input->cur;
				493	}
				494	if ((cur[1] & 0xc0) != 0x80)
				495	goto encoding_error;
				496	if ((c & 0xe0) == 0xe0) {
				497
				498	if (cur[2] == 0) {
				499	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				500	cur = ctxt->input->cur;
				501	}
				502	if ((cur[2] & 0xc0) != 0x80)
				503	goto encoding_error;
				504	if ((c & 0xf0) == 0xf0) {
				505	if (cur[3] == 0) {
				506	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				507	cur = ctxt->input->cur;
				508	}
				509	if (((c & 0xf8) != 0xf0) \|\|
				510	((cur[3] & 0xc0) != 0x80))
				511	goto encoding_error;
				512	/* 4-byte code */
				513	*len = 4;
				514	val = (cur[0] & 0x7) << 18;
				515	val \|= (cur[1] & 0x3f) << 12;
				516	val \|= (cur[2] & 0x3f) << 6;
				517	val \|= cur[3] & 0x3f;
				518	if (val < 0x10000)
				519	goto encoding_error;
				520	} else {
				521	/* 3-byte code */
				522	*len = 3;
				523	val = (cur[0] & 0xf) << 12;
				524	val \|= (cur[1] & 0x3f) << 6;
				525	val \|= cur[2] & 0x3f;
				526	if (val < 0x800)
				527	goto encoding_error;
				528	}
				529	} else {
				530	/* 2-byte code */
				531	*len = 2;
				532	val = (cur[0] & 0x1f) << 6;
				533	val \|= cur[1] & 0x3f;
				534	if (val < 0x80)
				535	goto encoding_error;
				536	}
				537	if (!IS_CHAR(val)) {
				538	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				539	"Char 0x%X out of allowed range\n", val);
				540	}
				541	return(val);
				542	} else {
				543	if ((*ctxt->input->cur == 0) &&
				544	(ctxt->input->cur < ctxt->input->end)) {
				545	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				546	"Char 0x%X out of allowed range\n", 0);
				547	*len = 1;
				548	return(' ');
				549	}
				550	/* 1-byte code */
				551	*len = 1;
				552	return((int) *ctxt->input->cur);
				553	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	554
				555	encoding_error:
				556	/*
				557	* If we detect an UTF8 error that probably mean that the
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	558	* input encoding didn't get properly advertised in the
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	559	* declaration header. Report the error and switch the encoding
				560	* to ISO-Latin-1 (if you don't like this policy, just declare the
				561	* encoding !)
				562	*/
				563	{
				564	char buffer[150];
				565
				566	if (ctxt->input->end - ctxt->input->cur >= 4) {
				567	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				568	ctxt->input->cur[0], ctxt->input->cur[1],
				569	ctxt->input->cur[2], ctxt->input->cur[3]);
				570	} else {
				571	snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
				572	}
				573	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				574	"Input is not proper UTF-8, indicate encoding !\n",
				575	BAD_CAST buffer, NULL);
				576	}
				577
Haibo Huang	735158e	2021-02-23 17:48:08 -0800	[diff] [blame^]	578	/*
				579	* Don't switch encodings twice. Note that if there's an encoder, we
				580	* shouldn't receive invalid UTF-8 anyway.
				581	*
				582	* Note that if ctxt->input->buf == NULL, switching encodings is
				583	* impossible, see Gitlab issue #34.
				584	*/
				585	if ((ctxt->input->buf != NULL) &&
				586	(ctxt->input->buf->encoder == NULL))
				587	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	588	*len = 1;
				589	return((int) *ctxt->input->cur);
				590	}
				591
				592	/**
				593	* htmlSkipBlankChars:
				594	* @ctxt: the HTML parser context
				595	*
				596	* skip all blanks character found at that point in the input streams.
				597	*
				598	* Returns the number of space chars skipped
				599	*/
				600
				601	static int
				602	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				603	int res = 0;
				604
				605	while (IS_BLANK_CH(*(ctxt->input->cur))) {
				606	if ((*ctxt->input->cur == 0) &&
				607	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				608	xmlPopInput(ctxt);
				609	} else {
				610	if (*(ctxt->input->cur) == '\n') {
				611	ctxt->input->line++; ctxt->input->col = 1;
				612	} else ctxt->input->col++;
				613	ctxt->input->cur++;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	614	if (*ctxt->input->cur == 0)
				615	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				616	}
				617	res++;
				618	}
				619	return(res);
				620	}
				621
				622
				623
				624	/************************************************************************
				625	* *
				626	* The list of HTML elements and their properties *
				627	* *
				628	************************************************************************/
				629
				630	/*
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	631	* Start Tag: 1 means the start tag can be omitted
				632	* End Tag: 1 means the end tag can be omitted
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	633	* 2 means it's forbidden (empty elements)
				634	* 3 means the tag is stylistic and should be closed easily
				635	* Depr: this element is deprecated
				636	* DTD: 1 means that this element is valid only in the Loose DTD
				637	* 2 means that this element is valid only in the Frameset DTD
				638	*
				639	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
				640	, subElements , impliedsubelt , Attributes, userdata
				641	*/
				642
				643	/* Definitions and a couple of vars for HTML Elements */
				644
				/*
				 * Element name groups and the number of names in each group.
				 * The NB_* sums are parenthesized so they can be used safely inside
				 * larger expressions. PCDATA and MODIFIER expand to nothing.
				 */
				#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
				#define NB_FONTSTYLE 8
				#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
				#define NB_PHRASE 10
				#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
				#define NB_SPECIAL 16
				#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
				#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
				#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
				#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
				#define FORMCTRL "input", "select", "textarea", "label", "button"
				#define NB_FORMCTRL 5
				#define PCDATA
				#define NB_PCDATA 0
				#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
				#define NB_HEADING 6
				#define LIST "ul", "ol", "dir", "menu"
				#define NB_LIST 4
				#define MODIFIER
				#define NB_MODIFIER 0
				#define FLOW BLOCK,INLINE
				#define NB_FLOW (NB_BLOCK + NB_INLINE)
				#define EMPTY NULL


				/* NULL-terminated name lists built from the groups above */
				static const char* const html_flow[] = { FLOW, NULL } ;
				static const char* const html_inline[] = { INLINE, NULL } ;

				/* placeholders: elts with content but no subelements */
				static const char* const html_pcdata[] = { NULL } ;
				#define html_cdata html_pcdata
				676
				677
				678	/* ... and for HTML Attributes */
				679
				/*
				 * Attribute name groups and the number of names in each group.
				 * NB_ATTRS is the size of ATTRS (core + i18n + event attributes).
				 */
				#define COREATTRS "id", "class", "style", "title"
				#define NB_COREATTRS 4
				#define I18N "lang", "dir"
				#define NB_I18N 2
				#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
				#define NB_EVENTS 9
				#define ATTRS COREATTRS,I18N,EVENTS
				#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
				#define CELLHALIGN "align", "char", "charoff"
				#define NB_CELLHALIGN 3
				#define CELLVALIGN "valign"
				#define NB_CELLVALIGN 1

				/* NULL-terminated attribute lists built from the groups above */
				static const char* const html_attrs[] = { ATTRS, NULL } ;
				static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
				static const char* const core_attrs[] = { COREATTRS, NULL } ;
				static const char* const i18n_attrs[] = { I18N, NULL } ;
				697
				698
				699	/* Other declarations that should go inline ... */
				700	static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
				701	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
				702	"tabindex", "onfocus", "onblur", NULL } ;
				703	static const char* const target_attr[] = { "target", NULL } ;
				704	static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
				705	static const char* const alt_attr[] = { "alt", NULL } ;
				706	static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
				707	static const char* const href_attrs[] = { "href", NULL } ;
				708	static const char* const clear_attrs[] = { "clear", NULL } ;
				709	static const char* const inline_p[] = { INLINE, "p", NULL } ;
				710
				711	static const char* const flow_param[] = { FLOW, "param", NULL } ;
				712	static const char* const applet_attrs[] = { COREATTRS , "codebase",
				713	"archive", "alt", "name", "height", "width", "align",
				714	"hspace", "vspace", NULL } ;
				715	static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
				716	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
				717	static const char* const basefont_attrs[] =
				718	{ "id", "size", "color", "face", NULL } ;
				719	static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
				720	static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
				721	static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
				722	static const char* const body_depr[] = { "background", "bgcolor", "text",
				723	"link", "vlink", "alink", NULL } ;
				724	static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
				725	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
				726
				727
				728	static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
				729	static const char* const col_elt[] = { "col", NULL } ;
				730	static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
				731	static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
				732	static const char* const dl_contents[] = { "dt", "dd", NULL } ;
				733	static const char* const compact_attr[] = { "compact", NULL } ;
				734	static const char* const label_attr[] = { "label", NULL } ;
				735	static const char* const fieldset_contents[] = { FLOW, "legend" } ;
				736	static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
				737	static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
				738	static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
				739	static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
				740	static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
				741	static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
				742	static const char* const head_attrs[] = { I18N, "profile", NULL } ;
				743	static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
				744	static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
				745	static const char* const version_attr[] = { "version", NULL } ;
				746	static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
				747	static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
				748	static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
				749	static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
				750	static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
				751	static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
				752	static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
				753	static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
				754	static const char* const align_attr[] = { "align", NULL } ;
				755	static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
				756	static const char* const map_contents[] = { BLOCK, "area", NULL } ;
				757	static const char* const name_attr[] = { "name", NULL } ;
				758	static const char* const action_attr[] = { "action", NULL } ;
				759	static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
				760	static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
				761	static const char* const content_attr[] = { "content", NULL } ;
				762	static const char* const type_attr[] = { "type", NULL } ;
				763	static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
				764	static const char* const object_contents[] = { FLOW, "param", NULL } ;
				765	static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
				766	static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
				767	static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
				768	static const char* const option_elt[] = { "option", NULL } ;
				769	static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
				770	static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
				771	static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
				772	static const char* const width_attr[] = { "width", NULL } ;
				773	static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
				774	static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
				775	static const char* const language_attr[] = { "language", NULL } ;
				776	static const char* const select_content[] = { "optgroup", "option", NULL } ;
				777	static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
				778	static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
				779	static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
				780	static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
				781	static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
				782	static const char* const tr_elt[] = { "tr", NULL } ;
				783	static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
				784	static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
				785	static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
				786	static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
				787	static const char* const tr_contents[] = { "th", "td", NULL } ;
				788	static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
				789	static const char* const li_elt[] = { "li", NULL } ;
				790	static const char* const ul_depr[] = { "type", "compact", NULL} ;
				791	static const char* const dir_attr[] = { "dir", NULL} ;
				792
				793	#define DECL (const char**)
				794
				795	static const htmlElemDesc
				796	html40ElementTable[] = {
				797	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
				798	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
				799	},
				800	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
				801	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				802	},
				803	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
				804	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				805	},
				806	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
				807	DECL inline_p , NULL , DECL html_attrs, NULL, NULL
				808	},
				809	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
				810	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
				811	},
				812	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
				813	EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
				814	},
				815	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
				816	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				817	},
				818	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
				819	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
				820	},
				821	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
				822	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
				823	},
				824	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
				825	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
				826	},
				827	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
				828	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				829	},
				830	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
				831	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
				832	},
				833	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
				834	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
				835	},
				836	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
				837	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
				838	},
				839	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
				840	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
				841	},
				842	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
				843	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				844	},
				845	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
				846	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
				847	},
				848	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
				849	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				850	},
				851	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
				852	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				853	},
				854	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
				855	EMPTY , NULL , DECL col_attrs , NULL, NULL
				856	},
				857	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
				858	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
				859	},
				860	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
				861	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
				862	},
				863	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
				864	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
				865	},
				866	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
				867	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				868	},
				869	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
				870	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
				871	},
				872	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
				873	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
				874	},
				875	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
				876	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
				877	},
				878	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
				879	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				880	},
				881	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
				882	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				883	},
				884	{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
				885	EMPTY, NULL, DECL embed_attrs, NULL, NULL
				886	},
				887	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
				888	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
				889	},
				890	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
				891	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
				892	},
				893	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
				894	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
				895	},
				896	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
				897	EMPTY, NULL, NULL, DECL frame_attrs, NULL
				898	},
				899	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
				900	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
				901	},
				902	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
				903	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				904	},
				905	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
				906	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				907	},
				908	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
				909	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				910	},
				911	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
				912	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				913	},
				914	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
				915	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				916	},
				917	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
				918	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				919	},
				920	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
				921	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
				922	},
				923	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
				924	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
				925	},
				926	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
				927	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
				928	},
				929	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
				930	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				931	},
				932	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
				933	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
				934	},
				935	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
				936	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
				937	},
				938	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
				939	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
				940	},
				941	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
				942	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
				943	},
				944	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
				945	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
				946	},
				947	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
				948	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				949	},
				950	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
				951	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
				952	},
				953	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
				954	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
				955	},
				956	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
				957	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
				958	},
				959	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
				960	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
				961	},
				962	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
				963	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
				964	},
				965	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
				966	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
				967	},
				968	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
				969	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
				970	},
				971	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
				972	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
				973	},
				974	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
				975	DECL html_flow, "div", DECL html_attrs, NULL, NULL
				976	},
				977	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
				978	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
				979	},
				980	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
				981	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
				982	},
				983	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
				984	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
				985	},
				986	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
				987	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
				988	},
				989	{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
				990	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				991	},
				992	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
				993	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
				994	},
				995	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
				996	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
				997	},
				998	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
				999	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
				1000	},
				1001	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
				1002	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
				1003	},
				1004	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
				1005	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1006	},
				1007	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
				1008	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
				1009	},
				1010	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
				1011	DECL select_content, NULL, DECL select_attrs, NULL, NULL
				1012	},
				1013	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
				1014	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1015	},
				1016	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
				1017	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1018	},
				1019	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
				1020	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
				1021	},
				1022	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
				1023	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1024	},
				1025	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
				1026	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
				1027	},
				1028	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
				1029	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1030	},
				1031	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
				1032	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1033	},
				1034	{ "table", 0, 0, 0, 0, 0, 0, 0, "",
				1035	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
				1036	},
				1037	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
				1038	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
				1039	},
				1040	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
				1041	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
				1042	},
				1043	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
				1044	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
				1045	},
				1046	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
				1047	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
				1048	},
				1049	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
				1050	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
				1051	},
				1052	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
				1053	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
				1054	},
				1055	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
				1056	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
				1057	},
				1058	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
				1059	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
				1060	},
				1061	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
				1062	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1063	},
				1064	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
				1065	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
				1066	},
				1067	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
				1068	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
				1069	},
				1070	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
				1071	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1072	}
				1073	};
				1074
				1075	/*
				1076	* start tags that imply the end of current element
				1077	*/
				1078	static const char * const htmlStartClose[] = {
				1079	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				1080	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				1081	"listing", "xmp", "head", NULL,
				1082	"head", "p", NULL,
				1083	"title", "p", NULL,
				1084	"body", "head", "style", "link", "title", "p", NULL,
				1085	"frameset", "head", "style", "link", "title", "p", NULL,
				1086	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				1087	"pre", "listing", "xmp", "head", "li", NULL,
				1088	"hr", "p", "head", NULL,
				1089	"h1", "p", "head", NULL,
				1090	"h2", "p", "head", NULL,
				1091	"h3", "p", "head", NULL,
				1092	"h4", "p", "head", NULL,
				1093	"h5", "p", "head", NULL,
				1094	"h6", "p", "head", NULL,
				1095	"dir", "p", "head", NULL,
				1096	"address", "p", "head", "ul", NULL,
				1097	"pre", "p", "head", "ul", NULL,
				1098	"listing", "p", "head", NULL,
				1099	"xmp", "p", "head", NULL,
				1100	"blockquote", "p", "head", NULL,
				1101	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				1102	"xmp", "head", NULL,
				1103	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				1104	"head", "dd", NULL,
				1105	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				1106	"head", "dt", NULL,
				1107	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				1108	"listing", "xmp", NULL,
				1109	"ol", "p", "head", "ul", NULL,
				1110	"menu", "p", "head", "ul", NULL,
				1111	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
				1112	"div", "p", "head", NULL,
				1113	"noscript", "script", NULL,
				1114	"center", "font", "b", "i", "p", "head", NULL,
				1115	"a", "a", "head", NULL,
				1116	"caption", "p", NULL,
				1117	"colgroup", "caption", "colgroup", "col", "p", NULL,
				1118	"col", "caption", "col", "p", NULL,
				1119	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				1120	"listing", "xmp", "a", NULL,
				1121	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				1122	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				1123	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				1124	"thead", "caption", "col", "colgroup", NULL,
				1125	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				1126	"tbody", "p", NULL,
				1127	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				1128	"tfoot", "tbody", "p", NULL,
				1129	"optgroup", "option", NULL,
				1130	"option", "option", NULL,
				1131	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				1132	"pre", "listing", "xmp", "a", NULL,
				1133	/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */
				1134	"tt", "head", NULL,
				1135	"i", "head", NULL,
				1136	"b", "head", NULL,
				1137	"u", "head", NULL,
				1138	"s", "head", NULL,
				1139	"strike", "head", NULL,
				1140	"big", "head", NULL,
				1141	"small", "head", NULL,
				1142
				1143	"em", "head", NULL,
				1144	"strong", "head", NULL,
				1145	"dfn", "head", NULL,
				1146	"code", "head", NULL,
				1147	"samp", "head", NULL,
				1148	"kbd", "head", NULL,
				1149	"var", "head", NULL,
				1150	"cite", "head", NULL,
				1151	"abbr", "head", NULL,
				1152	"acronym", "head", NULL,
				1153
				1154	/* "a" */
				1155	"img", "head", NULL,
				1156	/* "applet" */
				1157	/* "embed" */
				1158	/* "object" */
				1159	"font", "head", NULL,
				1160	/* "basefont" */
				1161	"br", "head", NULL,
				1162	/* "script" */
				1163	"map", "head", NULL,
				1164	"q", "head", NULL,
				1165	"sub", "head", NULL,
				1166	"sup", "head", NULL,
				1167	"span", "head", NULL,
				1168	"bdo", "head", NULL,
				1169	"iframe", "head", NULL,
				1170	NULL
				1171	};
				1172
				1173	/*
				1174	* The list of HTML elements which are supposed not to have
				1175	* CDATA content and where a p element will be implied
				1176	*
				1177	* TODO: extend that list by reading the HTML SGML DTD on
				1178	* implied paragraph
				1179	*/
				1180	static const char *const htmlNoContentElements[] = {
				1181	"html",
				1182	"head",
				1183	NULL
				1184	};
				1185
				1186	/*
				1187	* The list of HTML attributes which are of content %Script;
				1188	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				1189	* it assumes the name starts with 'on'
				1190	*/
				1191	static const char *const htmlScriptAttributes[] = {
				1192	"onclick",
				1193	"ondblclick",
				1194	"onmousedown",
				1195	"onmouseup",
				1196	"onmouseover",
				1197	"onmousemove",
				1198	"onmouseout",
				1199	"onkeypress",
				1200	"onkeydown",
				1201	"onkeyup",
				1202	"onload",
				1203	"onunload",
				1204	"onfocus",
				1205	"onblur",
				1206	"onsubmit",
				1207	"onreset",
				1208	"onchange",
				1209	"onselect"
				1210	};
				1211
				1212	/*
				1213	* This table is used by the htmlparser to know what to do with
				1214	* broken html pages. By assigning different priorities to different
				1215	* elements the parser can decide how to handle extra endtags.
				1216	* Endtags are only allowed to close elements with lower or equal
				1217	* priority.
				1218	*/
				1219
				1220	typedef struct {
				1221	const char *name;
				1222	int priority;
				1223	} elementPriority;
				1224
				1225	static const elementPriority htmlEndPriority[] = {
				1226	{"div", 150},
				1227	{"td", 160},
				1228	{"th", 160},
				1229	{"tr", 170},
				1230	{"thead", 180},
				1231	{"tbody", 180},
				1232	{"tfoot", 180},
				1233	{"table", 190},
				1234	{"head", 200},
				1235	{"body", 200},
				1236	{"html", 220},
				1237	{NULL, 100} /* Default priority */
				1238	};
				1239
				1240	static const char** htmlStartCloseIndex[100];
				1241	static int htmlStartCloseIndexinitialized = 0;
				1242
				1243	/************************************************************************
				1244	* *
				1245	* functions to handle HTML specific data *
				1246	* *
				1247	************************************************************************/
				1248
				1249	/**
				1250	* htmlInitAutoClose:
				1251	*
				1252	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				1253	* This is not reentrant. Call xmlInitParser() once before processing in
				1254	* case of use in multithreaded programs.
				1255	*/
				1256	void
				1257	htmlInitAutoClose(void) {
				1258	int indx, i = 0;
				1259
				1260	if (htmlStartCloseIndexinitialized) return;
				1261
				1262	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				1263	indx = 0;
				1264	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				1265	htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
				1266	while (htmlStartClose[i] != NULL) i++;
				1267	i++;
				1268	}
				1269	htmlStartCloseIndexinitialized = 1;
				1270	}
				1271
				1272	/**
				1273	* htmlTagLookup:
				1274	* @tag: The tag name in lowercase
				1275	*
				1276	* Lookup the HTML tag in the ElementTable
				1277	*
				1278	* Returns the related htmlElemDescPtr or NULL if not found.
				1279	*/
				1280	const htmlElemDesc *
				1281	htmlTagLookup(const xmlChar *tag) {
				1282	unsigned int i;
				1283
				1284	for (i = 0; i < (sizeof(html40ElementTable) /
				1285	sizeof(html40ElementTable[0]));i++) {
				1286	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
				1287	return((htmlElemDescPtr) &html40ElementTable[i]);
				1288	}
				1289	return(NULL);
				1290	}
				1291
				1292	/**
				1293	* htmlGetEndPriority:
				1294	* @name: The name of the element to look up the priority for.
				1295	*
				1296	* Return value: The "endtag" priority.
				1297	**/
				1298	static int
				1299	htmlGetEndPriority (const xmlChar *name) {
				1300	int i = 0;
				1301
				1302	while ((htmlEndPriority[i].name != NULL) &&
				1303	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				1304	i++;
				1305
				1306	return(htmlEndPriority[i].priority);
				1307	}
				1308
				1309
				1310	/**
				1311	* htmlCheckAutoClose:
				1312	* @newtag: The new tag name
				1313	* @oldtag: The old tag name
				1314	*
				1315	* Checks whether the new tag is one of the registered valid tags for
				1316	* closing old.
				1317	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				1318	*
				1319	* Returns 0 if no, 1 if yes.
				1320	*/
				1321	static int
				1322	htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
				1323	{
				1324	int i, indx;
				1325	const char **closed = NULL;
				1326
				1327	if (htmlStartCloseIndexinitialized == 0)
				1328	htmlInitAutoClose();
				1329
				1330	/* inefficient, but not a big deal */
				1331	for (indx = 0; indx < 100; indx++) {
				1332	closed = htmlStartCloseIndex[indx];
				1333	if (closed == NULL)
				1334	return (0);
				1335	if (xmlStrEqual(BAD_CAST * closed, newtag))
				1336	break;
				1337	}
				1338
				1339	i = closed - htmlStartClose;
				1340	i++;
				1341	while (htmlStartClose[i] != NULL) {
				1342	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				1343	return (1);
				1344	}
				1345	i++;
				1346	}
				1347	return (0);
				1348	}
				1349
				1350	/**
				1351	* htmlAutoCloseOnClose:
				1352	* @ctxt: an HTML parser context
				1353	* @newtag: The new tag name
				1354	* @force: force the tag closure
				1355	*
				1356	* The HTML DTD allows an ending tag to implicitly close other tags.
				1357	*/
				1358	static void
				1359	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
				1360	{
				1361	const htmlElemDesc *info;
				1362	int i, priority;
				1363
				1364	priority = htmlGetEndPriority(newtag);
				1365
				1366	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
				1367
				1368	if (xmlStrEqual(newtag, ctxt->nameTab[i]))
				1369	break;
				1370	/*
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	1371	* A misplaced endtag can only close elements with lower
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	1372	* or equal priority, so if we find an element with higher
				1373	* priority before we find an element with
				1374	* matching name, we just ignore this endtag
				1375	*/
				1376	if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
				1377	return;
				1378	}
				1379	if (i < 0)
				1380	return;
				1381
				1382	while (!xmlStrEqual(newtag, ctxt->name)) {
				1383	info = htmlTagLookup(ctxt->name);
				1384	if ((info != NULL) && (info->endTag == 3)) {
				1385	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
				1386	"Opening and ending tag mismatch: %s and %s\n",
				1387	newtag, ctxt->name);
				1388	}
				1389	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				1390	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				1391	htmlnamePop(ctxt);
				1392	}
				1393	}
				1394
				1395	/**
				1396	* htmlAutoCloseOnEnd:
				1397	* @ctxt: an HTML parser context
				1398	*
				1399	* Close all remaining tags at the end of the stream
				1400	*/
				1401	static void
				1402	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
				1403	{
				1404	int i;
				1405
				1406	if (ctxt->nameNr == 0)
				1407	return;
				1408	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
				1409	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				1410	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				1411	htmlnamePop(ctxt);
				1412	}
				1413	}
				1414
				1415	/**
				1416	* htmlAutoClose:
				1417	* @ctxt: an HTML parser context
				1418	* @newtag: The new tag name or NULL
				1419	*
				1420	* The HTML DTD allows a tag to implicitly close other tags.
				1421	* The list is kept in htmlStartClose array. This function is
				1422	* called when a new tag has been detected and generates the
				1423	* appropriates closes if possible/needed.
				1424	* If newtag is NULL this mean we are at the end of the resource
				1425	* and we should check
				1426	*/
				1427	static void
				1428	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
				1429	{
				1430	while ((newtag != NULL) && (ctxt->name != NULL) &&
				1431	(htmlCheckAutoClose(newtag, ctxt->name))) {
				1432	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				1433	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				1434	htmlnamePop(ctxt);
				1435	}
				1436	if (newtag == NULL) {
				1437	htmlAutoCloseOnEnd(ctxt);
				1438	return;
				1439	}
				1440	while ((newtag == NULL) && (ctxt->name != NULL) &&
				1441	((xmlStrEqual(ctxt->name, BAD_CAST "head")) \|\|
				1442	(xmlStrEqual(ctxt->name, BAD_CAST "body")) \|\|
				1443	(xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
				1444	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				1445	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				1446	htmlnamePop(ctxt);
				1447	}
				1448	}
				1449
				1450	/**
				1451	* htmlAutoCloseTag:
				1452	* @doc: the HTML document
				1453	* @name: The tag name
				1454	* @elem: the HTML element
				1455	*
				1456	* The HTML DTD allows a tag to implicitly close other tags.
				1457	* The list is kept in htmlStartClose array. This function checks
				1458	* if the element or one of it's children would autoclose the
				1459	* given tag.
				1460	*
				1461	* Returns 1 if autoclose, 0 otherwise
				1462	*/
				1463	int
				1464	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				1465	htmlNodePtr child;
				1466
				1467	if (elem == NULL) return(1);
				1468	if (xmlStrEqual(name, elem->name)) return(0);
				1469	if (htmlCheckAutoClose(elem->name, name)) return(1);
				1470	child = elem->children;
				1471	while (child != NULL) {
				1472	if (htmlAutoCloseTag(doc, name, child)) return(1);
				1473	child = child->next;
				1474	}
				1475	return(0);
				1476	}
				1477
				1478	/**
				1479	* htmlIsAutoClosed:
				1480	* @doc: the HTML document
				1481	* @elem: the HTML element
				1482	*
				1483	* The HTML DTD allows a tag to implicitly close other tags.
				1484	* The list is kept in htmlStartClose array. This function checks
				1485	* if a tag is autoclosed by one of it's child
				1486	*
				1487	* Returns 1 if autoclosed, 0 otherwise
				1488	*/
				1489	int
				1490	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				1491	htmlNodePtr child;
				1492
				1493	if (elem == NULL) return(1);
				1494	child = elem->children;
				1495	while (child != NULL) {
				1496	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				1497	child = child->next;
				1498	}
				1499	return(0);
				1500	}
				1501
				1502	/**
				1503	* htmlCheckImplied:
				1504	* @ctxt: an HTML parser context
				1505	* @newtag: The new tag name
				1506	*
				1507	* The HTML DTD allows a tag to exists only implicitly
				1508	* called when a new tag has been detected and generates the
				1509	* appropriates implicit tags if missing
				1510	*/
				1511	static void
				1512	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				1513	int i;
				1514
				1515	if (ctxt->options & HTML_PARSE_NOIMPLIED)
				1516	return;
				1517	if (!htmlOmittedDefaultValue)
				1518	return;
				1519	if (xmlStrEqual(newtag, BAD_CAST"html"))
				1520	return;
				1521	if (ctxt->nameNr <= 0) {
				1522	htmlnamePush(ctxt, BAD_CAST"html");
				1523	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1524	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				1525	}
				1526	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				1527	return;
				1528	if ((ctxt->nameNr <= 1) &&
				1529	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				1530	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				1531	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				1532	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				1533	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				1534	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				1535	if (ctxt->html >= 3) {
				1536	/* we already saw or generated an <head> before */
				1537	return;
				1538	}
				1539	/*
				1540	* dropped OBJECT ... i you put it first BODY will be
				1541	* assumed !
				1542	*/
				1543	htmlnamePush(ctxt, BAD_CAST"head");
				1544	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1545	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				1546	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				1547	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				1548	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				1549	if (ctxt->html >= 10) {
				1550	/* we already saw or generated a <body> before */
				1551	return;
				1552	}
				1553	for (i = 0;i < ctxt->nameNr;i++) {
				1554	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				1555	return;
				1556	}
				1557	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				1558	return;
				1559	}
				1560	}
				1561
				1562	htmlnamePush(ctxt, BAD_CAST"body");
				1563	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1564	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				1565	}
				1566	}
				1567
				1568	/**
				1569	* htmlCheckParagraph
				1570	* @ctxt: an HTML parser context
				1571	*
				1572	* Check whether a p element need to be implied before inserting
				1573	* characters in the current element.
				1574	*
				1575	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				1576	* in case of error.
				1577	*/
				1578
				1579	static int
				1580	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				1581	const xmlChar *tag;
				1582	int i;
				1583
				1584	if (ctxt == NULL)
				1585	return(-1);
				1586	tag = ctxt->name;
				1587	if (tag == NULL) {
				1588	htmlAutoClose(ctxt, BAD_CAST"p");
				1589	htmlCheckImplied(ctxt, BAD_CAST"p");
				1590	htmlnamePush(ctxt, BAD_CAST"p");
				1591	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1592	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				1593	return(1);
				1594	}
				1595	if (!htmlOmittedDefaultValue)
				1596	return(0);
				1597	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				1598	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				1599	htmlAutoClose(ctxt, BAD_CAST"p");
				1600	htmlCheckImplied(ctxt, BAD_CAST"p");
				1601	htmlnamePush(ctxt, BAD_CAST"p");
				1602	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1603	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				1604	return(1);
				1605	}
				1606	}
				1607	return(0);
				1608	}
				1609
				1610	/**
				1611	* htmlIsScriptAttribute:
				1612	* @name: an attribute name
				1613	*
				1614	* Check if an attribute is of content type Script
				1615	*
				1616	* Returns 1 is the attribute is a script 0 otherwise
				1617	*/
				1618	int
				1619	htmlIsScriptAttribute(const xmlChar *name) {
				1620	unsigned int i;
				1621
				1622	if (name == NULL)
				1623	return(0);
				1624	/*
				1625	* all script attributes start with 'on'
				1626	*/
				1627	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1628	return(0);
				1629	for (i = 0;
				1630	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1631	i++) {
				1632	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1633	return(1);
				1634	}
				1635	return(0);
				1636	}
				1637
				1638	/************************************************************************
				1639	* *
				1640	* The list of HTML predefined entities *
				1641	* *
				1642	************************************************************************/
				1643
				1644
/*
 * Table of the predefined HTML entities. Each entry holds the numeric
 * value of the character, the entity name (without '&' and ';') and a
 * human-readable description.
 *
 * Entries are listed in ascending order of value. htmlEntityValueLookup()
 * stops scanning at the first entry whose value is greater than the one
 * searched for, so this ordering must be preserved when entries are added.
 */
static const htmlEntityDesc html40EntitiesTable[] = {
/*
* the 4 absolute ones, plus apostrophe.
*/
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
* A bunch still in the 128-255 range.
* Replacing them really depends on the charset used.
*/
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound","pound sign, U+00A3 ISOnum" },
{ 164, "curren","currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro","micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times","multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide","division sign, U+00F7 ISOnum" },
{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
* Anything below should really be kept as entity references
*/
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde","small tilde, U+02DC ISOdia" },

{ 913, "Alpha","greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa","greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron","greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
				1927
				1928	/************************************************************************
				1929	* *
				1930	* Commodity functions to handle entities *
				1931	* *
				1932	************************************************************************/
				1933
				1934	/*
				1935	* Macro used to grow the current buffer.
				1936	*/
				1937	#define growBuffer(buffer) { \
				1938	xmlChar *tmp; \
				1939	buffer##_size *= 2; \
				1940	tmp = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1941	if (tmp == NULL) { \
				1942	htmlErrMemory(ctxt, "growing buffer\n"); \
				1943	xmlFree(buffer); \
				1944	return(NULL); \
				1945	} \
				1946	buffer = tmp; \
				1947	}
				1948
				1949	/**
				1950	* htmlEntityLookup:
				1951	* @name: the entity name
				1952	*
				1953	* Lookup the given entity in EntitiesTable
				1954	*
				1955	* TODO: the linear scan is really ugly, an hash table is really needed.
				1956	*
				1957	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1958	*/
				1959	const htmlEntityDesc *
				1960	htmlEntityLookup(const xmlChar *name) {
				1961	unsigned int i;
				1962
				1963	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1964	sizeof(html40EntitiesTable[0]));i++) {
				1965	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1966	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
				1967	}
				1968	}
				1969	return(NULL);
				1970	}
				1971
				1972	/**
				1973	* htmlEntityValueLookup:
				1974	* @value: the entity's unicode value
				1975	*
				1976	* Lookup the given entity in EntitiesTable
				1977	*
				1978	* TODO: the linear scan is really ugly, an hash table is really needed.
				1979	*
				1980	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1981	*/
				1982	const htmlEntityDesc *
				1983	htmlEntityValueLookup(unsigned int value) {
				1984	unsigned int i;
				1985
				1986	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1987	sizeof(html40EntitiesTable[0]));i++) {
				1988	if (html40EntitiesTable[i].value >= value) {
				1989	if (html40EntitiesTable[i].value > value)
				1990	break;
				1991	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
				1992	}
				1993	}
				1994	return(NULL);
				1995	}
				1996
				1997	/**
				1998	* UTF8ToHtml:
				1999	* @out: a pointer to an array of bytes to store the result
				2000	* @outlen: the length of @out
				2001	* @in: a pointer to an array of UTF-8 chars
				2002	* @inlen: the length of @in
				2003	*
				2004	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				2005	* plus HTML entities block of chars out.
				2006	*
				2007	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				2008	* The value of @inlen after return is the number of octets consumed
				2009	* as the return value is positive, else unpredictable.
				2010	* The value of @outlen after return is the number of octets consumed.
				2011	*/
				2012	int
				2013	UTF8ToHtml(unsigned char* out, int *outlen,
				2014	const unsigned char* in, int *inlen) {
				2015	const unsigned char* processed = in;
				2016	const unsigned char* outend;
				2017	const unsigned char* outstart = out;
				2018	const unsigned char* instart = in;
				2019	const unsigned char* inend;
				2020	unsigned int c, d;
				2021	int trailing;
				2022
				2023	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL)) return(-1);
				2024	if (in == NULL) {
				2025	/*
				2026	* initialization nothing to do
				2027	*/
				2028	*outlen = 0;
				2029	*inlen = 0;
				2030	return(0);
				2031	}
				2032	inend = in + (*inlen);
				2033	outend = out + (*outlen);
				2034	while (in < inend) {
				2035	d = *in++;
				2036	if (d < 0x80) { c= d; trailing= 0; }
				2037	else if (d < 0xC0) {
				2038	/* trailing byte in leading position */
				2039	*outlen = out - outstart;
				2040	*inlen = processed - instart;
				2041	return(-2);
				2042	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				2043	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				2044	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				2045	else {
				2046	/* no chance for this in Ascii */
				2047	*outlen = out - outstart;
				2048	*inlen = processed - instart;
				2049	return(-2);
				2050	}
				2051
				2052	if (inend - in < trailing) {
				2053	break;
				2054	}
				2055
				2056	for ( ; trailing; trailing--) {
				2057	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				2058	break;
				2059	c <<= 6;
				2060	c \|= d & 0x3F;
				2061	}
				2062
				2063	/* assertion: c is a single UTF-4 value */
				2064	if (c < 0x80) {
				2065	if (out + 1 >= outend)
				2066	break;
				2067	*out++ = c;
				2068	} else {
				2069	int len;
				2070	const htmlEntityDesc * ent;
				2071	const char *cp;
				2072	char nbuf[16];
				2073
				2074	/*
				2075	* Try to lookup a predefined HTML entity for it
				2076	*/
				2077
				2078	ent = htmlEntityValueLookup(c);
				2079	if (ent == NULL) {
				2080	snprintf(nbuf, sizeof(nbuf), "#%u", c);
				2081	cp = nbuf;
				2082	}
				2083	else
				2084	cp = ent->name;
				2085	len = strlen(cp);
				2086	if (out + 2 + len >= outend)
				2087	break;
				2088	*out++ = '&';
				2089	memcpy(out, cp, len);
				2090	out += len;
				2091	*out++ = ';';
				2092	}
				2093	processed = in;
				2094	}
				2095	*outlen = out - outstart;
				2096	*inlen = processed - instart;
				2097	return(0);
				2098	}
				2099
				2100	/**
				2101	* htmlEncodeEntities:
				2102	* @out: a pointer to an array of bytes to store the result
				2103	* @outlen: the length of @out
				2104	* @in: a pointer to an array of UTF-8 chars
				2105	* @inlen: the length of @in
				2106	* @quoteChar: the quote character to escape (' or ") or zero.
				2107	*
				2108	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				2109	* plus HTML entities block of chars out.
				2110	*
				2111	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				2112	* The value of @inlen after return is the number of octets consumed
				2113	* as the return value is positive, else unpredictable.
				2114	* The value of @outlen after return is the number of octets consumed.
				2115	*/
				2116	int
				2117	htmlEncodeEntities(unsigned char* out, int *outlen,
				2118	const unsigned char* in, int *inlen, int quoteChar) {
				2119	const unsigned char* processed = in;
				2120	const unsigned char* outend;
				2121	const unsigned char* outstart = out;
				2122	const unsigned char* instart = in;
				2123	const unsigned char* inend;
				2124	unsigned int c, d;
				2125	int trailing;
				2126
				2127	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL) \|\| (in == NULL))
				2128	return(-1);
				2129	outend = out + (*outlen);
				2130	inend = in + (*inlen);
				2131	while (in < inend) {
				2132	d = *in++;
				2133	if (d < 0x80) { c= d; trailing= 0; }
				2134	else if (d < 0xC0) {
				2135	/* trailing byte in leading position */
				2136	*outlen = out - outstart;
				2137	*inlen = processed - instart;
				2138	return(-2);
				2139	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				2140	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				2141	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				2142	else {
				2143	/* no chance for this in Ascii */
				2144	*outlen = out - outstart;
				2145	*inlen = processed - instart;
				2146	return(-2);
				2147	}
				2148
				2149	if (inend - in < trailing)
				2150	break;
				2151
				2152	while (trailing--) {
				2153	if (((d= *in++) & 0xC0) != 0x80) {
				2154	*outlen = out - outstart;
				2155	*inlen = processed - instart;
				2156	return(-2);
				2157	}
				2158	c <<= 6;
				2159	c \|= d & 0x3F;
				2160	}
				2161
				2162	/* assertion: c is a single UTF-4 value */
				2163	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				2164	(c != '&') && (c != '<') && (c != '>')) {
				2165	if (out >= outend)
				2166	break;
				2167	*out++ = c;
				2168	} else {
				2169	const htmlEntityDesc * ent;
				2170	const char *cp;
				2171	char nbuf[16];
				2172	int len;
				2173
				2174	/*
				2175	* Try to lookup a predefined HTML entity for it
				2176	*/
				2177	ent = htmlEntityValueLookup(c);
				2178	if (ent == NULL) {
				2179	snprintf(nbuf, sizeof(nbuf), "#%u", c);
				2180	cp = nbuf;
				2181	}
				2182	else
				2183	cp = ent->name;
				2184	len = strlen(cp);
				2185	if (out + 2 + len > outend)
				2186	break;
				2187	*out++ = '&';
				2188	memcpy(out, cp, len);
				2189	out += len;
				2190	*out++ = ';';
				2191	}
				2192	processed = in;
				2193	}
				2194	*outlen = out - outstart;
				2195	*inlen = processed - instart;
				2196	return(0);
				2197	}
				2198
				2199	/************************************************************************
				2200	* *
				2201	* Commodity functions to handle streams *
				2202	* *
				2203	************************************************************************/
				2204
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2205	#ifdef LIBXML_PUSH_ENABLED
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2206	/**
				2207	* htmlNewInputStream:
				2208	* @ctxt: an HTML parser context
				2209	*
				2210	* Create a new input stream structure
				2211	* Returns the new input stream or NULL
				2212	*/
				2213	static htmlParserInputPtr
				2214	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				2215	htmlParserInputPtr input;
				2216
				2217	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				2218	if (input == NULL) {
				2219	htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
				2220	return(NULL);
				2221	}
				2222	memset(input, 0, sizeof(htmlParserInput));
				2223	input->filename = NULL;
				2224	input->directory = NULL;
				2225	input->base = NULL;
				2226	input->cur = NULL;
				2227	input->buf = NULL;
				2228	input->line = 1;
				2229	input->col = 1;
				2230	input->buf = NULL;
				2231	input->free = NULL;
				2232	input->version = NULL;
				2233	input->consumed = 0;
				2234	input->length = 0;
				2235	return(input);
				2236	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2237	#endif
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2238
				2239
				2240	/************************************************************************
				2241	* *
				2242	* Commodity functions, cleanup needed ? *
				2243	* *
				2244	************************************************************************/
				2245	/*
				2246	* all tags allowing pc data from the html 4.01 loose dtd
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2247	* NOTE: it might be more appropriate to integrate this information
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2248	* into the html40ElementTable array but I don't want to risk any
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2249	* binary incompatibility
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2250	*/
				2251	static const char *allowPCData[] = {
				2252	"a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
				2253	"blockquote", "body", "button", "caption", "center", "cite", "code",
				2254	"dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
				2255	"h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
				2256	"li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
				2257	"small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
				2258	};
				2259
				2260	/**
				2261	* areBlanks:
				2262	* @ctxt: an HTML parser context
				2263	* @str: a xmlChar *
				2264	* @len: the size of @str
				2265	*
				2266	* Is this a sequence of blank chars that one can ignore ?
				2267	*
				2268	* Returns 1 if ignorable 0 otherwise.
				2269	*/
				2270
				2271	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				2272	unsigned int i;
				2273	int j;
				2274	xmlNodePtr lastChild;
				2275	xmlDtdPtr dtd;
				2276
				2277	for (j = 0;j < len;j++)
				2278	if (!(IS_BLANK_CH(str[j]))) return(0);
				2279
				2280	if (CUR == 0) return(1);
				2281	if (CUR != '<') return(0);
				2282	if (ctxt->name == NULL)
				2283	return(1);
				2284	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				2285	return(1);
				2286	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				2287	return(1);
				2288
				2289	/* Only strip CDATA children of the body tag for strict HTML DTDs */
				2290	if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
				2291	dtd = xmlGetIntSubset(ctxt->myDoc);
				2292	if (dtd != NULL && dtd->ExternalID != NULL) {
				2293	if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") \|\|
				2294	!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
				2295	return(1);
				2296	}
				2297	}
				2298
				2299	if (ctxt->node == NULL) return(0);
				2300	lastChild = xmlGetLastChild(ctxt->node);
				2301	while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
				2302	lastChild = lastChild->prev;
				2303	if (lastChild == NULL) {
				2304	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
				2305	(ctxt->node->content != NULL)) return(0);
				2306	/* keep ws in constructs like ...<b> </b>...
				2307	for all tags "b" allowing PCDATA */
				2308	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
				2309	if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
				2310	return(0);
				2311	}
				2312	}
				2313	} else if (xmlNodeIsText(lastChild)) {
				2314	return(0);
				2315	} else {
				2316	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
				2317	for all tags "p" allowing PCDATA */
				2318	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
				2319	if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
				2320	return(0);
				2321	}
				2322	}
				2323	}
				2324	return(1);
				2325	}
				2326
				2327	/**
				2328	* htmlNewDocNoDtD:
				2329	* @URI: URI for the dtd, or NULL
				2330	* @ExternalID: the external ID of the DTD, or NULL
				2331	*
				2332	* Creates a new HTML document without a DTD node if @URI and @ExternalID
				2333	* are NULL
				2334	*
				2335	* Returns a new document, do not initialize the DTD if not provided
				2336	*/
				2337	htmlDocPtr
				2338	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				2339	xmlDocPtr cur;
				2340
				2341	/*
				2342	* Allocate a new document and fill the fields.
				2343	*/
				2344	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				2345	if (cur == NULL) {
				2346	htmlErrMemory(NULL, "HTML document creation failed\n");
				2347	return(NULL);
				2348	}
				2349	memset(cur, 0, sizeof(xmlDoc));
				2350
				2351	cur->type = XML_HTML_DOCUMENT_NODE;
				2352	cur->version = NULL;
				2353	cur->intSubset = NULL;
				2354	cur->doc = cur;
				2355	cur->name = NULL;
				2356	cur->children = NULL;
				2357	cur->extSubset = NULL;
				2358	cur->oldNs = NULL;
				2359	cur->encoding = NULL;
				2360	cur->standalone = 1;
				2361	cur->compression = 0;
				2362	cur->ids = NULL;
				2363	cur->refs = NULL;
				2364	cur->_private = NULL;
				2365	cur->charset = XML_CHAR_ENCODING_UTF8;
				2366	cur->properties = XML_DOC_HTML \| XML_DOC_USERBUILT;
				2367	if ((ExternalID != NULL) \|\|
				2368	(URI != NULL))
				2369	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
				2370	return(cur);
				2371	}
				2372
				2373	/**
				2374	* htmlNewDoc:
				2375	* @URI: URI for the dtd, or NULL
				2376	* @ExternalID: the external ID of the DTD, or NULL
				2377	*
				2378	* Creates a new HTML document
				2379	*
				2380	* Returns a new document
				2381	*/
				2382	htmlDocPtr
				2383	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				2384	if ((URI == NULL) && (ExternalID == NULL))
				2385	return(htmlNewDocNoDtD(
				2386	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				2387	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
				2388
				2389	return(htmlNewDocNoDtD(URI, ExternalID));
				2390	}
				2391
				2392
				2393	/************************************************************************
				2394	* *
				2395	* The parser itself *
				2396	* Relates to http://www.w3.org/TR/html40 *
				2397	* *
				2398	************************************************************************/
				2399
				2400	/************************************************************************
				2401	* *
				2402	* The parser itself *
				2403	* *
				2404	************************************************************************/
				2405
				2406	static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
				2407
				2408	/**
				2409	* htmlParseHTMLName:
				2410	* @ctxt: an HTML parser context
				2411	*
				2412	* parse an HTML tag or attribute name, note that we convert it to lowercase
				2413	* since HTML names are not case-sensitive.
				2414	*
				2415	* Returns the Tag Name parsed or NULL
				2416	*/
				2417
				2418	static const xmlChar *
				2419	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				2420	int i = 0;
				2421	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				2422
				2423	if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
				2424	(CUR != ':') && (CUR != '.')) return(NULL);
				2425
				2426	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				2427	((IS_ASCII_LETTER(CUR)) \|\| (IS_ASCII_DIGIT(CUR)) \|\|
				2428	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_') \|\|
				2429	(CUR == '.'))) {
				2430	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				2431	else loc[i] = CUR;
				2432	i++;
				2433
				2434	NEXT;
				2435	}
				2436
				2437	return(xmlDictLookup(ctxt->dict, loc, i));
				2438	}
				2439
				2440
				2441	/**
				2442	* htmlParseHTMLName_nonInvasive:
				2443	* @ctxt: an HTML parser context
				2444	*
				2445	* parse an HTML tag or attribute name, note that we convert it to lowercase
				2446	* since HTML names are not case-sensitive, this doesn't consume the data
				2447	* from the stream, it's a look-ahead
				2448	*
				2449	* Returns the Tag Name parsed or NULL
				2450	*/
				2451
				2452	static const xmlChar *
				2453	htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
				2454	int i = 0;
				2455	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				2456
				2457	if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
				2458	(NXT(1) != ':')) return(NULL);
				2459
				2460	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				2461	((IS_ASCII_LETTER(NXT(1+i))) \|\| (IS_ASCII_DIGIT(NXT(1+i))) \|\|
				2462	(NXT(1+i) == ':') \|\| (NXT(1+i) == '-') \|\| (NXT(1+i) == '_'))) {
				2463	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
				2464	else loc[i] = NXT(1+i);
				2465	i++;
				2466	}
				2467
				2468	return(xmlDictLookup(ctxt->dict, loc, i));
				2469	}
				2470
				2471
				2472	/**
				2473	* htmlParseName:
				2474	* @ctxt: an HTML parser context
				2475	*
				2476	* parse an HTML name, this routine is case sensitive.
				2477	*
				2478	* Returns the Name parsed or NULL
				2479	*/
				2480
				2481	static const xmlChar *
				2482	htmlParseName(htmlParserCtxtPtr ctxt) {
				2483	const xmlChar *in;
				2484	const xmlChar *ret;
				2485	int count = 0;
				2486
				2487	GROW;
				2488
				2489	/*
				2490	* Accelerator for simple ASCII names
				2491	*/
				2492	in = ctxt->input->cur;
				2493	if (((in >= 0x61) && (in <= 0x7A)) \|\|
				2494	((in >= 0x41) && (in <= 0x5A)) \|\|
				2495	(in == '_') \|\| (in == ':')) {
				2496	in++;
				2497	while (((in >= 0x61) && (in <= 0x7A)) \|\|
				2498	((in >= 0x41) && (in <= 0x5A)) \|\|
				2499	((in >= 0x30) && (in <= 0x39)) \|\|
				2500	(in == '_') \|\| (in == '-') \|\|
				2501	(in == ':') \|\| (in == '.'))
				2502	in++;
				2503
				2504	if (in == ctxt->input->end)
				2505	return(NULL);
				2506
				2507	if ((in > 0) && (in < 0x80)) {
				2508	count = in - ctxt->input->cur;
				2509	ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
				2510	ctxt->input->cur = in;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2511	ctxt->input->col += count;
				2512	return(ret);
				2513	}
				2514	}
				2515	return(htmlParseNameComplex(ctxt));
				2516	}
				2517
				2518	static const xmlChar *
				2519	htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
				2520	int len = 0, l;
				2521	int c;
				2522	int count = 0;
				2523	const xmlChar *base = ctxt->input->base;
				2524
				2525	/*
				2526	* Handler for more complex cases
				2527	*/
				2528	GROW;
				2529	c = CUR_CHAR(l);
				2530	if ((c == ' ') \|\| (c == '>') \|\| (c == '/') \|\| /* accelerators */
				2531	(!IS_LETTER(c) && (c != '_') &&
				2532	(c != ':'))) {
				2533	return(NULL);
				2534	}
				2535
				2536	while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
				2537	((IS_LETTER(c)) \|\| (IS_DIGIT(c)) \|\|
				2538	(c == '.') \|\| (c == '-') \|\|
				2539	(c == '_') \|\| (c == ':') \|\|
				2540	(IS_COMBINING(c)) \|\|
				2541	(IS_EXTENDER(c)))) {
				2542	if (count++ > 100) {
				2543	count = 0;
				2544	GROW;
				2545	}
				2546	len += l;
				2547	NEXTL(l);
				2548	c = CUR_CHAR(l);
				2549	if (ctxt->input->base != base) {
				2550	/*
				2551	* We changed encoding from an unknown encoding
				2552	* Input buffer changed location, so we better start again
				2553	*/
				2554	return(htmlParseNameComplex(ctxt));
				2555	}
				2556	}
				2557
				2558	if (ctxt->input->cur - ctxt->input->base < len) {
				2559	/* Sanity check */
				2560	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				2561	"unexpected change of input buffer", NULL, NULL);
				2562	return (NULL);
				2563	}
				2564
				2565	return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
				2566	}
				2567
				2568
				2569	/**
				2570	* htmlParseHTMLAttribute:
				2571	* @ctxt: an HTML parser context
				2572	* @stop: a char stop value
				2573	*
				2574	* parse an HTML attribute value till the stop (quote), if
				2575	* stop is 0 then it stops at the first space
				2576	*
				2577	* Returns the attribute parsed or NULL
				2578	*/
				2579
				2580	static xmlChar *
				2581	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				2582	xmlChar *buffer = NULL;
				2583	int buffer_size = 0;
				2584	xmlChar *out = NULL;
				2585	const xmlChar *name = NULL;
				2586	const xmlChar *cur = NULL;
				2587	const htmlEntityDesc * ent;
				2588
				2589	/*
				2590	* allocate a translation buffer.
				2591	*/
				2592	buffer_size = HTML_PARSER_BUFFER_SIZE;
				2593	buffer = (xmlChar ) xmlMallocAtomic(buffer_size sizeof(xmlChar));
				2594	if (buffer == NULL) {
				2595	htmlErrMemory(ctxt, "buffer allocation failed\n");
				2596	return(NULL);
				2597	}
				2598	out = buffer;
				2599
				2600	/*
				2601	* Ok loop until we reach one of the ending chars
				2602	*/
				2603	while ((CUR != 0) && (CUR != stop)) {
				2604	if ((stop == 0) && (CUR == '>')) break;
				2605	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
				2606	if (CUR == '&') {
				2607	if (NXT(1) == '#') {
				2608	unsigned int c;
				2609	int bits;
				2610
				2611	c = htmlParseCharRef(ctxt);
				2612	if (c < 0x80)
				2613	{ *out++ = c; bits= -6; }
				2614	else if (c < 0x800)
				2615	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2616	else if (c < 0x10000)
				2617	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2618	else
				2619	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2620
				2621	for ( ; bits >= 0; bits-= 6) {
				2622	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2623	}
				2624
				2625	if (out - buffer > buffer_size - 100) {
				2626	int indx = out - buffer;
				2627
				2628	growBuffer(buffer);
				2629	out = &buffer[indx];
				2630	}
				2631	} else {
				2632	ent = htmlParseEntityRef(ctxt, &name);
				2633	if (name == NULL) {
				2634	*out++ = '&';
				2635	if (out - buffer > buffer_size - 100) {
				2636	int indx = out - buffer;
				2637
				2638	growBuffer(buffer);
				2639	out = &buffer[indx];
				2640	}
				2641	} else if (ent == NULL) {
				2642	*out++ = '&';
				2643	cur = name;
				2644	while (*cur != 0) {
				2645	if (out - buffer > buffer_size - 100) {
				2646	int indx = out - buffer;
				2647
				2648	growBuffer(buffer);
				2649	out = &buffer[indx];
				2650	}
				2651	out++ = cur++;
				2652	}
				2653	} else {
				2654	unsigned int c;
				2655	int bits;
				2656
				2657	if (out - buffer > buffer_size - 100) {
				2658	int indx = out - buffer;
				2659
				2660	growBuffer(buffer);
				2661	out = &buffer[indx];
				2662	}
				2663	c = ent->value;
				2664	if (c < 0x80)
				2665	{ *out++ = c; bits= -6; }
				2666	else if (c < 0x800)
				2667	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2668	else if (c < 0x10000)
				2669	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2670	else
				2671	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2672
				2673	for ( ; bits >= 0; bits-= 6) {
				2674	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2675	}
				2676	}
				2677	}
				2678	} else {
				2679	unsigned int c;
				2680	int bits, l;
				2681
				2682	if (out - buffer > buffer_size - 100) {
				2683	int indx = out - buffer;
				2684
				2685	growBuffer(buffer);
				2686	out = &buffer[indx];
				2687	}
				2688	c = CUR_CHAR(l);
				2689	if (c < 0x80)
				2690	{ *out++ = c; bits= -6; }
				2691	else if (c < 0x800)
				2692	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2693	else if (c < 0x10000)
				2694	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2695	else
				2696	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2697
				2698	for ( ; bits >= 0; bits-= 6) {
				2699	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2700	}
				2701	NEXT;
				2702	}
				2703	}
				2704	*out = 0;
				2705	return(buffer);
				2706	}
				2707
				2708	/**
				2709	* htmlParseEntityRef:
				2710	* @ctxt: an HTML parser context
				2711	* @str: location to store the entity name
				2712	*
				2713	* parse an HTML ENTITY references
				2714	*
				2715	* [68] EntityRef ::= '&' Name ';'
				2716	*
				2717	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2718	* if non-NULL *str will have to be freed by the caller.
				2719	*/
				2720	const htmlEntityDesc *
				2721	htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
				2722	const xmlChar *name;
				2723	const htmlEntityDesc * ent = NULL;
				2724
				2725	if (str != NULL) *str = NULL;
				2726	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) return(NULL);
				2727
				2728	if (CUR == '&') {
				2729	NEXT;
				2730	name = htmlParseName(ctxt);
				2731	if (name == NULL) {
				2732	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				2733	"htmlParseEntityRef: no name\n", NULL, NULL);
				2734	} else {
				2735	GROW;
				2736	if (CUR == ';') {
				2737	if (str != NULL)
				2738	*str = name;
				2739
				2740	/*
				2741	* Lookup the entity in the table.
				2742	*/
				2743	ent = htmlEntityLookup(name);
				2744	if (ent != NULL) /* OK that's ugly !!! */
				2745	NEXT;
				2746	} else {
				2747	htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
				2748	"htmlParseEntityRef: expecting ';'\n",
				2749	NULL, NULL);
				2750	if (str != NULL)
				2751	*str = name;
				2752	}
				2753	}
				2754	}
				2755	return(ent);
				2756	}
				2757
				2758	/**
				2759	* htmlParseAttValue:
				2760	* @ctxt: an HTML parser context
				2761	*
				2762	* parse a value for an attribute
				2763	* Note: the parser won't do substitution of entities here, this
				2764	* will be handled later in xmlStringGetNodeList, unless it was
				2765	* asked for ctxt->replaceEntities != 0
				2766	*
				2767	* Returns the AttValue parsed or NULL.
				2768	*/
				2769
				2770	static xmlChar *
				2771	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2772	xmlChar *ret = NULL;
				2773
				2774	if (CUR == '"') {
				2775	NEXT;
				2776	ret = htmlParseHTMLAttribute(ctxt, '"');
				2777	if (CUR != '"') {
				2778	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
				2779	"AttValue: \" expected\n", NULL, NULL);
				2780	} else
				2781	NEXT;
				2782	} else if (CUR == '\'') {
				2783	NEXT;
				2784	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2785	if (CUR != '\'') {
				2786	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
				2787	"AttValue: ' expected\n", NULL, NULL);
				2788	} else
				2789	NEXT;
				2790	} else {
				2791	/*
				2792	* That's an HTMLism, the attribute value may not be quoted
				2793	*/
				2794	ret = htmlParseHTMLAttribute(ctxt, 0);
				2795	if (ret == NULL) {
				2796	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
				2797	"AttValue: no value found\n", NULL, NULL);
				2798	}
				2799	}
				2800	return(ret);
				2801	}
				2802
				2803	/**
				2804	* htmlParseSystemLiteral:
				2805	* @ctxt: an HTML parser context
				2806	*
				2807	* parse an HTML Literal
				2808	*
				2809	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2810	*
				2811	* Returns the SystemLiteral parsed or NULL
				2812	*/
				2813
				2814	static xmlChar *
				2815	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2816	size_t len = 0, startPosition = 0;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2817	int err = 0;
				2818	int quote;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2819	xmlChar *ret = NULL;
				2820
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2821	if ((CUR != '"') && (CUR != '\'')) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2822	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2823	"SystemLiteral \" or ' expected\n", NULL, NULL);
				2824	return(NULL);
				2825	}
				2826	quote = CUR;
				2827	NEXT;
				2828
				2829	if (CUR_PTR < BASE_PTR)
				2830	return(ret);
				2831	startPosition = CUR_PTR - BASE_PTR;
				2832
				2833	while ((CUR != 0) && (CUR != quote)) {
				2834	/* TODO: Handle UTF-8 */
				2835	if (!IS_CHAR_CH(CUR)) {
				2836	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				2837	"Invalid char in SystemLiteral 0x%X\n", CUR);
				2838	err = 1;
				2839	}
				2840	NEXT;
				2841	len++;
				2842	}
				2843	if (CUR != quote) {
				2844	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
				2845	"Unfinished SystemLiteral\n", NULL, NULL);
				2846	} else {
				2847	NEXT;
				2848	if (err == 0)
				2849	ret = xmlStrndup((BASE_PTR+startPosition), len);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2850	}
				2851
				2852	return(ret);
				2853	}
				2854
				2855	/**
				2856	* htmlParsePubidLiteral:
				2857	* @ctxt: an HTML parser context
				2858	*
				2859	* parse an HTML public literal
				2860	*
				2861	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2862	*
				2863	* Returns the PubidLiteral parsed or NULL.
				2864	*/
				2865
				2866	static xmlChar *
				2867	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2868	size_t len = 0, startPosition = 0;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2869	int err = 0;
				2870	int quote;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2871	xmlChar *ret = NULL;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2872
				2873	if ((CUR != '"') && (CUR != '\'')) {
				2874	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
				2875	"PubidLiteral \" or ' expected\n", NULL, NULL);
				2876	return(NULL);
				2877	}
				2878	quote = CUR;
				2879	NEXT;
				2880
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2881	/*
				2882	* Name ::= (Letter \| '_') (NameChar)*
				2883	*/
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2884	if (CUR_PTR < BASE_PTR)
				2885	return(ret);
				2886	startPosition = CUR_PTR - BASE_PTR;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2887
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2888	while ((CUR != 0) && (CUR != quote)) {
				2889	if (!IS_PUBIDCHAR_CH(CUR)) {
				2890	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				2891	"Invalid char in PubidLiteral 0x%X\n", CUR);
				2892	err = 1;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2893	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2894	len++;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2895	NEXT;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2896	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2897
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2898	if (CUR != '"') {
				2899	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
				2900	"Unfinished PubidLiteral\n", NULL, NULL);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2901	} else {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2902	NEXT;
				2903	if (err == 0)
				2904	ret = xmlStrndup((BASE_PTR + startPosition), len);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2905	}
				2906
				2907	return(ret);
				2908	}
				2909
				2910	/**
				2911	* htmlParseScript:
				2912	* @ctxt: an HTML parser context
				2913	*
				2914	* parse the content of an HTML SCRIPT or STYLE element
				2915	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2916	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2917	* http://www.w3.org/TR/html4/types.html#type-script
				2918	* http://www.w3.org/TR/html4/types.html#h-6.15
				2919	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2920	*
				2921	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2922	* element and the value of intrinsic event attributes. User agents must
				2923	* not evaluate script data as HTML markup but instead must pass it on as
				2924	* data to a script engine.
				2925	* NOTES:
				2926	* - The content is passed like CDATA
				2927	* - the attributes for style and scripting "onXXX" are also described
				2928	* as CDATA but SGML allows entities references in attributes so their
				2929	* processing is identical as other attributes
				2930	*/
				2931	static void
				2932	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2933	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2934	int nbchar = 0;
				2935	int cur,l;
				2936
				2937	SHRINK;
				2938	cur = CUR_CHAR(l);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2939	while (cur != 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2940	if ((cur == '<') && (NXT(1) == '/')) {
				2941	/*
				2942	* One should break here, the specification is clear:
				2943	* Authors should therefore escape "</" within the content.
				2944	* Escape mechanisms are specific to each scripting or
				2945	* style sheet language.
				2946	*
				2947	* In recovery mode, only break if end tag match the
				2948	* current tag, effectively ignoring all tags inside the
				2949	* script/style block and treating the entire block as
				2950	* CDATA.
				2951	*/
				2952	if (ctxt->recovery) {
				2953	if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
				2954	xmlStrlen(ctxt->name)) == 0)
				2955	{
				2956	break; /* while */
				2957	} else {
				2958	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
				2959	"Element %s embeds close tag\n",
				2960	ctxt->name, NULL);
				2961	}
				2962	} else {
				2963	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2964	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2965	{
				2966	break; /* while */
				2967	}
				2968	}
				2969	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2970	if (IS_CHAR(cur)) {
				2971	COPY_BUF(l,buf,nbchar,cur);
				2972	} else {
				2973	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				2974	"Invalid char in CDATA 0x%X\n", cur);
				2975	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2976	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2977	buf[nbchar] = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2978	if (ctxt->sax->cdataBlock!= NULL) {
				2979	/*
				2980	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2981	*/
				2982	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2983	} else if (ctxt->sax->characters != NULL) {
				2984	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2985	}
				2986	nbchar = 0;
				2987	}
				2988	GROW;
				2989	NEXTL(l);
				2990	cur = CUR_CHAR(l);
				2991	}
				2992
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2993	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2994	buf[nbchar] = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2995	if (ctxt->sax->cdataBlock!= NULL) {
				2996	/*
				2997	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2998	*/
				2999	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				3000	} else if (ctxt->sax->characters != NULL) {
				3001	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3002	}
				3003	}
				3004	}
				3005
				3006
				3007	/**
				3008	* htmlParseCharDataInternal:
				3009	* @ctxt: an HTML parser context
				3010	* @readahead: optional read ahead character in ascii range
				3011	*
				3012	* parse a CharData section.
				3013	* if we are within a CDATA section ']]>' marks an end of section.
				3014	*
				3015	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				3016	*/
				3017
				3018	static void
				3019	htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
				3020	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
				3021	int nbchar = 0;
				3022	int cur, l;
				3023	int chunk = 0;
				3024
				3025	if (readahead)
				3026	buf[nbchar++] = readahead;
				3027
				3028	SHRINK;
				3029	cur = CUR_CHAR(l);
				3030	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				3031	((cur != '&') \|\| (ctxt->token == '&')) &&
				3032	(cur != 0)) {
				3033	if (!(IS_CHAR(cur))) {
				3034	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				3035	"Invalid char in CDATA 0x%X\n", cur);
				3036	} else {
				3037	COPY_BUF(l,buf,nbchar,cur);
				3038	}
				3039	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3040	buf[nbchar] = 0;
				3041
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3042	/*
				3043	* Ok the segment is to be consumed as chars.
				3044	*/
				3045	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				3046	if (areBlanks(ctxt, buf, nbchar)) {
				3047	if (ctxt->keepBlanks) {
				3048	if (ctxt->sax->characters != NULL)
				3049	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3050	} else {
				3051	if (ctxt->sax->ignorableWhitespace != NULL)
				3052	ctxt->sax->ignorableWhitespace(ctxt->userData,
				3053	buf, nbchar);
				3054	}
				3055	} else {
				3056	htmlCheckParagraph(ctxt);
				3057	if (ctxt->sax->characters != NULL)
				3058	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3059	}
				3060	}
				3061	nbchar = 0;
				3062	}
				3063	NEXTL(l);
				3064	chunk++;
				3065	if (chunk > HTML_PARSER_BUFFER_SIZE) {
				3066	chunk = 0;
				3067	SHRINK;
				3068	GROW;
				3069	}
				3070	cur = CUR_CHAR(l);
				3071	if (cur == 0) {
				3072	SHRINK;
				3073	GROW;
				3074	cur = CUR_CHAR(l);
				3075	}
				3076	}
				3077	if (nbchar != 0) {
				3078	buf[nbchar] = 0;
				3079
				3080	/*
				3081	* Ok the segment is to be consumed as chars.
				3082	*/
				3083	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				3084	if (areBlanks(ctxt, buf, nbchar)) {
				3085	if (ctxt->keepBlanks) {
				3086	if (ctxt->sax->characters != NULL)
				3087	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3088	} else {
				3089	if (ctxt->sax->ignorableWhitespace != NULL)
				3090	ctxt->sax->ignorableWhitespace(ctxt->userData,
				3091	buf, nbchar);
				3092	}
				3093	} else {
				3094	htmlCheckParagraph(ctxt);
				3095	if (ctxt->sax->characters != NULL)
				3096	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3097	}
				3098	}
				3099	} else {
				3100	/*
				3101	* Loop detection
				3102	*/
				3103	if (cur == 0)
				3104	ctxt->instate = XML_PARSER_EOF;
				3105	}
				3106	}
				3107
				3108	/**
				3109	* htmlParseCharData:
				3110	* @ctxt: an HTML parser context
				3111	*
				3112	* parse a CharData section.
				3113	* if we are within a CDATA section ']]>' marks an end of section.
				3114	*
				3115	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				3116	*/
				3117
				3118	static void
				3119	htmlParseCharData(htmlParserCtxtPtr ctxt) {
				3120	htmlParseCharDataInternal(ctxt, 0);
				3121	}
				3122
				3123	/**
				3124	* htmlParseExternalID:
				3125	* @ctxt: an HTML parser context
				3126	* @publicID: a xmlChar** receiving PubidLiteral
				3127	*
				3128	* Parse an External ID or a Public ID
				3129	*
				3130	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				3131	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				3132	*
				3133	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				3134	*
				3135	* Returns the function returns SystemLiteral and in the second
				3136	* case publicID receives PubidLiteral, is strict is off
				3137	* it is possible to return NULL and have publicID set.
				3138	*/
				3139
				3140	static xmlChar *
				3141	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
				3142	xmlChar *URI = NULL;
				3143
				3144	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				3145	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				3146	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				3147	SKIP(6);
				3148	if (!IS_BLANK_CH(CUR)) {
				3149	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
				3150	"Space required after 'SYSTEM'\n", NULL, NULL);
				3151	}
				3152	SKIP_BLANKS;
				3153	URI = htmlParseSystemLiteral(ctxt);
				3154	if (URI == NULL) {
				3155	htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
				3156	"htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
				3157	}
				3158	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				3159	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				3160	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				3161	SKIP(6);
				3162	if (!IS_BLANK_CH(CUR)) {
				3163	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
				3164	"Space required after 'PUBLIC'\n", NULL, NULL);
				3165	}
				3166	SKIP_BLANKS;
				3167	*publicID = htmlParsePubidLiteral(ctxt);
				3168	if (*publicID == NULL) {
				3169	htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
				3170	"htmlParseExternalID: PUBLIC, no Public Identifier\n",
				3171	NULL, NULL);
				3172	}
				3173	SKIP_BLANKS;
				3174	if ((CUR == '"') \|\| (CUR == '\'')) {
				3175	URI = htmlParseSystemLiteral(ctxt);
				3176	}
				3177	}
				3178	return(URI);
				3179	}
				3180
				3181	/**
				3182	* xmlParsePI:
				3183	* @ctxt: an XML parser context
				3184	*
				3185	* parse an XML Processing Instruction.
				3186	*
				3187	* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
				3188	*/
				3189	static void
				3190	htmlParsePI(htmlParserCtxtPtr ctxt) {
				3191	xmlChar *buf = NULL;
				3192	int len = 0;
				3193	int size = HTML_PARSER_BUFFER_SIZE;
				3194	int cur, l;
				3195	const xmlChar *target;
				3196	xmlParserInputState state;
				3197	int count = 0;
				3198
				3199	if ((RAW == '<') && (NXT(1) == '?')) {
				3200	state = ctxt->instate;
				3201	ctxt->instate = XML_PARSER_PI;
				3202	/*
				3203	* this is a Processing Instruction.
				3204	*/
				3205	SKIP(2);
				3206	SHRINK;
				3207
				3208	/*
				3209	* Parse the target name and check for special support like
				3210	* namespace.
				3211	*/
				3212	target = htmlParseName(ctxt);
				3213	if (target != NULL) {
				3214	if (RAW == '>') {
				3215	SKIP(1);
				3216
				3217	/*
				3218	* SAX: PI detected.
				3219	*/
				3220	if ((ctxt->sax) && (!ctxt->disableSAX) &&
				3221	(ctxt->sax->processingInstruction != NULL))
				3222	ctxt->sax->processingInstruction(ctxt->userData,
				3223	target, NULL);
				3224	ctxt->instate = state;
				3225	return;
				3226	}
				3227	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
				3228	if (buf == NULL) {
				3229	htmlErrMemory(ctxt, NULL);
				3230	ctxt->instate = state;
				3231	return;
				3232	}
				3233	cur = CUR;
				3234	if (!IS_BLANK(cur)) {
				3235	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
				3236	"ParsePI: PI %s space expected\n", target, NULL);
				3237	}
				3238	SKIP_BLANKS;
				3239	cur = CUR_CHAR(l);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3240	while ((cur != 0) && (cur != '>')) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3241	if (len + 5 >= size) {
				3242	xmlChar *tmp;
				3243
				3244	size *= 2;
				3245	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				3246	if (tmp == NULL) {
				3247	htmlErrMemory(ctxt, NULL);
				3248	xmlFree(buf);
				3249	ctxt->instate = state;
				3250	return;
				3251	}
				3252	buf = tmp;
				3253	}
				3254	count++;
				3255	if (count > 50) {
				3256	GROW;
				3257	count = 0;
				3258	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3259	if (IS_CHAR(cur)) {
				3260	COPY_BUF(l,buf,len,cur);
				3261	} else {
				3262	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				3263	"Invalid char in processing instruction "
				3264	"0x%X\n", cur);
				3265	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3266	NEXTL(l);
				3267	cur = CUR_CHAR(l);
				3268	if (cur == 0) {
				3269	SHRINK;
				3270	GROW;
				3271	cur = CUR_CHAR(l);
				3272	}
				3273	}
				3274	buf[len] = 0;
				3275	if (cur != '>') {
				3276	htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
				3277	"ParsePI: PI %s never end ...\n", target, NULL);
				3278	} else {
				3279	SKIP(1);
				3280
				3281	/*
				3282	* SAX: PI detected.
				3283	*/
				3284	if ((ctxt->sax) && (!ctxt->disableSAX) &&
				3285	(ctxt->sax->processingInstruction != NULL))
				3286	ctxt->sax->processingInstruction(ctxt->userData,
				3287	target, buf);
				3288	}
				3289	xmlFree(buf);
				3290	} else {
				3291	htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
				3292	"PI is not started correctly", NULL, NULL);
				3293	}
				3294	ctxt->instate = state;
				3295	}
				3296	}
				3297
				3298	/**
				3299	* htmlParseComment:
				3300	* @ctxt: an HTML parser context
				3301	*
				3302	* Parse an XML (SGML) comment <!-- .... -->
				3303	*
				3304	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				3305	*/
				3306	static void
				3307	htmlParseComment(htmlParserCtxtPtr ctxt) {
				3308	xmlChar *buf = NULL;
				3309	int len;
				3310	int size = HTML_PARSER_BUFFER_SIZE;
				3311	int q, ql;
				3312	int r, rl;
				3313	int cur, l;
Haibo Huang	d75f389	2021-01-05 21:34:50 -0800	[diff] [blame]	3314	int next, nl;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3315	xmlParserInputState state;
				3316
				3317	/*
				3318	* Check that there is a comment right here.
				3319	*/
				3320	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				3321	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				3322
				3323	state = ctxt->instate;
				3324	ctxt->instate = XML_PARSER_COMMENT;
				3325	SHRINK;
				3326	SKIP(4);
				3327	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
				3328	if (buf == NULL) {
				3329	htmlErrMemory(ctxt, "buffer allocation failed\n");
				3330	ctxt->instate = state;
				3331	return;
				3332	}
				3333	len = 0;
				3334	buf[len] = 0;
				3335	q = CUR_CHAR(ql);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3336	if (q == 0)
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3337	goto unfinished;
				3338	NEXTL(ql);
				3339	r = CUR_CHAR(rl);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3340	if (r == 0)
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3341	goto unfinished;
				3342	NEXTL(rl);
				3343	cur = CUR_CHAR(l);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3344	while ((cur != 0) &&
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3345	((cur != '>') \|\|
				3346	(r != '-') \|\| (q != '-'))) {
Haibo Huang	d75f389	2021-01-05 21:34:50 -0800	[diff] [blame]	3347	NEXTL(l);
				3348	next = CUR_CHAR(nl);
				3349	if (next == 0) {
				3350	SHRINK;
				3351	GROW;
				3352	next = CUR_CHAR(nl);
				3353	}
				3354
				3355	if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
				3356	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
				3357	"Comment incorrectly closed by '--!>'", NULL, NULL);
				3358	cur = '>';
				3359	break;
				3360	}
				3361
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3362	if (len + 5 >= size) {
				3363	xmlChar *tmp;
				3364
				3365	size *= 2;
				3366	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				3367	if (tmp == NULL) {
				3368	xmlFree(buf);
				3369	htmlErrMemory(ctxt, "growing buffer failed\n");
				3370	ctxt->instate = state;
				3371	return;
				3372	}
				3373	buf = tmp;
				3374	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3375	if (IS_CHAR(q)) {
				3376	COPY_BUF(ql,buf,len,q);
				3377	} else {
				3378	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				3379	"Invalid char in comment 0x%X\n", q);
				3380	}
Haibo Huang	d75f389	2021-01-05 21:34:50 -0800	[diff] [blame]	3381
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3382	q = r;
				3383	ql = rl;
				3384	r = cur;
				3385	rl = l;
Haibo Huang	d75f389	2021-01-05 21:34:50 -0800	[diff] [blame]	3386	cur = next;
				3387	l = nl;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3388	}
				3389	buf[len] = 0;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3390	if (cur == '>') {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3391	NEXT;
				3392	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				3393	(!ctxt->disableSAX))
				3394	ctxt->sax->comment(ctxt->userData, buf);
				3395	xmlFree(buf);
				3396	ctxt->instate = state;
				3397	return;
				3398	}
				3399
				3400	unfinished:
				3401	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
				3402	"Comment not terminated \n<!--%.50s\n", buf, NULL);
				3403	xmlFree(buf);
				3404	}
				3405
				3406	/**
				3407	* htmlParseCharRef:
				3408	* @ctxt: an HTML parser context
				3409	*
				3410	* parse Reference declarations
				3411	*
				3412	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				3413	* '&#x' [0-9a-fA-F]+ ';'
				3414	*
				3415	* Returns the value parsed (as an int)
				3416	*/
				3417	int
				3418	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				3419	int val = 0;
				3420
				3421	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				3422	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				3423	"htmlParseCharRef: context error\n",
				3424	NULL, NULL);
				3425	return(0);
				3426	}
				3427	if ((CUR == '&') && (NXT(1) == '#') &&
				3428	((NXT(2) == 'x') \|\| NXT(2) == 'X')) {
				3429	SKIP(3);
				3430	while (CUR != ';') {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3431	if ((CUR >= '0') && (CUR <= '9')) {
				3432	if (val < 0x110000)
				3433	val = val * 16 + (CUR - '0');
				3434	} else if ((CUR >= 'a') && (CUR <= 'f')) {
				3435	if (val < 0x110000)
				3436	val = val * 16 + (CUR - 'a') + 10;
				3437	} else if ((CUR >= 'A') && (CUR <= 'F')) {
				3438	if (val < 0x110000)
				3439	val = val * 16 + (CUR - 'A') + 10;
				3440	} else {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3441	htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
				3442	"htmlParseCharRef: missing semicolon\n",
				3443	NULL, NULL);
				3444	break;
				3445	}
				3446	NEXT;
				3447	}
				3448	if (CUR == ';')
				3449	NEXT;
				3450	} else if ((CUR == '&') && (NXT(1) == '#')) {
				3451	SKIP(2);
				3452	while (CUR != ';') {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3453	if ((CUR >= '0') && (CUR <= '9')) {
				3454	if (val < 0x110000)
				3455	val = val * 10 + (CUR - '0');
				3456	} else {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3457	htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
				3458	"htmlParseCharRef: missing semicolon\n",
				3459	NULL, NULL);
				3460	break;
				3461	}
				3462	NEXT;
				3463	}
				3464	if (CUR == ';')
				3465	NEXT;
				3466	} else {
				3467	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
				3468	"htmlParseCharRef: invalid value\n", NULL, NULL);
				3469	}
				3470	/*
				3471	* Check the value IS_CHAR ...
				3472	*/
				3473	if (IS_CHAR(val)) {
				3474	return(val);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3475	} else if (val >= 0x110000) {
				3476	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
				3477	"htmlParseCharRef: value too large\n", NULL, NULL);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3478	} else {
				3479	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				3480	"htmlParseCharRef: invalid xmlChar value %d\n",
				3481	val);
				3482	}
				3483	return(0);
				3484	}
				3485
				3486
				3487	/**
				3488	* htmlParseDocTypeDecl:
				3489	* @ctxt: an HTML parser context
				3490	*
				3491	* parse a DOCTYPE declaration
				3492	*
				3493	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				3494	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				3495	*/
				3496
				3497	static void
				3498	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				3499	const xmlChar *name;
				3500	xmlChar *ExternalID = NULL;
				3501	xmlChar *URI = NULL;
				3502
				3503	/*
				3504	* We know that '<!DOCTYPE' has been detected.
				3505	*/
				3506	SKIP(9);
				3507
				3508	SKIP_BLANKS;
				3509
				3510	/*
				3511	* Parse the DOCTYPE name.
				3512	*/
				3513	name = htmlParseName(ctxt);
				3514	if (name == NULL) {
				3515	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				3516	"htmlParseDocTypeDecl : no DOCTYPE name !\n",
				3517	NULL, NULL);
				3518	}
				3519	/*
				3520	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				3521	*/
				3522
				3523	SKIP_BLANKS;
				3524
				3525	/*
				3526	* Check for SystemID and ExternalID
				3527	*/
				3528	URI = htmlParseExternalID(ctxt, &ExternalID);
				3529	SKIP_BLANKS;
				3530
				3531	/*
				3532	* We should be at the end of the DOCTYPE declaration.
				3533	*/
				3534	if (CUR != '>') {
				3535	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
				3536	"DOCTYPE improperly terminated\n", NULL, NULL);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3537	/* Ignore bogus content */
				3538	while ((CUR != 0) && (CUR != '>'))
				3539	NEXT;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3540	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3541	if (CUR == '>')
				3542	NEXT;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3543
				3544	/*
				3545	* Create or update the document accordingly to the DOCTYPE
				3546	*/
				3547	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				3548	(!ctxt->disableSAX))
				3549	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				3550
				3551	/*
				3552	* Cleanup, since we don't use all those identifiers
				3553	*/
				3554	if (URI != NULL) xmlFree(URI);
				3555	if (ExternalID != NULL) xmlFree(ExternalID);
				3556	}
				3557
				3558	/**
				3559	* htmlParseAttribute:
				3560	* @ctxt: an HTML parser context
				3561	* @value: a xmlChar ** used to store the value of the attribute
				3562	*
				3563	* parse an attribute
				3564	*
				3565	* [41] Attribute ::= Name Eq AttValue
				3566	*
				3567	* [25] Eq ::= S? '=' S?
				3568	*
				3569	* With namespace:
				3570	*
				3571	* [NS 11] Attribute ::= QName Eq AttValue
				3572	*
				3573	* Also the case QName == xmlns:??? is handled independently as a namespace
				3574	* definition.
				3575	*
				3576	* Returns the attribute name, and the value in *value.
				3577	*/
				3578
				3579	static const xmlChar *
				3580	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				3581	const xmlChar *name;
				3582	xmlChar *val = NULL;
				3583
				3584	*value = NULL;
				3585	name = htmlParseHTMLName(ctxt);
				3586	if (name == NULL) {
				3587	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				3588	"error parsing attribute name\n", NULL, NULL);
				3589	return(NULL);
				3590	}
				3591
				3592	/*
				3593	* read the value
				3594	*/
				3595	SKIP_BLANKS;
				3596	if (CUR == '=') {
				3597	NEXT;
				3598	SKIP_BLANKS;
				3599	val = htmlParseAttValue(ctxt);
				3600	}
				3601
				3602	*value = val;
				3603	return(name);
				3604	}
				3605
				3606	/**
				3607	* htmlCheckEncodingDirect:
				3608	* @ctxt: an HTML parser context
				3609	* @attvalue: the attribute value
				3610	*
				3611	* Checks an attribute value to detect
				3612	* the encoding
				3613	* If a new encoding is detected the parser is switched to decode
				3614	* it and pass UTF8
				3615	*/
				3616	static void
				3617	htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
				3618
				3619	if ((ctxt == NULL) \|\| (encoding == NULL) \|\|
				3620	(ctxt->options & HTML_PARSE_IGNORE_ENC))
				3621	return;
				3622
				3623	/* do not change encoding */
				3624	if (ctxt->input->encoding != NULL)
				3625	return;
				3626
				3627	if (encoding != NULL) {
				3628	xmlCharEncoding enc;
				3629	xmlCharEncodingHandlerPtr handler;
				3630
				3631	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				3632
				3633	if (ctxt->input->encoding != NULL)
				3634	xmlFree((xmlChar *) ctxt->input->encoding);
				3635	ctxt->input->encoding = xmlStrdup(encoding);
				3636
				3637	enc = xmlParseCharEncoding((const char *) encoding);
				3638	/*
				3639	* registered set of known encodings
				3640	*/
				3641	if (enc != XML_CHAR_ENCODING_ERROR) {
				3642	if (((enc == XML_CHAR_ENCODING_UTF16LE) \|\|
				3643	(enc == XML_CHAR_ENCODING_UTF16BE) \|\|
				3644	(enc == XML_CHAR_ENCODING_UCS4LE) \|\|
				3645	(enc == XML_CHAR_ENCODING_UCS4BE)) &&
				3646	(ctxt->input->buf != NULL) &&
				3647	(ctxt->input->buf->encoder == NULL)) {
				3648	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				3649	"htmlCheckEncoding: wrong encoding meta\n",
				3650	NULL, NULL);
				3651	} else {
				3652	xmlSwitchEncoding(ctxt, enc);
				3653	}
				3654	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				3655	} else {
				3656	/*
				3657	* fallback for unknown encodings
				3658	*/
				3659	handler = xmlFindCharEncodingHandler((const char *) encoding);
				3660	if (handler != NULL) {
				3661	xmlSwitchToEncoding(ctxt, handler);
				3662	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				3663	} else {
				3664	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
				3665	"htmlCheckEncoding: unknown encoding %s\n",
				3666	encoding, NULL);
				3667	}
				3668	}
				3669
				3670	if ((ctxt->input->buf != NULL) &&
				3671	(ctxt->input->buf->encoder != NULL) &&
				3672	(ctxt->input->buf->raw != NULL) &&
				3673	(ctxt->input->buf->buffer != NULL)) {
				3674	int nbchars;
				3675	int processed;
				3676
				3677	/*
				3678	* convert as much as possible to the parser reading buffer.
				3679	*/
				3680	processed = ctxt->input->cur - ctxt->input->base;
				3681	xmlBufShrink(ctxt->input->buf->buffer, processed);
				3682	nbchars = xmlCharEncInput(ctxt->input->buf, 1);
				3683	xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
				3684	if (nbchars < 0) {
				3685	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				3686	"htmlCheckEncoding: encoder error\n",
				3687	NULL, NULL);
				3688	}
				3689	}
				3690	}
				3691	}
				3692
				3693	/**
				3694	* htmlCheckEncoding:
				3695	* @ctxt: an HTML parser context
				3696	* @attvalue: the attribute value
				3697	*
				3698	* Checks an http-equiv attribute from a Meta tag to detect
				3699	* the encoding
				3700	* If a new encoding is detected the parser is switched to decode
				3701	* it and pass UTF8
				3702	*/
				3703	static void
				3704	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				3705	const xmlChar *encoding;
				3706
				3707	if (!attvalue)
				3708	return;
				3709
				3710	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
				3711	if (encoding != NULL) {
				3712	encoding += 7;
				3713	}
				3714	/*
				3715	* skip blank
				3716	*/
				3717	if (encoding && IS_BLANK_CH(*encoding))
				3718	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
				3719	if (encoding && *encoding == '=') {
				3720	encoding ++;
				3721	htmlCheckEncodingDirect(ctxt, encoding);
				3722	}
				3723	}
				3724
				3725	/**
				3726	* htmlCheckMeta:
				3727	* @ctxt: an HTML parser context
				3728	* @atts: the attributes values
				3729	*
				3730	* Checks an attributes from a Meta tag
				3731	*/
				3732	static void
				3733	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				3734	int i;
				3735	const xmlChar att, value;
				3736	int http = 0;
				3737	const xmlChar *content = NULL;
				3738
				3739	if ((ctxt == NULL) \|\| (atts == NULL))
				3740	return;
				3741
				3742	i = 0;
				3743	att = atts[i++];
				3744	while (att != NULL) {
				3745	value = atts[i++];
				3746	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				3747	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				3748	http = 1;
				3749	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
				3750	htmlCheckEncodingDirect(ctxt, value);
				3751	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				3752	content = value;
				3753	att = atts[i++];
				3754	}
				3755	if ((http) && (content != NULL))
				3756	htmlCheckEncoding(ctxt, content);
				3757
				3758	}
				3759
				3760	/**
				3761	* htmlParseStartTag:
				3762	* @ctxt: an HTML parser context
				3763	*
				3764	* parse a start of tag either for rule element or
				3765	* EmptyElement. In both case we don't parse the tag closing chars.
				3766	*
				3767	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				3768	*
				3769	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				3770	*
				3771	* With namespace:
				3772	*
				3773	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				3774	*
				3775	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				3776	*
				3777	* Returns 0 in case of success, -1 in case of error and 1 if discarded
				3778	*/
				3779
				3780	static int
				3781	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				3782	const xmlChar *name;
				3783	const xmlChar *attname;
				3784	xmlChar *attvalue;
				3785	const xmlChar **atts;
				3786	int nbatts = 0;
				3787	int maxatts;
				3788	int meta = 0;
				3789	int i;
				3790	int discardtag = 0;
				3791
				3792	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				3793	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				3794	"htmlParseStartTag: context error\n", NULL, NULL);
				3795	return -1;
				3796	}
				3797	if (ctxt->instate == XML_PARSER_EOF)
				3798	return(-1);
				3799	if (CUR != '<') return -1;
				3800	NEXT;
				3801
				3802	atts = ctxt->atts;
				3803	maxatts = ctxt->maxatts;
				3804
				3805	GROW;
				3806	name = htmlParseHTMLName(ctxt);
				3807	if (name == NULL) {
				3808	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				3809	"htmlParseStartTag: invalid element name\n",
				3810	NULL, NULL);
				3811	/* if recover preserve text on classic misconstructs */
				3812	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) \|\| (CUR == '<') \|\|
				3813	(CUR == '=') \|\| (CUR == '>') \|\| (((CUR >= '0') && (CUR <= '9'))))) {
				3814	htmlParseCharDataInternal(ctxt, '<');
				3815	return(-1);
				3816	}
				3817
				3818
				3819	/* Dump the bogus tag like browsers do */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3820	while ((CUR != 0) && (CUR != '>') &&
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3821	(ctxt->instate != XML_PARSER_EOF))
				3822	NEXT;
				3823	return -1;
				3824	}
				3825	if (xmlStrEqual(name, BAD_CAST"meta"))
				3826	meta = 1;
				3827
				3828	/*
				3829	* Check for auto-closure of HTML elements.
				3830	*/
				3831	htmlAutoClose(ctxt, name);
				3832
				3833	/*
				3834	* Check for implied HTML elements.
				3835	*/
				3836	htmlCheckImplied(ctxt, name);
				3837
				3838	/*
				3839	* Avoid html at any level > 0, head at any level != 1
				3840	* or any attempt to recurse body
				3841	*/
				3842	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				3843	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				3844	"htmlParseStartTag: misplaced <html> tag\n",
				3845	name, NULL);
				3846	discardtag = 1;
				3847	ctxt->depth++;
				3848	}
				3849	if ((ctxt->nameNr != 1) &&
				3850	(xmlStrEqual(name, BAD_CAST"head"))) {
				3851	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				3852	"htmlParseStartTag: misplaced <head> tag\n",
				3853	name, NULL);
				3854	discardtag = 1;
				3855	ctxt->depth++;
				3856	}
				3857	if (xmlStrEqual(name, BAD_CAST"body")) {
				3858	int indx;
				3859	for (indx = 0;indx < ctxt->nameNr;indx++) {
				3860	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
				3861	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				3862	"htmlParseStartTag: misplaced <body> tag\n",
				3863	name, NULL);
				3864	discardtag = 1;
				3865	ctxt->depth++;
				3866	}
				3867	}
				3868	}
				3869
				3870	/*
				3871	* Now parse the attributes, it ends up with the ending
				3872	*
				3873	* (S Attribute)* S?
				3874	*/
				3875	SKIP_BLANKS;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3876	while ((CUR != 0) &&
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3877	(CUR != '>') &&
				3878	((CUR != '/') \|\| (NXT(1) != '>'))) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3879	GROW;
				3880	attname = htmlParseAttribute(ctxt, &attvalue);
				3881	if (attname != NULL) {
				3882
				3883	/*
				3884	* Well formedness requires at most one declaration of an attribute
				3885	*/
				3886	for (i = 0; i < nbatts;i += 2) {
				3887	if (xmlStrEqual(atts[i], attname)) {
				3888	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
				3889	"Attribute %s redefined\n", attname, NULL);
				3890	if (attvalue != NULL)
				3891	xmlFree(attvalue);
				3892	goto failed;
				3893	}
				3894	}
				3895
				3896	/*
				3897	* Add the pair to atts
				3898	*/
				3899	if (atts == NULL) {
				3900	maxatts = 22; /* allow for 10 attrs by default */
				3901	atts = (const xmlChar **)
				3902	xmlMalloc(maxatts * sizeof(xmlChar *));
				3903	if (atts == NULL) {
				3904	htmlErrMemory(ctxt, NULL);
				3905	if (attvalue != NULL)
				3906	xmlFree(attvalue);
				3907	goto failed;
				3908	}
				3909	ctxt->atts = atts;
				3910	ctxt->maxatts = maxatts;
				3911	} else if (nbatts + 4 > maxatts) {
				3912	const xmlChar **n;
				3913
				3914	maxatts *= 2;
				3915	n = (const xmlChar *) xmlRealloc((void ) atts,
				3916	maxatts * sizeof(const xmlChar *));
				3917	if (n == NULL) {
				3918	htmlErrMemory(ctxt, NULL);
				3919	if (attvalue != NULL)
				3920	xmlFree(attvalue);
				3921	goto failed;
				3922	}
				3923	atts = n;
				3924	ctxt->atts = atts;
				3925	ctxt->maxatts = maxatts;
				3926	}
				3927	atts[nbatts++] = attname;
				3928	atts[nbatts++] = attvalue;
				3929	atts[nbatts] = NULL;
				3930	atts[nbatts + 1] = NULL;
				3931	}
				3932	else {
				3933	if (attvalue != NULL)
				3934	xmlFree(attvalue);
				3935	/* Dump the bogus attribute string up to the next blank or
				3936	* the end of the tag. */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3937	while ((CUR != 0) &&
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3938	!(IS_BLANK_CH(CUR)) && (CUR != '>') &&
				3939	((CUR != '/') \|\| (NXT(1) != '>')))
				3940	NEXT;
				3941	}
				3942
				3943	failed:
				3944	SKIP_BLANKS;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3945	}
				3946
				3947	/*
				3948	* Handle specific association to the META tag
				3949	*/
				3950	if (meta && (nbatts != 0))
				3951	htmlCheckMeta(ctxt, atts);
				3952
				3953	/*
				3954	* SAX: Start of Element !
				3955	*/
				3956	if (!discardtag) {
				3957	htmlnamePush(ctxt, name);
				3958	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
				3959	if (nbatts != 0)
				3960	ctxt->sax->startElement(ctxt->userData, name, atts);
				3961	else
				3962	ctxt->sax->startElement(ctxt->userData, name, NULL);
				3963	}
				3964	}
				3965
				3966	if (atts != NULL) {
				3967	for (i = 1;i < nbatts;i += 2) {
				3968	if (atts[i] != NULL)
				3969	xmlFree((xmlChar *) atts[i]);
				3970	}
				3971	}
				3972
				3973	return(discardtag);
				3974	}
				3975
				3976	/**
				3977	* htmlParseEndTag:
				3978	* @ctxt: an HTML parser context
				3979	*
				3980	* parse an end of tag
				3981	*
				3982	* [42] ETag ::= '</' Name S? '>'
				3983	*
				3984	* With namespace
				3985	*
				3986	* [NS 9] ETag ::= '</' QName S? '>'
				3987	*
				3988	* Returns 1 if the current level should be closed.
				3989	*/
				3990
				3991	static int
				3992	htmlParseEndTag(htmlParserCtxtPtr ctxt)
				3993	{
				3994	const xmlChar *name;
				3995	const xmlChar *oldname;
				3996	int i, ret;
				3997
				3998	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3999	htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
				4000	"htmlParseEndTag: '</' not found\n", NULL, NULL);
				4001	return (0);
				4002	}
				4003	SKIP(2);
				4004
				4005	name = htmlParseHTMLName(ctxt);
				4006	if (name == NULL)
				4007	return (0);
				4008	/*
				4009	* We should definitely be at the ending "S? '>'" part
				4010	*/
				4011	SKIP_BLANKS;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	4012	if (CUR != '>') {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4013	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
				4014	"End tag : expected '>'\n", NULL, NULL);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	4015	/* Skip to next '>' */
				4016	while ((CUR != 0) && (CUR != '>'))
				4017	NEXT;
				4018	}
				4019	if (CUR == '>')
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4020	NEXT;
				4021
				4022	/*
				4023	* if we ignored misplaced tags in htmlParseStartTag don't pop them
				4024	* out now.
				4025	*/
				4026	if ((ctxt->depth > 0) &&
				4027	(xmlStrEqual(name, BAD_CAST "html") \|\|
				4028	xmlStrEqual(name, BAD_CAST "body") \|\|
				4029	xmlStrEqual(name, BAD_CAST "head"))) {
				4030	ctxt->depth--;
				4031	return (0);
				4032	}
				4033
				4034	/*
				4035	* If the name read is not one of the element in the parsing stack
				4036	* then return, it's just an error.
				4037	*/
				4038	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
				4039	if (xmlStrEqual(name, ctxt->nameTab[i]))
				4040	break;
				4041	}
				4042	if (i < 0) {
				4043	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
				4044	"Unexpected end tag : %s\n", name, NULL);
				4045	return (0);
				4046	}
				4047
				4048
				4049	/*
				4050	* Check for auto-closure of HTML elements.
				4051	*/
				4052
				4053	htmlAutoCloseOnClose(ctxt, name);
				4054
				4055	/*
				4056	* Well formedness constraints, opening and closing must match.
				4057	* With the exception that the autoclose may have popped stuff out
				4058	* of the stack.
				4059	*/
				4060	if (!xmlStrEqual(name, ctxt->name)) {
				4061	if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
				4062	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
				4063	"Opening and ending tag mismatch: %s and %s\n",
				4064	name, ctxt->name);
				4065	}
				4066	}
				4067
				4068	/*
				4069	* SAX: End of Tag
				4070	*/
				4071	oldname = ctxt->name;
				4072	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				4073	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4074	ctxt->sax->endElement(ctxt->userData, name);
				4075	htmlNodeInfoPop(ctxt);
				4076	htmlnamePop(ctxt);
				4077	ret = 1;
				4078	} else {
				4079	ret = 0;
				4080	}
				4081
				4082	return (ret);
				4083	}
				4084
				4085
				4086	/**
				4087	* htmlParseReference:
				4088	* @ctxt: an HTML parser context
				4089	*
				4090	* parse and handle entity references in content,
				4091	* this will end-up in a call to character() since this is either a
				4092	* CharRef, or a predefined entity.
				4093	*/
				4094	static void
				4095	htmlParseReference(htmlParserCtxtPtr ctxt) {
				4096	const htmlEntityDesc * ent;
				4097	xmlChar out[6];
				4098	const xmlChar *name;
				4099	if (CUR != '&') return;
				4100
				4101	if (NXT(1) == '#') {
				4102	unsigned int c;
				4103	int bits, i = 0;
				4104
				4105	c = htmlParseCharRef(ctxt);
				4106	if (c == 0)
				4107	return;
				4108
				4109	if (c < 0x80) { out[i++]= c; bits= -6; }
				4110	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				4111	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				4112	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				4113
				4114	for ( ; bits >= 0; bits-= 6) {
				4115	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				4116	}
				4117	out[i] = 0;
				4118
				4119	htmlCheckParagraph(ctxt);
				4120	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4121	ctxt->sax->characters(ctxt->userData, out, i);
				4122	} else {
				4123	ent = htmlParseEntityRef(ctxt, &name);
				4124	if (name == NULL) {
				4125	htmlCheckParagraph(ctxt);
				4126	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4127	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				4128	return;
				4129	}
				4130	if ((ent == NULL) \|\| !(ent->value > 0)) {
				4131	htmlCheckParagraph(ctxt);
				4132	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				4133	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				4134	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				4135	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				4136	}
				4137	} else {
				4138	unsigned int c;
				4139	int bits, i = 0;
				4140
				4141	c = ent->value;
				4142	if (c < 0x80)
				4143	{ out[i++]= c; bits= -6; }
				4144	else if (c < 0x800)
				4145	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				4146	else if (c < 0x10000)
				4147	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				4148	else
				4149	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				4150
				4151	for ( ; bits >= 0; bits-= 6) {
				4152	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				4153	}
				4154	out[i] = 0;
				4155
				4156	htmlCheckParagraph(ctxt);
				4157	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4158	ctxt->sax->characters(ctxt->userData, out, i);
				4159	}
				4160	}
				4161	}
				4162
				4163	/**
				4164	* htmlParseContent:
				4165	* @ctxt: an HTML parser context
				4166	*
				4167	* Parse a content: comment, sub-element, reference or text.
				4168	* Kept for compatibility with old code
				4169	*/
				4170
				4171	static void
				4172	htmlParseContent(htmlParserCtxtPtr ctxt) {
				4173	xmlChar *currentNode;
				4174	int depth;
				4175	const xmlChar *name;
				4176
				4177	currentNode = xmlStrdup(ctxt->name);
				4178	depth = ctxt->nameNr;
				4179	while (1) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4180	GROW;
				4181
				4182	if (ctxt->instate == XML_PARSER_EOF)
				4183	break;
				4184
				4185	/*
				4186	* Our tag or one of it's parent or children is ending.
				4187	*/
				4188	if ((CUR == '<') && (NXT(1) == '/')) {
				4189	if (htmlParseEndTag(ctxt) &&
				4190	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				4191	if (currentNode != NULL)
				4192	xmlFree(currentNode);
				4193	return;
				4194	}
				4195	continue; /* while */
				4196	}
				4197
				4198	else if ((CUR == '<') &&
				4199	((IS_ASCII_LETTER(NXT(1))) \|\|
				4200	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
				4201	name = htmlParseHTMLName_nonInvasive(ctxt);
				4202	if (name == NULL) {
				4203	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				4204	"htmlParseStartTag: invalid element name\n",
				4205	NULL, NULL);
				4206	/* Dump the bogus tag like browsers do */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	4207	while ((CUR != 0) && (CUR != '>'))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4208	NEXT;
				4209
				4210	if (currentNode != NULL)
				4211	xmlFree(currentNode);
				4212	return;
				4213	}
				4214
				4215	if (ctxt->name != NULL) {
				4216	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
				4217	htmlAutoClose(ctxt, name);
				4218	continue;
				4219	}
				4220	}
				4221	}
				4222
				4223	/*
				4224	* Has this node been popped out during parsing of
				4225	* the next element
				4226	*/
				4227	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				4228	(!xmlStrEqual(currentNode, ctxt->name)))
				4229	{
				4230	if (currentNode != NULL) xmlFree(currentNode);
				4231	return;
				4232	}
				4233
				4234	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				4235	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
				4236	/*
				4237	* Handle SCRIPT/STYLE separately
				4238	*/
				4239	htmlParseScript(ctxt);
				4240	} else {
				4241	/*
				4242	* Sometimes DOCTYPE arrives in the middle of the document
				4243	*/
				4244	if ((CUR == '<') && (NXT(1) == '!') &&
				4245	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4246	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4247	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4248	(UPP(8) == 'E')) {
				4249	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				4250	"Misplaced DOCTYPE declaration\n",
				4251	BAD_CAST "DOCTYPE" , NULL);
				4252	htmlParseDocTypeDecl(ctxt);
				4253	}
				4254
				4255	/*
				4256	* First case : a comment
				4257	*/
				4258	if ((CUR == '<') && (NXT(1) == '!') &&
				4259	(NXT(2) == '-') && (NXT(3) == '-')) {
				4260	htmlParseComment(ctxt);
				4261	}
				4262
				4263	/*
				4264	* Second case : a Processing Instruction.
				4265	*/
				4266	else if ((CUR == '<') && (NXT(1) == '?')) {
				4267	htmlParsePI(ctxt);
				4268	}
				4269
				4270	/*
				4271	* Third case : a sub-element.
				4272	*/
				4273	else if (CUR == '<') {
				4274	htmlParseElement(ctxt);
				4275	}
				4276
				4277	/*
				4278	* Fourth case : a reference. If if has not been resolved,
				4279	* parsing returns it's Name, create the node
				4280	*/
				4281	else if (CUR == '&') {
				4282	htmlParseReference(ctxt);
				4283	}
				4284
				4285	/*
				4286	* Fifth case : end of the resource
				4287	*/
				4288	else if (CUR == 0) {
				4289	htmlAutoCloseOnEnd(ctxt);
				4290	break;
				4291	}
				4292
				4293	/*
				4294	* Last case, text. Note that References are handled directly.
				4295	*/
				4296	else {
				4297	htmlParseCharData(ctxt);
				4298	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4299	}
				4300	GROW;
				4301	}
				4302	if (currentNode != NULL) xmlFree(currentNode);
				4303	}
				4304
				4305	/**
				4306	* htmlParseElement:
				4307	* @ctxt: an HTML parser context
				4308	*
				4309	* parse an HTML element, this is highly recursive
				4310	* this is kept for compatibility with previous code versions
				4311	*
				4312	* [39] element ::= EmptyElemTag \| STag content ETag
				4313	*
				4314	* [41] Attribute ::= Name Eq AttValue
				4315	*/
				4316
				4317	void
				4318	htmlParseElement(htmlParserCtxtPtr ctxt) {
				4319	const xmlChar *name;
				4320	xmlChar *currentNode = NULL;
				4321	const htmlElemDesc * info;
				4322	htmlParserNodeInfo node_info;
				4323	int failed;
				4324	int depth;
				4325	const xmlChar *oldptr;
				4326
				4327	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				4328	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				4329	"htmlParseElement: context error\n", NULL, NULL);
				4330	return;
				4331	}
				4332
				4333	if (ctxt->instate == XML_PARSER_EOF)
				4334	return;
				4335
				4336	/* Capture start position */
				4337	if (ctxt->record_info) {
				4338	node_info.begin_pos = ctxt->input->consumed +
				4339	(CUR_PTR - ctxt->input->base);
				4340	node_info.begin_line = ctxt->input->line;
				4341	}
				4342
				4343	failed = htmlParseStartTag(ctxt);
				4344	name = ctxt->name;
				4345	if ((failed == -1) \|\| (name == NULL)) {
				4346	if (CUR == '>')
				4347	NEXT;
				4348	return;
				4349	}
				4350
				4351	/*
				4352	* Lookup the info for that element.
				4353	*/
				4354	info = htmlTagLookup(name);
				4355	if (info == NULL) {
				4356	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
				4357	"Tag %s invalid\n", name, NULL);
				4358	}
				4359
				4360	/*
				4361	* Check for an Empty Element labeled the XML/SGML way
				4362	*/
				4363	if ((CUR == '/') && (NXT(1) == '>')) {
				4364	SKIP(2);
				4365	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4366	ctxt->sax->endElement(ctxt->userData, name);
				4367	htmlnamePop(ctxt);
				4368	return;
				4369	}
				4370
				4371	if (CUR == '>') {
				4372	NEXT;
				4373	} else {
				4374	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
				4375	"Couldn't find end of Start Tag %s\n", name, NULL);
				4376
				4377	/*
				4378	* end of parsing of this node.
				4379	*/
				4380	if (xmlStrEqual(name, ctxt->name)) {
				4381	nodePop(ctxt);
				4382	htmlnamePop(ctxt);
				4383	}
				4384
				4385	/*
				4386	* Capture end position and add node
				4387	*/
				4388	if (ctxt->record_info) {
				4389	node_info.end_pos = ctxt->input->consumed +
				4390	(CUR_PTR - ctxt->input->base);
				4391	node_info.end_line = ctxt->input->line;
				4392	node_info.node = ctxt->node;
				4393	xmlParserAddNodeInfo(ctxt, &node_info);
				4394	}
				4395	return;
				4396	}
				4397
				4398	/*
				4399	* Check for an Empty Element from DTD definition
				4400	*/
				4401	if ((info != NULL) && (info->empty)) {
				4402	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4403	ctxt->sax->endElement(ctxt->userData, name);
				4404	htmlnamePop(ctxt);
				4405	return;
				4406	}
				4407
				4408	/*
				4409	* Parse the content of the element:
				4410	*/
				4411	currentNode = xmlStrdup(ctxt->name);
				4412	depth = ctxt->nameNr;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	4413	while (CUR != 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4414	oldptr = ctxt->input->cur;
				4415	htmlParseContent(ctxt);
				4416	if (oldptr==ctxt->input->cur) break;
				4417	if (ctxt->nameNr < depth) break;
				4418	}
				4419
				4420	/*
				4421	* Capture end position and add node
				4422	*/
				4423	if ( currentNode != NULL && ctxt->record_info ) {
				4424	node_info.end_pos = ctxt->input->consumed +
				4425	(CUR_PTR - ctxt->input->base);
				4426	node_info.end_line = ctxt->input->line;
				4427	node_info.node = ctxt->node;
				4428	xmlParserAddNodeInfo(ctxt, &node_info);
				4429	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	4430	if (CUR == 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4431	htmlAutoCloseOnEnd(ctxt);
				4432	}
				4433
				4434	if (currentNode != NULL)
				4435	xmlFree(currentNode);
				4436	}
				4437
				4438	static void
				4439	htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
				4440	/*
				4441	* Capture end position and add node
				4442	*/
				4443	if ( ctxt->node != NULL && ctxt->record_info ) {
				4444	ctxt->nodeInfo->end_pos = ctxt->input->consumed +
				4445	(CUR_PTR - ctxt->input->base);
				4446	ctxt->nodeInfo->end_line = ctxt->input->line;
				4447	ctxt->nodeInfo->node = ctxt->node;
				4448	xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
				4449	htmlNodeInfoPop(ctxt);
				4450	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	4451	if (CUR == 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4452	htmlAutoCloseOnEnd(ctxt);
				4453	}
				4454	}
				4455
				4456	/**
				4457	* htmlParseElementInternal:
				4458	* @ctxt: an HTML parser context
				4459	*
				4460	* parse an HTML element, new version, non recursive
				4461	*
				4462	* [39] element ::= EmptyElemTag \| STag content ETag
				4463	*
				4464	* [41] Attribute ::= Name Eq AttValue
				4465	*/
				4466
				4467	static void
				4468	htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
				4469	const xmlChar *name;
				4470	const htmlElemDesc * info;
				4471	htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
				4472	int failed;
				4473
				4474	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				4475	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				4476	"htmlParseElementInternal: context error\n", NULL, NULL);
				4477	return;
				4478	}
				4479
				4480	if (ctxt->instate == XML_PARSER_EOF)
				4481	return;
				4482
				4483	/* Capture start position */
				4484	if (ctxt->record_info) {
				4485	node_info.begin_pos = ctxt->input->consumed +
				4486	(CUR_PTR - ctxt->input->base);
				4487	node_info.begin_line = ctxt->input->line;
				4488	}
				4489
				4490	failed = htmlParseStartTag(ctxt);
				4491	name = ctxt->name;
				4492	if ((failed == -1) \|\| (name == NULL)) {
				4493	if (CUR == '>')
				4494	NEXT;
				4495	return;
				4496	}
				4497
				4498	/*
				4499	* Lookup the info for that element.
				4500	*/
				4501	info = htmlTagLookup(name);
				4502	if (info == NULL) {
				4503	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
				4504	"Tag %s invalid\n", name, NULL);
				4505	}
				4506
				4507	/*
				4508	* Check for an Empty Element labeled the XML/SGML way
				4509	*/
				4510	if ((CUR == '/') && (NXT(1) == '>')) {
				4511	SKIP(2);
				4512	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4513	ctxt->sax->endElement(ctxt->userData, name);
				4514	htmlnamePop(ctxt);
				4515	return;
				4516	}
				4517
				4518	if (CUR == '>') {
				4519	NEXT;
				4520	} else {
				4521	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
				4522	"Couldn't find end of Start Tag %s\n", name, NULL);
				4523
				4524	/*
				4525	* end of parsing of this node.
				4526	*/
				4527	if (xmlStrEqual(name, ctxt->name)) {
				4528	nodePop(ctxt);
				4529	htmlnamePop(ctxt);
				4530	}
				4531
				4532	if (ctxt->record_info)
				4533	htmlNodeInfoPush(ctxt, &node_info);
				4534	htmlParserFinishElementParsing(ctxt);
				4535	return;
				4536	}
				4537
				4538	/*
				4539	* Check for an Empty Element from DTD definition
				4540	*/
				4541	if ((info != NULL) && (info->empty)) {
				4542	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4543	ctxt->sax->endElement(ctxt->userData, name);
				4544	htmlnamePop(ctxt);
				4545	return;
				4546	}
				4547
				4548	if (ctxt->record_info)
				4549	htmlNodeInfoPush(ctxt, &node_info);
				4550	}
				4551
				4552	/**
				4553	* htmlParseContentInternal:
				4554	* @ctxt: an HTML parser context
				4555	*
				4556	* Parse a content: comment, sub-element, reference or text.
				4557	* New version for non recursive htmlParseElementInternal
				4558	*/
				4559
				4560	static void
				4561	htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
				4562	xmlChar *currentNode;
				4563	int depth;
				4564	const xmlChar *name;
				4565
				4566	currentNode = xmlStrdup(ctxt->name);
				4567	depth = ctxt->nameNr;
				4568	while (1) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4569	GROW;
				4570
				4571	if (ctxt->instate == XML_PARSER_EOF)
				4572	break;
				4573
				4574	/*
				4575	* Our tag or one of it's parent or children is ending.
				4576	*/
				4577	if ((CUR == '<') && (NXT(1) == '/')) {
				4578	if (htmlParseEndTag(ctxt) &&
				4579	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				4580	if (currentNode != NULL)
				4581	xmlFree(currentNode);
				4582
				4583	currentNode = xmlStrdup(ctxt->name);
				4584	depth = ctxt->nameNr;
				4585	}
				4586	continue; /* while */
				4587	}
				4588
				4589	else if ((CUR == '<') &&
				4590	((IS_ASCII_LETTER(NXT(1))) \|\|
				4591	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
				4592	name = htmlParseHTMLName_nonInvasive(ctxt);
				4593	if (name == NULL) {
				4594	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				4595	"htmlParseStartTag: invalid element name\n",
				4596	NULL, NULL);
				4597	/* Dump the bogus tag like browsers do */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	4598	while ((CUR == 0) && (CUR != '>'))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4599	NEXT;
				4600
				4601	htmlParserFinishElementParsing(ctxt);
				4602	if (currentNode != NULL)
				4603	xmlFree(currentNode);
				4604
				4605	currentNode = xmlStrdup(ctxt->name);
				4606	depth = ctxt->nameNr;
				4607	continue;
				4608	}
				4609
				4610	if (ctxt->name != NULL) {
				4611	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
				4612	htmlAutoClose(ctxt, name);
				4613	continue;
				4614	}
				4615	}
				4616	}
				4617
				4618	/*
				4619	* Has this node been popped out during parsing of
				4620	* the next element
				4621	*/
				4622	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				4623	(!xmlStrEqual(currentNode, ctxt->name)))
				4624	{
				4625	htmlParserFinishElementParsing(ctxt);
				4626	if (currentNode != NULL) xmlFree(currentNode);
				4627
				4628	currentNode = xmlStrdup(ctxt->name);
				4629	depth = ctxt->nameNr;
				4630	continue;
				4631	}
				4632
				4633	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				4634	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
				4635	/*
				4636	* Handle SCRIPT/STYLE separately
				4637	*/
				4638	htmlParseScript(ctxt);
				4639	} else {
				4640	/*
				4641	* Sometimes DOCTYPE arrives in the middle of the document
				4642	*/
				4643	if ((CUR == '<') && (NXT(1) == '!') &&
				4644	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4645	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4646	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4647	(UPP(8) == 'E')) {
				4648	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				4649	"Misplaced DOCTYPE declaration\n",
				4650	BAD_CAST "DOCTYPE" , NULL);
				4651	htmlParseDocTypeDecl(ctxt);
				4652	}
				4653
				4654	/*
				4655	* First case : a comment
				4656	*/
				4657	if ((CUR == '<') && (NXT(1) == '!') &&
				4658	(NXT(2) == '-') && (NXT(3) == '-')) {
				4659	htmlParseComment(ctxt);
				4660	}
				4661
				4662	/*
				4663	* Second case : a Processing Instruction.
				4664	*/
				4665	else if ((CUR == '<') && (NXT(1) == '?')) {
				4666	htmlParsePI(ctxt);
				4667	}
				4668
				4669	/*
				4670	* Third case : a sub-element.
				4671	*/
				4672	else if (CUR == '<') {
				4673	htmlParseElementInternal(ctxt);
				4674	if (currentNode != NULL) xmlFree(currentNode);
				4675
				4676	currentNode = xmlStrdup(ctxt->name);
				4677	depth = ctxt->nameNr;
				4678	}
				4679
				4680	/*
				4681	* Fourth case : a reference. If if has not been resolved,
				4682	* parsing returns it's Name, create the node
				4683	*/
				4684	else if (CUR == '&') {
				4685	htmlParseReference(ctxt);
				4686	}
				4687
				4688	/*
				4689	* Fifth case : end of the resource
				4690	*/
				4691	else if (CUR == 0) {
				4692	htmlAutoCloseOnEnd(ctxt);
				4693	break;
				4694	}
				4695
				4696	/*
				4697	* Last case, text. Note that References are handled directly.
				4698	*/
				4699	else {
				4700	htmlParseCharData(ctxt);
				4701	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4702	}
				4703	GROW;
				4704	}
				4705	if (currentNode != NULL) xmlFree(currentNode);
				4706	}
				4707
				4708	/**
				4709	* htmlParseContent:
				4710	* @ctxt: an HTML parser context
				4711	*
				4712	* Parse a content: comment, sub-element, reference or text.
				4713	* This is the entry point when called from parser.c
				4714	*/
				4715
				4716	void
				4717	__htmlParseContent(void *ctxt) {
				4718	if (ctxt != NULL)
				4719	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
				4720	}
				4721
				4722	/**
				4723	* htmlParseDocument:
				4724	* @ctxt: an HTML parser context
				4725	*
				4726	* parse an HTML document (and build a tree if using the standard SAX
				4727	* interface).
				4728	*
				4729	* Returns 0, -1 in case of error. the parser context is augmented
				4730	* as a result of the parsing.
				4731	*/
				4732
				4733	int
				4734	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				4735	xmlChar start[4];
				4736	xmlCharEncoding enc;
				4737	xmlDtdPtr dtd;
				4738
				4739	xmlInitParser();
				4740
				4741	htmlDefaultSAXHandlerInit();
				4742
				4743	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				4744	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				4745	"htmlParseDocument: context error\n", NULL, NULL);
				4746	return(XML_ERR_INTERNAL_ERROR);
				4747	}
				4748	ctxt->html = 1;
				4749	ctxt->linenumbers = 1;
				4750	GROW;
				4751	/*
				4752	* SAX: beginning of the document processing.
				4753	*/
				4754	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				4755	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				4756
				4757	if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
				4758	((ctxt->input->end - ctxt->input->cur) >= 4)) {
				4759	/*
				4760	* Get the 4 first bytes and decode the charset
				4761	* if enc != XML_CHAR_ENCODING_NONE
				4762	* plug some encoding conversion routines.
				4763	*/
				4764	start[0] = RAW;
				4765	start[1] = NXT(1);
				4766	start[2] = NXT(2);
				4767	start[3] = NXT(3);
				4768	enc = xmlDetectCharEncoding(&start[0], 4);
				4769	if (enc != XML_CHAR_ENCODING_NONE) {
				4770	xmlSwitchEncoding(ctxt, enc);
				4771	}
				4772	}
				4773
				4774	/*
				4775	* Wipe out everything which is before the first '<'
				4776	*/
				4777	SKIP_BLANKS;
				4778	if (CUR == 0) {
				4779	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
				4780	"Document is empty\n", NULL, NULL);
				4781	}
				4782
				4783	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				4784	ctxt->sax->startDocument(ctxt->userData);
				4785
				4786
				4787	/*
				4788	* Parse possible comments and PIs before any content
				4789	*/
				4790	while (((CUR == '<') && (NXT(1) == '!') &&
				4791	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
				4792	((CUR == '<') && (NXT(1) == '?'))) {
				4793	htmlParseComment(ctxt);
				4794	htmlParsePI(ctxt);
				4795	SKIP_BLANKS;
				4796	}
				4797
				4798
				4799	/*
				4800	* Then possibly doc type declaration(s) and more Misc
				4801	* (doctypedecl Misc*)?
				4802	*/
				4803	if ((CUR == '<') && (NXT(1) == '!') &&
				4804	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4805	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4806	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4807	(UPP(8) == 'E')) {
				4808	htmlParseDocTypeDecl(ctxt);
				4809	}
				4810	SKIP_BLANKS;
				4811
				4812	/*
				4813	* Parse possible comments and PIs before any content
				4814	*/
				4815	while (((CUR == '<') && (NXT(1) == '!') &&
				4816	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
				4817	((CUR == '<') && (NXT(1) == '?'))) {
				4818	htmlParseComment(ctxt);
				4819	htmlParsePI(ctxt);
				4820	SKIP_BLANKS;
				4821	}
				4822
				4823	/*
				4824	* Time to start parsing the tree itself
				4825	*/
				4826	htmlParseContentInternal(ctxt);
				4827
				4828	/*
				4829	* autoclose
				4830	*/
				4831	if (CUR == 0)
				4832	htmlAutoCloseOnEnd(ctxt);
				4833
				4834
				4835	/*
				4836	* SAX: end of the document processing.
				4837	*/
				4838	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4839	ctxt->sax->endDocument(ctxt->userData);
				4840
				4841	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
				4842	dtd = xmlGetIntSubset(ctxt->myDoc);
				4843	if (dtd == NULL)
				4844	ctxt->myDoc->intSubset =
				4845	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
				4846	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4847	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4848	}
				4849	if (! ctxt->wellFormed) return(-1);
				4850	return(0);
				4851	}
				4852
				4853
				4854	/************************************************************************
				4855	* *
				4856	* Parser contexts handling *
				4857	* *
				4858	************************************************************************/
				4859
				4860	/**
				4861	* htmlInitParserCtxt:
				4862	* @ctxt: an HTML parser context
				4863	*
				4864	* Initialize a parser context
				4865	*
				4866	* Returns 0 in case of success and -1 in case of error
				4867	*/
				4868
				4869	static int
				4870	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				4871	{
				4872	htmlSAXHandler *sax;
				4873
				4874	if (ctxt == NULL) return(-1);
				4875	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4876
				4877	ctxt->dict = xmlDictCreate();
				4878	if (ctxt->dict == NULL) {
				4879	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4880	return(-1);
				4881	}
				4882	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				4883	if (sax == NULL) {
				4884	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4885	return(-1);
				4886	}
				4887	else
				4888	memset(sax, 0, sizeof(htmlSAXHandler));
				4889
				4890	/* Allocate the Input stack */
				4891	ctxt->inputTab = (htmlParserInputPtr *)
				4892	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				4893	if (ctxt->inputTab == NULL) {
				4894	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4895	ctxt->inputNr = 0;
				4896	ctxt->inputMax = 0;
				4897	ctxt->input = NULL;
				4898	return(-1);
				4899	}
				4900	ctxt->inputNr = 0;
				4901	ctxt->inputMax = 5;
				4902	ctxt->input = NULL;
				4903	ctxt->version = NULL;
				4904	ctxt->encoding = NULL;
				4905	ctxt->standalone = -1;
				4906	ctxt->instate = XML_PARSER_START;
				4907
				4908	/* Allocate the Node stack */
				4909	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				4910	if (ctxt->nodeTab == NULL) {
				4911	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4912	ctxt->nodeNr = 0;
				4913	ctxt->nodeMax = 0;
				4914	ctxt->node = NULL;
				4915	ctxt->inputNr = 0;
				4916	ctxt->inputMax = 0;
				4917	ctxt->input = NULL;
				4918	return(-1);
				4919	}
				4920	ctxt->nodeNr = 0;
				4921	ctxt->nodeMax = 10;
				4922	ctxt->node = NULL;
				4923
				4924	/* Allocate the Name stack */
				4925	ctxt->nameTab = (const xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				4926	if (ctxt->nameTab == NULL) {
				4927	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4928	ctxt->nameNr = 0;
				4929	ctxt->nameMax = 0;
				4930	ctxt->name = NULL;
				4931	ctxt->nodeNr = 0;
				4932	ctxt->nodeMax = 0;
				4933	ctxt->node = NULL;
				4934	ctxt->inputNr = 0;
				4935	ctxt->inputMax = 0;
				4936	ctxt->input = NULL;
				4937	return(-1);
				4938	}
				4939	ctxt->nameNr = 0;
				4940	ctxt->nameMax = 10;
				4941	ctxt->name = NULL;
				4942
				4943	ctxt->nodeInfoTab = NULL;
				4944	ctxt->nodeInfoNr = 0;
				4945	ctxt->nodeInfoMax = 0;
				4946
				4947	if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
				4948	else {
				4949	ctxt->sax = sax;
				4950	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
				4951	}
				4952	ctxt->userData = ctxt;
				4953	ctxt->myDoc = NULL;
				4954	ctxt->wellFormed = 1;
				4955	ctxt->replaceEntities = 0;
				4956	ctxt->linenumbers = xmlLineNumbersDefaultValue;
				4957	ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
				4958	ctxt->html = 1;
				4959	ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
				4960	ctxt->vctxt.userData = ctxt;
				4961	ctxt->vctxt.error = xmlParserValidityError;
				4962	ctxt->vctxt.warning = xmlParserValidityWarning;
				4963	ctxt->record_info = 0;
				4964	ctxt->validate = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4965	ctxt->checkIndex = 0;
				4966	ctxt->catalogs = NULL;
				4967	xmlInitNodeInfoSeq(&ctxt->node_seq);
				4968	return(0);
				4969	}
				4970
				4971	/**
				4972	* htmlFreeParserCtxt:
				4973	* @ctxt: an HTML parser context
				4974	*
				4975	* Free all the memory used by a parser context. However the parsed
				4976	* document in ctxt->myDoc is not freed.
				4977	*/
				4978
				4979	void
				4980	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				4981	{
				4982	xmlFreeParserCtxt(ctxt);
				4983	}
				4984
				4985	/**
				4986	* htmlNewParserCtxt:
				4987	*
				4988	* Allocate and initialize a new parser context.
				4989	*
				4990	* Returns the htmlParserCtxtPtr or NULL in case of allocation error
				4991	*/
				4992
				4993	htmlParserCtxtPtr
				4994	htmlNewParserCtxt(void)
				4995	{
				4996	xmlParserCtxtPtr ctxt;
				4997
				4998	ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
				4999	if (ctxt == NULL) {
				5000	htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
				5001	return(NULL);
				5002	}
				5003	memset(ctxt, 0, sizeof(xmlParserCtxt));
				5004	if (htmlInitParserCtxt(ctxt) < 0) {
				5005	htmlFreeParserCtxt(ctxt);
				5006	return(NULL);
				5007	}
				5008	return(ctxt);
				5009	}
				5010
				5011	/**
				5012	* htmlCreateMemoryParserCtxt:
				5013	* @buffer: a pointer to a char array
				5014	* @size: the size of the array
				5015	*
				5016	* Create a parser context for an HTML in-memory document.
				5017	*
				5018	* Returns the new parser context or NULL
				5019	*/
				5020	htmlParserCtxtPtr
				5021	htmlCreateMemoryParserCtxt(const char *buffer, int size) {
				5022	xmlParserCtxtPtr ctxt;
				5023	xmlParserInputPtr input;
				5024	xmlParserInputBufferPtr buf;
				5025
				5026	if (buffer == NULL)
				5027	return(NULL);
				5028	if (size <= 0)
				5029	return(NULL);
				5030
				5031	ctxt = htmlNewParserCtxt();
				5032	if (ctxt == NULL)
				5033	return(NULL);
				5034
				5035	buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
				5036	if (buf == NULL) return(NULL);
				5037
				5038	input = xmlNewInputStream(ctxt);
				5039	if (input == NULL) {
				5040	xmlFreeParserCtxt(ctxt);
				5041	return(NULL);
				5042	}
				5043
				5044	input->filename = NULL;
				5045	input->buf = buf;
				5046	xmlBufResetInput(buf->buffer, input);
				5047
				5048	inputPush(ctxt, input);
				5049	return(ctxt);
				5050	}
				5051
				5052	/**
				5053	* htmlCreateDocParserCtxt:
				5054	* @cur: a pointer to an array of xmlChar
				5055	* @encoding: a free form C string describing the HTML document encoding, or NULL
				5056	*
				5057	* Create a parser context for an HTML document.
				5058	*
				5059	* TODO: check the need to add encoding handling there
				5060	*
				5061	* Returns the new parser context or NULL
				5062	*/
				5063	static htmlParserCtxtPtr
				5064	htmlCreateDocParserCtxt(const xmlChar cur, const char encoding) {
				5065	int len;
				5066	htmlParserCtxtPtr ctxt;
				5067
				5068	if (cur == NULL)
				5069	return(NULL);
				5070	len = xmlStrlen(cur);
				5071	ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
				5072	if (ctxt == NULL)
				5073	return(NULL);
				5074
				5075	if (encoding != NULL) {
				5076	xmlCharEncoding enc;
				5077	xmlCharEncodingHandlerPtr handler;
				5078
				5079	if (ctxt->input->encoding != NULL)
				5080	xmlFree((xmlChar *) ctxt->input->encoding);
				5081	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
				5082
				5083	enc = xmlParseCharEncoding(encoding);
				5084	/*
				5085	* registered set of known encodings
				5086	*/
				5087	if (enc != XML_CHAR_ENCODING_ERROR) {
				5088	xmlSwitchEncoding(ctxt, enc);
				5089	if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
				5090	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
				5091	"Unsupported encoding %s\n",
				5092	(const xmlChar *) encoding, NULL);
				5093	}
				5094	} else {
				5095	/*
				5096	* fallback for unknown encodings
				5097	*/
				5098	handler = xmlFindCharEncodingHandler((const char *) encoding);
				5099	if (handler != NULL) {
				5100	xmlSwitchToEncoding(ctxt, handler);
				5101	} else {
				5102	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
				5103	"Unsupported encoding %s\n",
				5104	(const xmlChar *) encoding, NULL);
				5105	}
				5106	}
				5107	}
				5108	return(ctxt);
				5109	}
				5110
				5111	#ifdef LIBXML_PUSH_ENABLED
				5112	/************************************************************************
				5113	* *
				5114	* Progressive parsing interfaces *
				5115	* *
				5116	************************************************************************/
				5117
				5118	/**
				5119	* htmlParseLookupSequence:
				5120	* @ctxt: an HTML parser context
				5121	* @first: the first char to lookup
				5122	* @next: the next char to lookup or zero
				5123	* @third: the next char to lookup or zero
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5124	* @ignoreattrval: skip over attribute values
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5125	*
				5126	* Try to find if a sequence (first, next, third) or just (first next) or
				5127	* (first) is available in the input stream.
				5128	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				5129	* to avoid rescanning sequences of bytes, it DOES change the state of the
				5130	* parser, do not use liberally.
				5131	* This is basically similar to xmlParseLookupSequence()
				5132	*
				5133	* Returns the index to the current parsing point if the full sequence
				5134	* is available, -1 otherwise.
				5135	*/
				5136	static int
				5137	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5138	xmlChar next, xmlChar third, int ignoreattrval)
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5139	{
				5140	int base, len;
				5141	htmlParserInputPtr in;
				5142	const xmlChar *buf;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5143	int invalue = 0;
				5144	char valdellim = 0x0;
				5145
				5146	in = ctxt->input;
				5147	if (in == NULL)
				5148	return (-1);
				5149
				5150	base = in->cur - in->base;
				5151	if (base < 0)
				5152	return (-1);
				5153
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5154	if (ctxt->checkIndex > base) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5155	base = ctxt->checkIndex;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5156	/* Abuse hasPErefs member to restore current state. */
				5157	invalue = ctxt->hasPErefs & 1 ? 1 : 0;
				5158	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5159
				5160	if (in->buf == NULL) {
				5161	buf = in->base;
				5162	len = in->length;
				5163	} else {
				5164	buf = xmlBufContent(in->buf->buffer);
				5165	len = xmlBufUse(in->buf->buffer);
				5166	}
				5167
				5168	/* take into account the sequence length */
				5169	if (third)
				5170	len -= 2;
				5171	else if (next)
				5172	len--;
				5173	for (; base < len; base++) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5174	if (ignoreattrval) {
				5175	if (buf[base] == '"' \|\| buf[base] == '\'') {
				5176	if (invalue) {
				5177	if (buf[base] == valdellim) {
				5178	invalue = 0;
				5179	continue;
				5180	}
				5181	} else {
				5182	valdellim = buf[base];
				5183	invalue = 1;
				5184	continue;
				5185	}
				5186	} else if (invalue) {
				5187	continue;
				5188	}
				5189	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5190	if (buf[base] == first) {
				5191	if (third != 0) {
				5192	if ((buf[base + 1] != next) \|\| (buf[base + 2] != third))
				5193	continue;
				5194	} else if (next != 0) {
				5195	if (buf[base + 1] != next)
				5196	continue;
				5197	}
				5198	ctxt->checkIndex = 0;
				5199	#ifdef DEBUG_PUSH
				5200	if (next == 0)
				5201	xmlGenericError(xmlGenericErrorContext,
				5202	"HPP: lookup '%c' found at %d\n",
				5203	first, base);
				5204	else if (third == 0)
				5205	xmlGenericError(xmlGenericErrorContext,
				5206	"HPP: lookup '%c%c' found at %d\n",
				5207	first, next, base);
				5208	else
				5209	xmlGenericError(xmlGenericErrorContext,
				5210	"HPP: lookup '%c%c%c' found at %d\n",
				5211	first, next, third, base);
				5212	#endif
				5213	return (base - (in->cur - in->base));
				5214	}
				5215	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5216	ctxt->checkIndex = base;
				5217	/* Abuse hasPErefs member to track current state. */
				5218	if (invalue)
				5219	ctxt->hasPErefs \|= 1;
				5220	else
				5221	ctxt->hasPErefs &= ~1;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5222	#ifdef DEBUG_PUSH
				5223	if (next == 0)
				5224	xmlGenericError(xmlGenericErrorContext,
				5225	"HPP: lookup '%c' failed\n", first);
				5226	else if (third == 0)
				5227	xmlGenericError(xmlGenericErrorContext,
				5228	"HPP: lookup '%c%c' failed\n", first, next);
				5229	else
				5230	xmlGenericError(xmlGenericErrorContext,
				5231	"HPP: lookup '%c%c%c' failed\n", first, next,
				5232	third);
				5233	#endif
				5234	return (-1);
				5235	}
				5236
				5237	/**
Haibo Huang	d75f389	2021-01-05 21:34:50 -0800	[diff] [blame]	5238	* htmlParseLookupCommentEnd:
				5239	* @ctxt: an HTML parser context
				5240	*
				5241	* Try to find a comment end tag in the input stream
				5242	* The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
				5243	* (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
				5244	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				5245	* to avoid rescanning sequences of bytes, it DOES change the state of the
				5246	* parser, do not use liberally.
				5247	* This wraps to htmlParseLookupSequence()
				5248	*
				5249	* Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
				5250	*/
				5251	static int
				5252	htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
				5253	{
				5254	int mark = 0;
				5255	int cur = CUR_PTR - BASE_PTR;
				5256
				5257	while (mark >= 0) {
				5258	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
				5259	if ((mark < 0) \|\|
				5260	(NXT(mark+2) == '>') \|\|
				5261	((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
				5262	return mark;
				5263	}
				5264	ctxt->checkIndex = cur + mark + 1;
				5265	}
				5266	return mark;
				5267	}
				5268
				5269
/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing
 *
 * Runs a state machine keyed on ctxt->instate over the bytes currently
 * buffered in ctxt->input. While @terminate is zero, most constructs are
 * only handed to their parse routine after a lookup has found the closing
 * delimiter in the buffer; otherwise the function jumps to the "done"
 * label and returns so the caller can supply more data. When @terminate
 * is non-zero those lookups are skipped.
 *
 * Returns zero if no parsing was possible
 * NOTE(review): the local "ret" is initialised to 0 and never modified in
 * this function, so the returned value is always 0.
 */
static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0;
    htmlParserInputPtr in;
    ptrdiff_t avail = 0;
    xmlChar cur, next;

    /*
     * Only begin_pos and begin_line are assigned in this function (in the
     * START_TAG state, when ctxt->record_info is set).
     */
    htmlParserNodeInfo node_info;

#ifdef DEBUG_PUSH
    /* Trace the state the parser is in on entry. */
    switch (ctxt->instate) {
        case XML_PARSER_EOF:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EOF\n"); break;
        case XML_PARSER_START:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START\n"); break;
        case XML_PARSER_MISC:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try MISC\n");break;
        case XML_PARSER_COMMENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try COMMENT\n");break;
        case XML_PARSER_PROLOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PROLOG\n");break;
        case XML_PARSER_START_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START_TAG\n");break;
        case XML_PARSER_CONTENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CONTENT\n");break;
        case XML_PARSER_CDATA_SECTION:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CDATA_SECTION\n");break;
        case XML_PARSER_END_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try END_TAG\n");break;
        case XML_PARSER_ENTITY_DECL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_DECL\n");break;
        case XML_PARSER_ENTITY_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_VALUE\n");break;
        case XML_PARSER_ATTRIBUTE_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ATTRIBUTE_VALUE\n");break;
        case XML_PARSER_DTD:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try DTD\n");break;
        case XML_PARSER_EPILOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EPILOG\n");break;
        case XML_PARSER_PI:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PI\n");break;
        case XML_PARSER_SYSTEM_LITERAL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try SYSTEM_LITERAL\n");break;
    }
#endif

    /* One iteration per state-machine step; left via break or goto done. */
    while (1) {

        in = ctxt->input;
        if (in == NULL) break;
        /*
         * avail = buffered length (in->length, or xmlBufUse() of the
         * backing buffer when there is one) minus the cursor offset.
         */
        if (in->buf == NULL)
            avail = in->length - (in->cur - in->base);
        else
            avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
                    (in->cur - in->base);
        /*
         * Nothing left and this is the last chunk: run
         * htmlAutoCloseOnEnd() and, when nameNr is 0 and the parser is
         * not already at EOF, switch to EOF and fire endDocument.
         * The same sequence is repeated after the "done" label.
         */
        if ((avail == 0) && (terminate)) {
            htmlAutoCloseOnEnd(ctxt);
            if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                /*
                 * SAX: end of the document processing.
                 */
                ctxt->instate = XML_PARSER_EOF;
                if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                    ctxt->sax->endDocument(ctxt->userData);
            }
        }
        if (avail < 1)
            goto done;
        /*
         * This is done to make progress and avoid an infinite loop
         * if a parsing attempt was aborted by hitting a NUL byte. After
         * changing htmlCurrentChar, this probably isn't necessary anymore.
         * We should consider removing this check.
         */
        cur = in->cur[0];
        if (cur == 0) {
            SKIP(1);
            continue;
        }

        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                goto done;
            case XML_PARSER_START:
                /*
                 * Very first chars read from the document flow.
                 */
                cur = in->cur[0];
                if (IS_BLANK_CH(cur)) {
                    SKIP_BLANKS;
                    /* the cursor moved: recompute the unread byte count */
                    if (in->buf == NULL)
                        avail = in->length - (in->cur - in->base);
                    else
                        avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
                                (in->cur - in->base);
                }
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                                                  &xmlDefaultSAXLocator);
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                cur = in->cur[0];
                next = in->cur[1];
                /*
                 * "<!DOCTYPE" test; UPP() is a macro defined elsewhere in
                 * the file (presumably an upper-cased look-ahead - confirm).
                 */
                if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    /* wait until the closing '>' is buffered */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else {
                    ctxt->instate = XML_PARSER_MISC;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering MISC\n");
#endif
                }
                break;
            case XML_PARSER_MISC:
                /*
                 * Before any DOCTYPE: accepts comments, PIs and a DOCTYPE;
                 * anything else switches to CONTENT.
                 */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
                            (in->cur - in->base);
                /*
                 * no chars in buffer
                 */
                if (avail < 1)
                    goto done;
                /*
                 * not enough chars in buffer
                 */
                if (avail < 2) {
                    if (!terminate)
                        goto done;
                    else
                        next = ' ';
                } else {
                    next = in->cur[1];
                }
                cur = in->cur[0];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    /* "<!--": wait for the comment end unless terminating */
                    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '?')) {
                    /* "<?": wait for a '>' unless terminating */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 9)) {
                    /*
                     * "<!" with fewer than 9 bytes buffered ("<!DOCTYPE"
                     * is 9 bytes long): wait for more input.
                     */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                    /*
                     * NOTE(review): the trace below says START_TAG but the
                     * state assigned above is CONTENT.
                     */
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_PROLOG:
                /*
                 * After the DOCTYPE: accepts comments and PIs; anything
                 * else switches to CONTENT.
                 */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
                            (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    /* "<!" with fewer than 4 bytes ("<!--"): wait */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                    /*
                     * NOTE(review): the trace below says START_TAG but the
                     * state assigned above is CONTENT.
                     */
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_EPILOG:
                /*
                 * Entered from END_TAG when nameNr is 0. Accepts blanks,
                 * comments and PIs; any other content is an error.
                 */
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
                            (in->cur - in->base);
                if (avail < 1)
                    goto done;
                cur = in->cur[0];
                if (IS_BLANK_CH(cur)) {
                    /* blanks go to htmlParseCharData(), then return */
                    htmlParseCharData(ctxt);
                    goto done;
                }
                if (avail < 2)
                    goto done;
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    /*
                     * Record XML_ERR_DOCUMENT_END, mark the document as
                     * not well formed, go to EOF and fire endDocument.
                     */
                    ctxt->errNo = XML_ERR_DOCUMENT_END;
                    ctxt->wellFormed = 0;
                    ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering EOF\n");
#endif
                    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                        ctxt->sax->endDocument(ctxt->userData);
                    goto done;
                }
                break;
            case XML_PARSER_START_TAG: {
                const xmlChar *name;
                int failed;
                const htmlElemDesc * info;

                /*
                 * no chars in buffer
                 */
                if (avail < 1)
                    goto done;
                /*
                 * not enough chars in buffer
                 */
                if (avail < 2) {
                    if (!terminate)
                        goto done;
                    else
                        next = ' ';
                } else {
                    next = in->cur[1];
                }
                cur = in->cur[0];
                /* not at a tag after all: treat as content */
                if (cur != '<') {
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }
                /* "</": this is an end tag */
                if (next == '/') {
                    ctxt->instate = XML_PARSER_END_TAG;
                    ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering END_TAG\n");
#endif
                    break;
                }
                /*
                 * Wait until a '>' outside quoted attribute values is
                 * buffered (last argument 1 = ignore attribute values).
                 */
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
                    goto done;

                /* Capture start position */
                if (ctxt->record_info) {
                    node_info.begin_pos = ctxt->input->consumed +
                                       (CUR_PTR - ctxt->input->base);
                    node_info.begin_line = ctxt->input->line;
                }


                failed = htmlParseStartTag(ctxt);
                name = ctxt->name;
                /*
                 * Start tag parsing failed or left no current element
                 * name: skip a pending '>' and retry from the same state.
                 */
                if ((failed == -1) ||
                    (name == NULL)) {
                    if (CUR == '>')
                        NEXT;
                    break;
                }

                /*
                 * Lookup the info for that element.
                 */
                info = htmlTagLookup(name);
                if (info == NULL) {
                    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
                                 "Tag %s invalid\n", name, NULL);
                }

                /*
                 * Check for an Empty Element labeled the XML/SGML way
                 */
                if ((CUR == '/') && (NXT(1) == '>')) {
                    SKIP(2);
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    htmlnamePop(ctxt);
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                if (CUR == '>') {
                    NEXT;
                } else {
                    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
                                 "Couldn't find end of Start Tag %s\n",
                                 name, NULL);

                    /*
                     * end of parsing of this node.
                     */
                    if (xmlStrEqual(name, ctxt->name)) {
                        nodePop(ctxt);
                        htmlnamePop(ctxt);
                    }

                    if (ctxt->record_info)
                        htmlNodeInfoPush(ctxt, &node_info);

                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                /*
                 * Check for an Empty Element from DTD definition
                 */
                if ((info != NULL) && (info->empty)) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    htmlnamePop(ctxt);
                }

                if (ctxt->record_info)
                    htmlNodeInfoPush(ctxt, &node_info);

                ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            }
            case XML_PARSER_CONTENT: {
                /* one-character, NUL-terminated scratch string for SAX */
                xmlChar chr[2] = { 0, 0 };

                /*
                 * Handle preparsed entities and charRef
                 */
                if (ctxt->token != 0) {
                    chr[0] = (xmlChar) ctxt->token;
                    htmlCheckParagraph(ctxt);
                    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                        ctxt->sax->characters(ctxt->userData, chr, 1);
                    ctxt->token = 0;
                    ctxt->checkIndex = 0;
                }
                /*
                 * Exactly one byte left on the last chunk: if it is not
                 * '<' or '&', emit it directly through SAX (blank handling
                 * depends on ctxt->keepBlanks) and consume it.
                 */
                if ((avail == 1) && (terminate)) {
                    cur = in->cur[0];
                    if ((cur != '<') && (cur != '&')) {
                        if (ctxt->sax != NULL) {
                            chr[0] = cur;
                            if (IS_BLANK_CH(cur)) {
                                if (ctxt->keepBlanks) {
                                    if (ctxt->sax->characters != NULL)
                                        ctxt->sax->characters(
                                                ctxt->userData, chr, 1);
                                } else {
                                    if (ctxt->sax->ignorableWhitespace != NULL)
                                        ctxt->sax->ignorableWhitespace(
                                                ctxt->userData, chr, 1);
                                }
                            } else {
                                htmlCheckParagraph(ctxt);
                                if (ctxt->sax->characters != NULL)
                                    ctxt->sax->characters(
                                            ctxt->userData, chr, 1);
                            }
                        }
                        ctxt->token = 0;
                        ctxt->checkIndex = 0;
                        in->cur++;
                        break;
                    }
                }
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
                    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
                    /*
                     * Handle SCRIPT/STYLE separately
                     */
                    if (!terminate) {
                        int idx;
                        xmlChar val;

                        /* need "</" plus the byte following it */
                        idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
                        if (idx < 0)
                            goto done;
                        val = in->cur[idx + 2];
                        if (val == 0) /* bad cut of input */
                            goto done;
                    }
                    htmlParseScript(ctxt);
                    /*
                     * cur/next still hold the bytes read before the call
                     * to htmlParseScript() above.
                     */
                    if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    }
                } else {
                    /*
                     * Sometimes DOCTYPE arrives in the middle of the document
                     */
                    if ((cur == '<') && (next == '!') &&
                        (UPP(2) == 'D') && (UPP(3) == 'O') &&
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                        (UPP(8) == 'E')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
                            goto done;
                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                                     "Misplaced DOCTYPE declaration\n",
                                     BAD_CAST "DOCTYPE" , NULL);
                        htmlParseDocTypeDecl(ctxt);
                    } else if ((cur == '<') && (next == '!') &&
                        (in->cur[2] == '-') && (in->cur[3] == '-')) {
                        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Comment\n");
#endif
                        htmlParseComment(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '?')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing PI\n");
#endif
                        htmlParsePI(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
                        /* "<!" with fewer than 4 bytes buffered: wait */
                        goto done;
                    } else if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    } else if (cur == '<') {
                        if ((!terminate) && (next == 0))
                            goto done;
                        /*
                         * Only switch to START_TAG if the next character
                         * starts a valid name. Otherwise, htmlParseStartTag
                         * might return without consuming all characters
                         * up to the final '>'.
                         */
                        if ((IS_ASCII_LETTER(next)) ||
                            (next == '_') || (next == ':') || (next == '.')) {
                            ctxt->instate = XML_PARSER_START_TAG;
                            ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                            xmlGenericError(xmlGenericErrorContext,
                                    "HPP: entering START_TAG\n");
#endif
                        } else {
                            /*
                             * Report the error, then emit the '<' as
                             * character data and step past it.
                             */
                            htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                                         "htmlParseTryOrFinish: "
                                         "invalid element name\n",
                                         NULL, NULL);
                            htmlCheckParagraph(ctxt);
                            if ((ctxt->sax != NULL) &&
                                (ctxt->sax->characters != NULL))
                                ctxt->sax->characters(ctxt->userData,
                                                      in->cur, 1);
                            NEXT;
                        }
                        break;
                    } else {
                        /*
                         * check that the text sequence is complete
                         * before handing out the data to the parser
                         * to avoid problems with erroneous end of
                         * data detection.
                         */
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
                            goto done;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing char data\n");
#endif
                        /*
                         * Consume references and character data until a
                         * '<', the end of the input or parser EOF.
                         */
                        while ((ctxt->instate != XML_PARSER_EOF) &&
                               (cur != '<') && (in->cur < in->end)) {
                            if (cur == '&') {
                                htmlParseReference(ctxt);
                            } else {
                                htmlParseCharData(ctxt);
                            }
                            cur = in->cur[0];
                        }
                    }
                }

                break;
            }
            case XML_PARSER_END_TAG:
                if (avail < 2)
                    goto done;
                /* wait until a '>' is buffered unless terminating */
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                    goto done;
                htmlParseEndTag(ctxt);
                /* nameNr == 0 after the end tag: move on to EPILOG */
                if (ctxt->nameNr == 0) {
                    ctxt->instate = XML_PARSER_EPILOG;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            /*
             * The remaining states are never assigned to ctxt->instate by
             * this function. Each reports an internal error, resets
             * checkIndex and falls back to CONTENT (START_TAG for
             * ATTRIBUTE_VALUE).
             */
            case XML_PARSER_CDATA_SECTION:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == CDATA\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_DTD:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == DTD\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_COMMENT:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == COMMENT\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PI:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == PI\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_DECL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ENTITY_DECL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_VALUE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ENTITY_VALUE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
                /*
                 * NOTE(review): the trace below says DTD but the state
                 * assigned above is CONTENT.
                 */
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering DTD\n");
#endif
                break;
            case XML_PARSER_ATTRIBUTE_VALUE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ATTRIBUTE_VALUE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_START_TAG;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering START_TAG\n");
#endif
                break;
            case XML_PARSER_SYSTEM_LITERAL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_IGNORE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == XML_PARSER_IGNORE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PUBLIC_LITERAL:
                /*
                 * NOTE(review): the message names XML_PARSER_LITERAL while
                 * the case label is XML_PARSER_PUBLIC_LITERAL.
                 */
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == XML_PARSER_LITERAL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;

        }
    }
done:
    /*
     * Same end-of-input handling as at the top of the loop: on the last
     * chunk with nothing left, auto-close and fire endDocument once.
     */
    if ((avail == 0) && (terminate)) {
        htmlAutoCloseOnEnd(ctxt);
        if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
            /*
             * SAX: end of the document processing.
             */
            ctxt->instate = XML_PARSER_EOF;
            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                ctxt->sax->endDocument(ctxt->userData);
        }
    }
    /*
     * Unless HTML_PARSE_NODEFDTD is set, give a document that has no
     * internal subset the HTML 4.0 Transitional one. This is done only
     * when terminating or once the parser is in EOF or EPILOG.
     */
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
        ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
         (ctxt->instate == XML_PARSER_EPILOG))) {
        xmlDtdPtr dtd;
        dtd = xmlGetIntSubset(ctxt->myDoc);
        if (dtd == NULL)
            ctxt->myDoc->intSubset =
                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
    }
#ifdef DEBUG_PUSH
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
#endif
    return(ret);
}
				6062
				6063	/**
				6064	* htmlParseChunk:
				6065	* @ctxt: an HTML parser context
				6066	* @chunk: an char array
				6067	* @size: the size in byte of the chunk
				6068	* @terminate: last chunk indicator
				6069	*
				6070	* Parse a Chunk of memory
				6071	*
				6072	* Returns zero if no error, the xmlParserErrors otherwise.
				6073	*/
				6074	int
				6075	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				6076	int terminate) {
				6077	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				6078	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				6079	"htmlParseChunk: context error\n", NULL, NULL);
				6080	return(XML_ERR_INTERNAL_ERROR);
				6081	}
				6082	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				6083	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				6084	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
				6085	size_t cur = ctxt->input->cur - ctxt->input->base;
				6086	int res;
				6087
				6088	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	6089	xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6090	if (res < 0) {
				6091	ctxt->errNo = XML_PARSER_EOF;
				6092	ctxt->disableSAX = 1;
				6093	return (XML_PARSER_EOF);
				6094	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6095	#ifdef DEBUG_PUSH
				6096	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				6097	#endif
				6098
				6099	#if 0
				6100	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				6101	htmlParseTryOrFinish(ctxt, terminate);
				6102	#endif
				6103	} else if (ctxt->instate != XML_PARSER_EOF) {
				6104	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
				6105	xmlParserInputBufferPtr in = ctxt->input->buf;
				6106	if ((in->encoder != NULL) && (in->buffer != NULL) &&
				6107	(in->raw != NULL)) {
				6108	int nbchars;
				6109	size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
				6110	size_t current = ctxt->input->cur - ctxt->input->base;
				6111
				6112	nbchars = xmlCharEncInput(in, terminate);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	6113	xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6114	if (nbchars < 0) {
				6115	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				6116	"encoder error\n", NULL, NULL);
				6117	return(XML_ERR_INVALID_ENCODING);
				6118	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6119	}
				6120	}
				6121	}
				6122	htmlParseTryOrFinish(ctxt, terminate);
				6123	if (terminate) {
				6124	if ((ctxt->instate != XML_PARSER_EOF) &&
				6125	(ctxt->instate != XML_PARSER_EPILOG) &&
				6126	(ctxt->instate != XML_PARSER_MISC)) {
				6127	ctxt->errNo = XML_ERR_DOCUMENT_END;
				6128	ctxt->wellFormed = 0;
				6129	}
				6130	if (ctxt->instate != XML_PARSER_EOF) {
				6131	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				6132	ctxt->sax->endDocument(ctxt->userData);
				6133	}
				6134	ctxt->instate = XML_PARSER_EOF;
				6135	}
				6136	return((xmlParserErrors) ctxt->errNo);
				6137	}
				6138
				6139	/************************************************************************
				6140	* *
				6141	* User entry points *
				6142	* *
				6143	************************************************************************/
				6144
				6145	/**
				6146	* htmlCreatePushParserCtxt:
				6147	* @sax: a SAX handler
				6148	* @user_data: The user data returned on SAX callbacks
				6149	* @chunk: a pointer to an array of chars
				6150	* @size: number of chars in the array
				6151	* @filename: an optional file name or URI
				6152	* @enc: an optional encoding
				6153	*
				6154	* Create a parser context for using the HTML parser in push mode
				6155	* The value of @filename is used for fetching external entities
				6156	* and error/warning reports.
				6157	*
				6158	* Returns the new parser context or NULL
				6159	*/
				6160	htmlParserCtxtPtr
				6161	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				6162	const char chunk, int size, const char filename,
				6163	xmlCharEncoding enc) {
				6164	htmlParserCtxtPtr ctxt;
				6165	htmlParserInputPtr inputStream;
				6166	xmlParserInputBufferPtr buf;
				6167
				6168	xmlInitParser();
				6169
				6170	buf = xmlAllocParserInputBuffer(enc);
				6171	if (buf == NULL) return(NULL);
				6172
				6173	ctxt = htmlNewParserCtxt();
				6174	if (ctxt == NULL) {
				6175	xmlFreeParserInputBuffer(buf);
				6176	return(NULL);
				6177	}
				6178	if(enc==XML_CHAR_ENCODING_UTF8 \|\| buf->encoder)
				6179	ctxt->charset=XML_CHAR_ENCODING_UTF8;
				6180	if (sax != NULL) {
				6181	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
				6182	xmlFree(ctxt->sax);
				6183	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				6184	if (ctxt->sax == NULL) {
				6185	xmlFree(buf);
				6186	xmlFree(ctxt);
				6187	return(NULL);
				6188	}
				6189	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				6190	if (user_data != NULL)
				6191	ctxt->userData = user_data;
				6192	}
				6193	if (filename == NULL) {
				6194	ctxt->directory = NULL;
				6195	} else {
				6196	ctxt->directory = xmlParserGetDirectory(filename);
				6197	}
				6198
				6199	inputStream = htmlNewInputStream(ctxt);
				6200	if (inputStream == NULL) {
				6201	xmlFreeParserCtxt(ctxt);
				6202	xmlFree(buf);
				6203	return(NULL);
				6204	}
				6205
				6206	if (filename == NULL)
				6207	inputStream->filename = NULL;
				6208	else
				6209	inputStream->filename = (char *)
				6210	xmlCanonicPath((const xmlChar *) filename);
				6211	inputStream->buf = buf;
				6212	xmlBufResetInput(buf->buffer, inputStream);
				6213
				6214	inputPush(ctxt, inputStream);
				6215
				6216	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				6217	(ctxt->input->buf != NULL)) {
				6218	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
				6219	size_t cur = ctxt->input->cur - ctxt->input->base;
				6220
				6221	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				6222
				6223	xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
				6224	#ifdef DEBUG_PUSH
				6225	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				6226	#endif
				6227	}
				6228	ctxt->progressive = 1;
				6229
				6230	return(ctxt);
				6231	}
				6232	#endif /* LIBXML_PUSH_ENABLED */
				6233
				6234	/**
				6235	* htmlSAXParseDoc:
				6236	* @cur: a pointer to an array of xmlChar
				6237	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6238	* @sax: the SAX handler block
				6239	* @userData: if using SAX, this pointer will be provided on callbacks.
				6240	*
				6241	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
				6242	* to handle parse events. If sax is NULL, fallback to the default DOM
				6243	* behavior and return a tree.
				6244	*
				6245	* Returns the resulting document tree unless SAX is NULL or the document is
				6246	* not well formed.
				6247	*/
				6248
				6249	htmlDocPtr
				6250	htmlSAXParseDoc(const xmlChar cur, const char encoding,
				6251	htmlSAXHandlerPtr sax, void *userData) {
				6252	htmlDocPtr ret;
				6253	htmlParserCtxtPtr ctxt;
				6254
				6255	xmlInitParser();
				6256
				6257	if (cur == NULL) return(NULL);
				6258
				6259
				6260	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				6261	if (ctxt == NULL) return(NULL);
				6262	if (sax != NULL) {
				6263	if (ctxt->sax != NULL) xmlFree (ctxt->sax);
				6264	ctxt->sax = sax;
				6265	ctxt->userData = userData;
				6266	}
				6267
				6268	htmlParseDocument(ctxt);
				6269	ret = ctxt->myDoc;
				6270	if (sax != NULL) {
				6271	ctxt->sax = NULL;
				6272	ctxt->userData = NULL;
				6273	}
				6274	htmlFreeParserCtxt(ctxt);
				6275
				6276	return(ret);
				6277	}
				6278
				6279	/**
				6280	* htmlParseDoc:
				6281	* @cur: a pointer to an array of xmlChar
				6282	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6283	*
				6284	* parse an HTML in-memory document and build a tree.
				6285	*
				6286	* Returns the resulting document tree
				6287	*/
				6288
				6289	htmlDocPtr
				6290	htmlParseDoc(const xmlChar cur, const char encoding) {
				6291	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				6292	}
				6293
				6294
				6295	/**
				6296	* htmlCreateFileParserCtxt:
				6297	* @filename: the filename
				6298	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6299	*
				6300	* Create a parser context for a file content.
				6301	* Automatic support for ZLIB/Compress compressed document is provided
				6302	* by default if found at compile-time.
				6303	*
				6304	* Returns the new parser context or NULL
				6305	*/
				6306	htmlParserCtxtPtr
				6307	htmlCreateFileParserCtxt(const char filename, const char encoding)
				6308	{
				6309	htmlParserCtxtPtr ctxt;
				6310	htmlParserInputPtr inputStream;
				6311	char *canonicFilename;
				6312	/* htmlCharEncoding enc; */
				6313	xmlChar content, content_line = (xmlChar *) "charset=";
				6314
				6315	if (filename == NULL)
				6316	return(NULL);
				6317
				6318	ctxt = htmlNewParserCtxt();
				6319	if (ctxt == NULL) {
				6320	return(NULL);
				6321	}
				6322	canonicFilename = (char ) xmlCanonicPath((const xmlChar ) filename);
				6323	if (canonicFilename == NULL) {
				6324	#ifdef LIBXML_SAX1_ENABLED
				6325	if (xmlDefaultSAXHandler.error != NULL) {
				6326	xmlDefaultSAXHandler.error(NULL, "out of memory\n");
				6327	}
				6328	#endif
				6329	xmlFreeParserCtxt(ctxt);
				6330	return(NULL);
				6331	}
				6332
				6333	inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
				6334	xmlFree(canonicFilename);
				6335	if (inputStream == NULL) {
				6336	xmlFreeParserCtxt(ctxt);
				6337	return(NULL);
				6338	}
				6339
				6340	inputPush(ctxt, inputStream);
				6341
				6342	/* set encoding */
				6343	if (encoding) {
				6344	size_t l = strlen(encoding);
				6345
				6346	if (l < 1000) {
				6347	content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
				6348	if (content) {
				6349	strcpy ((char )content, (char )content_line);
				6350	strcat ((char )content, (char )encoding);
				6351	htmlCheckEncoding (ctxt, content);
				6352	xmlFree (content);
				6353	}
				6354	}
				6355	}
				6356
				6357	return(ctxt);
				6358	}
				6359
				6360	/**
				6361	* htmlSAXParseFile:
				6362	* @filename: the filename
				6363	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6364	* @sax: the SAX handler block
				6365	* @userData: if using SAX, this pointer will be provided on callbacks.
				6366	*
				6367	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				6368	* compressed document is provided by default if found at compile-time.
				6369	* It use the given SAX function block to handle the parsing callback.
				6370	* If sax is NULL, fallback to the default DOM tree building routines.
				6371	*
				6372	* Returns the resulting document tree unless SAX is NULL or the document is
				6373	* not well formed.
				6374	*/
				6375
				6376	htmlDocPtr
				6377	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				6378	void *userData) {
				6379	htmlDocPtr ret;
				6380	htmlParserCtxtPtr ctxt;
				6381	htmlSAXHandlerPtr oldsax = NULL;
				6382
				6383	xmlInitParser();
				6384
				6385	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				6386	if (ctxt == NULL) return(NULL);
				6387	if (sax != NULL) {
				6388	oldsax = ctxt->sax;
				6389	ctxt->sax = sax;
				6390	ctxt->userData = userData;
				6391	}
				6392
				6393	htmlParseDocument(ctxt);
				6394
				6395	ret = ctxt->myDoc;
				6396	if (sax != NULL) {
				6397	ctxt->sax = oldsax;
				6398	ctxt->userData = NULL;
				6399	}
				6400	htmlFreeParserCtxt(ctxt);
				6401
				6402	return(ret);
				6403	}
				6404
				6405	/**
				6406	* htmlParseFile:
				6407	* @filename: the filename
				6408	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6409	*
				6410	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				6411	* compressed document is provided by default if found at compile-time.
				6412	*
				6413	* Returns the resulting document tree
				6414	*/
				6415
				6416	htmlDocPtr
				6417	htmlParseFile(const char filename, const char encoding) {
				6418	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				6419	}
				6420
				6421	/**
				6422	* htmlHandleOmittedElem:
				6423	* @val: int 0 or 1
				6424	*
				6425	* Set and return the previous value for handling HTML omitted tags.
				6426	*
				6427	* Returns the last value for 0 for no handling, 1 for auto insertion.
				6428	*/
				6429
				6430	int
				6431	htmlHandleOmittedElem(int val) {
				6432	int old = htmlOmittedDefaultValue;
				6433
				6434	htmlOmittedDefaultValue = val;
				6435	return(old);
				6436	}
				6437
				6438	/**
				6439	* htmlElementAllowedHere:
				6440	* @parent: HTML parent element
				6441	* @elt: HTML element
				6442	*
				6443	* Checks whether an HTML element may be a direct child of a parent element.
				6444	* Note - doesn't check for deprecated elements
				6445	*
				6446	* Returns 1 if allowed; 0 otherwise.
				6447	*/
				6448	int
				6449	htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
				6450	const char** p ;
				6451
				6452	if ( ! elt \|\| ! parent \|\| ! parent->subelts )
				6453	return 0 ;
				6454
				6455	for ( p = parent->subelts; *p; ++p )
				6456	if ( !xmlStrcmp((const xmlChar )p, elt) )
				6457	return 1 ;
				6458
				6459	return 0 ;
				6460	}
				6461	/**
				6462	* htmlElementStatusHere:
				6463	* @parent: HTML parent element
				6464	* @elt: HTML element
				6465	*
				6466	* Checks whether an HTML element may be a direct child of a parent element.
				6467	* and if so whether it is valid or deprecated.
				6468	*
				6469	* Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
				6470	*/
				6471	htmlStatus
				6472	htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
				6473	if ( ! parent \|\| ! elt )
				6474	return HTML_INVALID ;
				6475	if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
				6476	return HTML_INVALID ;
				6477
				6478	return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
				6479	}
				6480	/**
				6481	* htmlAttrAllowed:
				6482	* @elt: HTML element
				6483	* @attr: HTML attribute
				6484	* @legacy: whether to allow deprecated attributes
				6485	*
				6486	* Checks whether an attribute is valid for an element
				6487	* Has full knowledge of Required and Deprecated attributes
				6488	*
				6489	* Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
				6490	*/
				6491	htmlStatus
				6492	htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
				6493	const char** p ;
				6494
				6495	if ( !elt \|\| ! attr )
				6496	return HTML_INVALID ;
				6497
				6498	if ( elt->attrs_req )
				6499	for ( p = elt->attrs_req; *p; ++p)
				6500	if ( !xmlStrcmp((const xmlChar)p, attr) )
				6501	return HTML_REQUIRED ;
				6502
				6503	if ( elt->attrs_opt )
				6504	for ( p = elt->attrs_opt; *p; ++p)
				6505	if ( !xmlStrcmp((const xmlChar)p, attr) )
				6506	return HTML_VALID ;
				6507
				6508	if ( legacy && elt->attrs_depr )
				6509	for ( p = elt->attrs_depr; *p; ++p)
				6510	if ( !xmlStrcmp((const xmlChar)p, attr) )
				6511	return HTML_DEPRECATED ;
				6512
				6513	return HTML_INVALID ;
				6514	}
				6515	/**
				6516	* htmlNodeStatus:
				6517	* @node: an htmlNodePtr in a tree
				6518	* @legacy: whether to allow deprecated elements (YES is faster here
				6519	* for Element nodes)
				6520	*
				6521	* Checks whether the tree node is valid. Experimental (the author
				6522	* only uses the HTML enhancements in a SAX parser)
				6523	*
				6524	* Return: for Element nodes, a return from htmlElementAllowedHere (if
				6525	* legacy allowed) or htmlElementStatusHere (otherwise).
				6526	* for Attribute nodes, a return from htmlAttrAllowed
				6527	* for other nodes, HTML_NA (no checks performed)
				6528	*/
				6529	htmlStatus
				6530	htmlNodeStatus(const htmlNodePtr node, int legacy) {
				6531	if ( ! node )
				6532	return HTML_INVALID ;
				6533
				6534	switch ( node->type ) {
				6535	case XML_ELEMENT_NODE:
				6536	return legacy
				6537	? ( htmlElementAllowedHere (
				6538	htmlTagLookup(node->parent->name) , node->name
				6539	) ? HTML_VALID : HTML_INVALID )
				6540	: htmlElementStatusHere(
				6541	htmlTagLookup(node->parent->name) ,
				6542	htmlTagLookup(node->name) )
				6543	;
				6544	case XML_ATTRIBUTE_NODE:
				6545	return htmlAttrAllowed(
				6546	htmlTagLookup(node->parent->name) , node->name, legacy) ;
				6547	default: return HTML_NA ;
				6548	}
				6549	}
				6550	/************************************************************************
				6551	* *
				6552	* New set (2.6.0) of simpler and more flexible APIs *
				6553	* *
				6554	************************************************************************/
				6555	/**
				6556	* DICT_FREE:
				6557	* @str: a string
				6558	*
				6559	* Free a string if it is not owned by the "dict" dictionary in the
				6560	* current scope
				6561	*/
				6562	#define DICT_FREE(str) \
				6563	if ((str) && ((!dict) \|\| \
				6564	(xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \
				6565	xmlFree((char *)(str));
				6566
				6567	/**
				6568	* htmlCtxtReset:
				6569	* @ctxt: an HTML parser context
				6570	*
				6571	* Reset a parser context
				6572	*/
				6573	void
				6574	htmlCtxtReset(htmlParserCtxtPtr ctxt)
				6575	{
				6576	xmlParserInputPtr input;
				6577	xmlDictPtr dict;
				6578
				6579	if (ctxt == NULL)
				6580	return;
				6581
				6582	xmlInitParser();
				6583	dict = ctxt->dict;
				6584
				6585	while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
				6586	xmlFreeInputStream(input);
				6587	}
				6588	ctxt->inputNr = 0;
				6589	ctxt->input = NULL;
				6590
				6591	ctxt->spaceNr = 0;
				6592	if (ctxt->spaceTab != NULL) {
				6593	ctxt->spaceTab[0] = -1;
				6594	ctxt->space = &ctxt->spaceTab[0];
				6595	} else {
				6596	ctxt->space = NULL;
				6597	}
				6598
				6599
				6600	ctxt->nodeNr = 0;
				6601	ctxt->node = NULL;
				6602
				6603	ctxt->nameNr = 0;
				6604	ctxt->name = NULL;
				6605
				6606	DICT_FREE(ctxt->version);
				6607	ctxt->version = NULL;
				6608	DICT_FREE(ctxt->encoding);
				6609	ctxt->encoding = NULL;
				6610	DICT_FREE(ctxt->directory);
				6611	ctxt->directory = NULL;
				6612	DICT_FREE(ctxt->extSubURI);
				6613	ctxt->extSubURI = NULL;
				6614	DICT_FREE(ctxt->extSubSystem);
				6615	ctxt->extSubSystem = NULL;
				6616	if (ctxt->myDoc != NULL)
				6617	xmlFreeDoc(ctxt->myDoc);
				6618	ctxt->myDoc = NULL;
				6619
				6620	ctxt->standalone = -1;
				6621	ctxt->hasExternalSubset = 0;
				6622	ctxt->hasPErefs = 0;
				6623	ctxt->html = 1;
				6624	ctxt->external = 0;
				6625	ctxt->instate = XML_PARSER_START;
				6626	ctxt->token = 0;
				6627
				6628	ctxt->wellFormed = 1;
				6629	ctxt->nsWellFormed = 1;
				6630	ctxt->disableSAX = 0;
				6631	ctxt->valid = 1;
				6632	ctxt->vctxt.userData = ctxt;
				6633	ctxt->vctxt.error = xmlParserValidityError;
				6634	ctxt->vctxt.warning = xmlParserValidityWarning;
				6635	ctxt->record_info = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6636	ctxt->checkIndex = 0;
				6637	ctxt->inSubset = 0;
				6638	ctxt->errNo = XML_ERR_OK;
				6639	ctxt->depth = 0;
				6640	ctxt->charset = XML_CHAR_ENCODING_NONE;
				6641	ctxt->catalogs = NULL;
				6642	xmlInitNodeInfoSeq(&ctxt->node_seq);
				6643
				6644	if (ctxt->attsDefault != NULL) {
				6645	xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
				6646	ctxt->attsDefault = NULL;
				6647	}
				6648	if (ctxt->attsSpecial != NULL) {
				6649	xmlHashFree(ctxt->attsSpecial, NULL);
				6650	ctxt->attsSpecial = NULL;
				6651	}
				6652	}
				6653
				6654	/**
				6655	* htmlCtxtUseOptions:
				6656	* @ctxt: an HTML parser context
				6657	* @options: a combination of htmlParserOption(s)
				6658	*
				6659	* Applies the options to the parser context
				6660	*
				6661	* Returns 0 in case of success, the set of unknown or unimplemented options
				6662	* in case of error.
				6663	*/
				6664	int
				6665	htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
				6666	{
				6667	if (ctxt == NULL)
				6668	return(-1);
				6669
				6670	if (options & HTML_PARSE_NOWARNING) {
				6671	ctxt->sax->warning = NULL;
				6672	ctxt->vctxt.warning = NULL;
				6673	options -= XML_PARSE_NOWARNING;
				6674	ctxt->options \|= XML_PARSE_NOWARNING;
				6675	}
				6676	if (options & HTML_PARSE_NOERROR) {
				6677	ctxt->sax->error = NULL;
				6678	ctxt->vctxt.error = NULL;
				6679	ctxt->sax->fatalError = NULL;
				6680	options -= XML_PARSE_NOERROR;
				6681	ctxt->options \|= XML_PARSE_NOERROR;
				6682	}
				6683	if (options & HTML_PARSE_PEDANTIC) {
				6684	ctxt->pedantic = 1;
				6685	options -= XML_PARSE_PEDANTIC;
				6686	ctxt->options \|= XML_PARSE_PEDANTIC;
				6687	} else
				6688	ctxt->pedantic = 0;
				6689	if (options & XML_PARSE_NOBLANKS) {
				6690	ctxt->keepBlanks = 0;
				6691	ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
				6692	options -= XML_PARSE_NOBLANKS;
				6693	ctxt->options \|= XML_PARSE_NOBLANKS;
				6694	} else
				6695	ctxt->keepBlanks = 1;
				6696	if (options & HTML_PARSE_RECOVER) {
				6697	ctxt->recovery = 1;
				6698	options -= HTML_PARSE_RECOVER;
				6699	} else
				6700	ctxt->recovery = 0;
				6701	if (options & HTML_PARSE_COMPACT) {
				6702	ctxt->options \|= HTML_PARSE_COMPACT;
				6703	options -= HTML_PARSE_COMPACT;
				6704	}
				6705	if (options & XML_PARSE_HUGE) {
				6706	ctxt->options \|= XML_PARSE_HUGE;
				6707	options -= XML_PARSE_HUGE;
				6708	}
				6709	if (options & HTML_PARSE_NODEFDTD) {
				6710	ctxt->options \|= HTML_PARSE_NODEFDTD;
				6711	options -= HTML_PARSE_NODEFDTD;
				6712	}
				6713	if (options & HTML_PARSE_IGNORE_ENC) {
				6714	ctxt->options \|= HTML_PARSE_IGNORE_ENC;
				6715	options -= HTML_PARSE_IGNORE_ENC;
				6716	}
				6717	if (options & HTML_PARSE_NOIMPLIED) {
				6718	ctxt->options \|= HTML_PARSE_NOIMPLIED;
				6719	options -= HTML_PARSE_NOIMPLIED;
				6720	}
				6721	ctxt->dictNames = 0;
				6722	return (options);
				6723	}
				6724
				6725	/**
				6726	* htmlDoRead:
				6727	* @ctxt: an HTML parser context
				6728	* @URL: the base URL to use for the document
				6729	* @encoding: the document encoding, or NULL
				6730	* @options: a combination of htmlParserOption(s)
				6731	* @reuse: keep the context for reuse
				6732	*
				6733	* Common front-end for the htmlRead functions
				6734	*
				6735	* Returns the resulting document tree or NULL
				6736	*/
				6737	static htmlDocPtr
				6738	htmlDoRead(htmlParserCtxtPtr ctxt, const char URL, const char encoding,
				6739	int options, int reuse)
				6740	{
				6741	htmlDocPtr ret;
				6742
				6743	htmlCtxtUseOptions(ctxt, options);
				6744	ctxt->html = 1;
				6745	if (encoding != NULL) {
				6746	xmlCharEncodingHandlerPtr hdlr;
				6747
				6748	hdlr = xmlFindCharEncodingHandler(encoding);
				6749	if (hdlr != NULL) {
				6750	xmlSwitchToEncoding(ctxt, hdlr);
				6751	if (ctxt->input->encoding != NULL)
				6752	xmlFree((xmlChar *) ctxt->input->encoding);
				6753	ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
				6754	}
				6755	}
				6756	if ((URL != NULL) && (ctxt->input != NULL) &&
				6757	(ctxt->input->filename == NULL))
				6758	ctxt->input->filename = (char ) xmlStrdup((const xmlChar ) URL);
				6759	htmlParseDocument(ctxt);
				6760	ret = ctxt->myDoc;
				6761	ctxt->myDoc = NULL;
				6762	if (!reuse) {
				6763	if ((ctxt->dictNames) &&
				6764	(ret != NULL) &&
				6765	(ret->dict == ctxt->dict))
				6766	ctxt->dict = NULL;
				6767	xmlFreeParserCtxt(ctxt);
				6768	}
				6769	return (ret);
				6770	}
				6771
				6772	/**
				6773	* htmlReadDoc:
				6774	* @cur: a pointer to a zero terminated string
				6775	* @URL: the base URL to use for the document
				6776	* @encoding: the document encoding, or NULL
				6777	* @options: a combination of htmlParserOption(s)
				6778	*
				6779	* parse an XML in-memory document and build a tree.
				6780	*
				6781	* Returns the resulting document tree
				6782	*/
				6783	htmlDocPtr
				6784	htmlReadDoc(const xmlChar * cur, const char URL, const char encoding, int options)
				6785	{
				6786	htmlParserCtxtPtr ctxt;
				6787
				6788	if (cur == NULL)
				6789	return (NULL);
				6790
				6791	xmlInitParser();
				6792	ctxt = htmlCreateDocParserCtxt(cur, NULL);
				6793	if (ctxt == NULL)
				6794	return (NULL);
				6795	return (htmlDoRead(ctxt, URL, encoding, options, 0));
				6796	}
				6797
				6798	/**
				6799	* htmlReadFile:
				6800	* @filename: a file or URL
				6801	* @encoding: the document encoding, or NULL
				6802	* @options: a combination of htmlParserOption(s)
				6803	*
				6804	* parse an XML file from the filesystem or the network.
				6805	*
				6806	* Returns the resulting document tree
				6807	*/
				6808	htmlDocPtr
				6809	htmlReadFile(const char filename, const char encoding, int options)
				6810	{
				6811	htmlParserCtxtPtr ctxt;
				6812
				6813	xmlInitParser();
				6814	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				6815	if (ctxt == NULL)
				6816	return (NULL);
				6817	return (htmlDoRead(ctxt, NULL, NULL, options, 0));
				6818	}
				6819
				6820	/**
				6821	* htmlReadMemory:
				6822	* @buffer: a pointer to a char array
				6823	* @size: the size of the array
				6824	* @URL: the base URL to use for the document
				6825	* @encoding: the document encoding, or NULL
				6826	* @options: a combination of htmlParserOption(s)
				6827	*
				6828	* parse an XML in-memory document and build a tree.
				6829	*
				6830	* Returns the resulting document tree
				6831	*/
				6832	htmlDocPtr
				6833	htmlReadMemory(const char buffer, int size, const char URL, const char *encoding, int options)
				6834	{
				6835	htmlParserCtxtPtr ctxt;
				6836
				6837	xmlInitParser();
				6838	ctxt = xmlCreateMemoryParserCtxt(buffer, size);
				6839	if (ctxt == NULL)
				6840	return (NULL);
				6841	htmlDefaultSAXHandlerInit();
				6842	if (ctxt->sax != NULL)
				6843	memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
				6844	return (htmlDoRead(ctxt, URL, encoding, options, 0));
				6845	}
				6846
				6847	/**
				6848	* htmlReadFd:
				6849	* @fd: an open file descriptor
				6850	* @URL: the base URL to use for the document
				6851	* @encoding: the document encoding, or NULL
				6852	* @options: a combination of htmlParserOption(s)
				6853	*
				6854	* parse an XML from a file descriptor and build a tree.
				6855	*
				6856	* Returns the resulting document tree
				6857	*/
				6858	htmlDocPtr
				6859	htmlReadFd(int fd, const char URL, const char encoding, int options)
				6860	{
				6861	htmlParserCtxtPtr ctxt;
				6862	xmlParserInputBufferPtr input;
				6863	xmlParserInputPtr stream;
				6864
				6865	if (fd < 0)
				6866	return (NULL);
				6867	xmlInitParser();
				6868
				6869	xmlInitParser();
				6870	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
				6871	if (input == NULL)
				6872	return (NULL);
				6873	ctxt = xmlNewParserCtxt();
				6874	if (ctxt == NULL) {
				6875	xmlFreeParserInputBuffer(input);
				6876	return (NULL);
				6877	}
				6878	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				6879	if (stream == NULL) {
				6880	xmlFreeParserInputBuffer(input);
				6881	xmlFreeParserCtxt(ctxt);
				6882	return (NULL);
				6883	}
				6884	inputPush(ctxt, stream);
				6885	return (htmlDoRead(ctxt, URL, encoding, options, 0));
				6886	}
				6887
				6888	/**
				6889	* htmlReadIO:
				6890	* @ioread: an I/O read function
				6891	* @ioclose: an I/O close function
				6892	* @ioctx: an I/O handler
				6893	* @URL: the base URL to use for the document
				6894	* @encoding: the document encoding, or NULL
				6895	* @options: a combination of htmlParserOption(s)
				6896	*
				6897	* parse an HTML document from I/O functions and source and build a tree.
				6898	*
				6899	* Returns the resulting document tree
				6900	*/
				6901	htmlDocPtr
				6902	htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
				6903	void ioctx, const char URL, const char *encoding, int options)
				6904	{
				6905	htmlParserCtxtPtr ctxt;
				6906	xmlParserInputBufferPtr input;
				6907	xmlParserInputPtr stream;
				6908
				6909	if (ioread == NULL)
				6910	return (NULL);
				6911	xmlInitParser();
				6912
				6913	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
				6914	XML_CHAR_ENCODING_NONE);
				6915	if (input == NULL) {
				6916	if (ioclose != NULL)
				6917	ioclose(ioctx);
				6918	return (NULL);
				6919	}
				6920	ctxt = htmlNewParserCtxt();
				6921	if (ctxt == NULL) {
				6922	xmlFreeParserInputBuffer(input);
				6923	return (NULL);
				6924	}
				6925	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				6926	if (stream == NULL) {
				6927	xmlFreeParserInputBuffer(input);
				6928	xmlFreeParserCtxt(ctxt);
				6929	return (NULL);
				6930	}
				6931	inputPush(ctxt, stream);
				6932	return (htmlDoRead(ctxt, URL, encoding, options, 0));
				6933	}
				6934
				6935	/**
				6936	* htmlCtxtReadDoc:
				6937	* @ctxt: an HTML parser context
				6938	* @cur: a pointer to a zero terminated string
				6939	* @URL: the base URL to use for the document
				6940	* @encoding: the document encoding, or NULL
				6941	* @options: a combination of htmlParserOption(s)
				6942	*
				6943	* parse an XML in-memory document and build a tree.
				6944	* This reuses the existing @ctxt parser context
				6945	*
				6946	* Returns the resulting document tree
				6947	*/
				6948	htmlDocPtr
				6949	htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
				6950	const char URL, const char encoding, int options)
				6951	{
				6952	xmlParserInputPtr stream;
				6953
				6954	if (cur == NULL)
				6955	return (NULL);
				6956	if (ctxt == NULL)
				6957	return (NULL);
				6958	xmlInitParser();
				6959
				6960	htmlCtxtReset(ctxt);
				6961
				6962	stream = xmlNewStringInputStream(ctxt, cur);
				6963	if (stream == NULL) {
				6964	return (NULL);
				6965	}
				6966	inputPush(ctxt, stream);
				6967	return (htmlDoRead(ctxt, URL, encoding, options, 1));
				6968	}
				6969
				6970	/**
				6971	* htmlCtxtReadFile:
				6972	* @ctxt: an HTML parser context
				6973	* @filename: a file or URL
				6974	* @encoding: the document encoding, or NULL
				6975	* @options: a combination of htmlParserOption(s)
				6976	*
				6977	* parse an XML file from the filesystem or the network.
				6978	* This reuses the existing @ctxt parser context
				6979	*
				6980	* Returns the resulting document tree
				6981	*/
				6982	htmlDocPtr
				6983	htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
				6984	const char *encoding, int options)
				6985	{
				6986	xmlParserInputPtr stream;
				6987
				6988	if (filename == NULL)
				6989	return (NULL);
				6990	if (ctxt == NULL)
				6991	return (NULL);
				6992	xmlInitParser();
				6993
				6994	htmlCtxtReset(ctxt);
				6995
				6996	stream = xmlLoadExternalEntity(filename, NULL, ctxt);
				6997	if (stream == NULL) {
				6998	return (NULL);
				6999	}
				7000	inputPush(ctxt, stream);
				7001	return (htmlDoRead(ctxt, NULL, encoding, options, 1));
				7002	}
				7003
				7004	/**
				7005	* htmlCtxtReadMemory:
				7006	* @ctxt: an HTML parser context
				7007	* @buffer: a pointer to a char array
				7008	* @size: the size of the array
				7009	* @URL: the base URL to use for the document
				7010	* @encoding: the document encoding, or NULL
				7011	* @options: a combination of htmlParserOption(s)
				7012	*
				7013	* parse an XML in-memory document and build a tree.
				7014	* This reuses the existing @ctxt parser context
				7015	*
				7016	* Returns the resulting document tree
				7017	*/
				7018	htmlDocPtr
				7019	htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
				7020	const char URL, const char encoding, int options)
				7021	{
				7022	xmlParserInputBufferPtr input;
				7023	xmlParserInputPtr stream;
				7024
				7025	if (ctxt == NULL)
				7026	return (NULL);
				7027	if (buffer == NULL)
				7028	return (NULL);
				7029	xmlInitParser();
				7030
				7031	htmlCtxtReset(ctxt);
				7032
				7033	input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
				7034	if (input == NULL) {
				7035	return(NULL);
				7036	}
				7037
				7038	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				7039	if (stream == NULL) {
				7040	xmlFreeParserInputBuffer(input);
				7041	return(NULL);
				7042	}
				7043
				7044	inputPush(ctxt, stream);
				7045	return (htmlDoRead(ctxt, URL, encoding, options, 1));
				7046	}
				7047
				7048	/**
				7049	* htmlCtxtReadFd:
				7050	* @ctxt: an HTML parser context
				7051	* @fd: an open file descriptor
				7052	* @URL: the base URL to use for the document
				7053	* @encoding: the document encoding, or NULL
				7054	* @options: a combination of htmlParserOption(s)
				7055	*
				7056	* parse an XML from a file descriptor and build a tree.
				7057	* This reuses the existing @ctxt parser context
				7058	*
				7059	* Returns the resulting document tree
				7060	*/
				7061	htmlDocPtr
				7062	htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
				7063	const char URL, const char encoding, int options)
				7064	{
				7065	xmlParserInputBufferPtr input;
				7066	xmlParserInputPtr stream;
				7067
				7068	if (fd < 0)
				7069	return (NULL);
				7070	if (ctxt == NULL)
				7071	return (NULL);
				7072	xmlInitParser();
				7073
				7074	htmlCtxtReset(ctxt);
				7075
				7076
				7077	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
				7078	if (input == NULL)
				7079	return (NULL);
				7080	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				7081	if (stream == NULL) {
				7082	xmlFreeParserInputBuffer(input);
				7083	return (NULL);
				7084	}
				7085	inputPush(ctxt, stream);
				7086	return (htmlDoRead(ctxt, URL, encoding, options, 1));
				7087	}
				7088
				7089	/**
				7090	* htmlCtxtReadIO:
				7091	* @ctxt: an HTML parser context
				7092	* @ioread: an I/O read function
				7093	* @ioclose: an I/O close function
				7094	* @ioctx: an I/O handler
				7095	* @URL: the base URL to use for the document
				7096	* @encoding: the document encoding, or NULL
				7097	* @options: a combination of htmlParserOption(s)
				7098	*
				7099	* parse an HTML document from I/O functions and source and build a tree.
				7100	* This reuses the existing @ctxt parser context
				7101	*
				7102	* Returns the resulting document tree
				7103	*/
				7104	htmlDocPtr
				7105	htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
				7106	xmlInputCloseCallback ioclose, void *ioctx,
				7107	const char *URL,
				7108	const char *encoding, int options)
				7109	{
				7110	xmlParserInputBufferPtr input;
				7111	xmlParserInputPtr stream;
				7112
				7113	if (ioread == NULL)
				7114	return (NULL);
				7115	if (ctxt == NULL)
				7116	return (NULL);
				7117	xmlInitParser();
				7118
				7119	htmlCtxtReset(ctxt);
				7120
				7121	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
				7122	XML_CHAR_ENCODING_NONE);
				7123	if (input == NULL) {
				7124	if (ioclose != NULL)
				7125	ioclose(ioctx);
				7126	return (NULL);
				7127	}
				7128	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				7129	if (stream == NULL) {
				7130	xmlFreeParserInputBuffer(input);
				7131	return (NULL);
				7132	}
				7133	inputPush(ctxt, stream);
				7134	return (htmlDoRead(ctxt, URL, encoding, options, 1));
				7135	}
				7136
				7137	#define bottom_HTMLparser
				7138	#include "elfgcchack.h"
				7139	#endif /* LIBXML_HTML_ENABLED */