Blame - HTMLparser.c - platform/external/libxml2

blob: de624f8d0bf283b380620dbc08a81eb1388be057 [file] [log] [blame]

Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
				6	* daniel@veillard.com
				7	*/
				8
				9	#define IN_LIBXML
				10	#include "libxml.h"
				11	#ifdef LIBXML_HTML_ENABLED
				12
				13	#include <string.h>
				14	#ifdef HAVE_CTYPE_H
				15	#include <ctype.h>
				16	#endif
				17	#ifdef HAVE_STDLIB_H
				18	#include <stdlib.h>
				19	#endif
				20	#ifdef HAVE_SYS_STAT_H
				21	#include <sys/stat.h>
				22	#endif
				23	#ifdef HAVE_FCNTL_H
				24	#include <fcntl.h>
				25	#endif
				26	#ifdef HAVE_UNISTD_H
				27	#include <unistd.h>
				28	#endif
				29	#ifdef LIBXML_ZLIB_ENABLED
				30	#include <zlib.h>
				31	#endif
				32
				33	#include <libxml/xmlmemory.h>
				34	#include <libxml/tree.h>
				35	#include <libxml/parser.h>
				36	#include <libxml/parserInternals.h>
				37	#include <libxml/xmlerror.h>
				38	#include <libxml/HTMLparser.h>
				39	#include <libxml/HTMLtree.h>
				40	#include <libxml/entities.h>
				41	#include <libxml/encoding.h>
				42	#include <libxml/valid.h>
				43	#include <libxml/xmlIO.h>
				44	#include <libxml/globals.h>
				45	#include <libxml/uri.h>
				46
				47	#include "buf.h"
				48	#include "enc.h"
				49
				50	#define HTML_MAX_NAMELEN 1000
				51	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				52	#define HTML_PARSER_BUFFER_SIZE 100
				53
				54	/* #define DEBUG */
				55	/* #define DEBUG_PUSH */
				56
				57	static int htmlOmittedDefaultValue = 1;
				58
				59	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				60	xmlChar end, xmlChar end2, xmlChar end3);
				61	static void htmlParseComment(htmlParserCtxtPtr ctxt);
				62
				63	/************************************************************************
				64	* *
				65	* Some factorized error routines *
				66	* *
				67	************************************************************************/
				68
				69	/**
				70	* htmlErrMemory:
				71	* @ctxt: an HTML parser context
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	72	* @extra: extra information
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	73	*
				74	* Handle a redefinition of attribute error
				75	*/
				76	static void
				77	htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
				78	{
				79	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
				80	(ctxt->instate == XML_PARSER_EOF))
				81	return;
				82	if (ctxt != NULL) {
				83	ctxt->errNo = XML_ERR_NO_MEMORY;
				84	ctxt->instate = XML_PARSER_EOF;
				85	ctxt->disableSAX = 1;
				86	}
				87	if (extra)
				88	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
				89	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
				90	NULL, NULL, 0, 0,
				91	"Memory allocation failed : %s\n", extra);
				92	else
				93	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
				94	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
				95	NULL, NULL, 0, 0, "Memory allocation failed\n");
				96	}
				97
				98	/**
				99	* htmlParseErr:
				100	* @ctxt: an HTML parser context
				101	* @error: the error number
				102	* @msg: the error message
				103	* @str1: string infor
				104	* @str2: string infor
				105	*
				106	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
				107	*/
				108	static void LIBXML_ATTR_FORMAT(3,0)
				109	htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
				110	const char msg, const xmlChar str1, const xmlChar *str2)
				111	{
				112	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
				113	(ctxt->instate == XML_PARSER_EOF))
				114	return;
				115	if (ctxt != NULL)
				116	ctxt->errNo = error;
				117	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
				118	XML_ERR_ERROR, NULL, 0,
				119	(const char ) str1, (const char ) str2,
				120	NULL, 0, 0,
				121	msg, str1, str2);
				122	if (ctxt != NULL)
				123	ctxt->wellFormed = 0;
				124	}
				125
				126	/**
				127	* htmlParseErrInt:
				128	* @ctxt: an HTML parser context
				129	* @error: the error number
				130	* @msg: the error message
				131	* @val: integer info
				132	*
				133	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
				134	*/
				135	static void LIBXML_ATTR_FORMAT(3,0)
				136	htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
				137	const char *msg, int val)
				138	{
				139	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
				140	(ctxt->instate == XML_PARSER_EOF))
				141	return;
				142	if (ctxt != NULL)
				143	ctxt->errNo = error;
				144	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
				145	XML_ERR_ERROR, NULL, 0, NULL, NULL,
				146	NULL, val, 0, msg, val);
				147	if (ctxt != NULL)
				148	ctxt->wellFormed = 0;
				149	}
				150
				151	/************************************************************************
				152	* *
				153	* Parser stacks related functions and macros *
				154	* *
				155	************************************************************************/
				156
				157	/**
				158	* htmlnamePush:
				159	* @ctxt: an HTML parser context
				160	* @value: the element name
				161	*
				162	* Pushes a new element name on top of the name stack
				163	*
				164	* Returns 0 in case of error, the index in the stack otherwise
				165	*/
				166	static int
				167	htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
				168	{
				169	if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
				170	ctxt->html = 3;
				171	if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
				172	ctxt->html = 10;
				173	if (ctxt->nameNr >= ctxt->nameMax) {
				174	ctxt->nameMax *= 2;
				175	ctxt->nameTab = (const xmlChar * *)
				176	xmlRealloc((xmlChar * *)ctxt->nameTab,
				177	ctxt->nameMax *
				178	sizeof(ctxt->nameTab[0]));
				179	if (ctxt->nameTab == NULL) {
				180	htmlErrMemory(ctxt, NULL);
				181	return (0);
				182	}
				183	}
				184	ctxt->nameTab[ctxt->nameNr] = value;
				185	ctxt->name = value;
				186	return (ctxt->nameNr++);
				187	}
				188	/**
				189	* htmlnamePop:
				190	* @ctxt: an HTML parser context
				191	*
				192	* Pops the top element name from the name stack
				193	*
				194	* Returns the name just removed
				195	*/
				196	static const xmlChar *
				197	htmlnamePop(htmlParserCtxtPtr ctxt)
				198	{
				199	const xmlChar *ret;
				200
				201	if (ctxt->nameNr <= 0)
				202	return (NULL);
				203	ctxt->nameNr--;
				204	if (ctxt->nameNr < 0)
				205	return (NULL);
				206	if (ctxt->nameNr > 0)
				207	ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
				208	else
				209	ctxt->name = NULL;
				210	ret = ctxt->nameTab[ctxt->nameNr];
				211	ctxt->nameTab[ctxt->nameNr] = NULL;
				212	return (ret);
				213	}
				214
				215	/**
				216	* htmlNodeInfoPush:
				217	* @ctxt: an HTML parser context
				218	* @value: the node info
				219	*
				220	* Pushes a new element name on top of the node info stack
				221	*
				222	* Returns 0 in case of error, the index in the stack otherwise
				223	*/
				224	static int
				225	htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
				226	{
				227	if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
				228	if (ctxt->nodeInfoMax == 0)
				229	ctxt->nodeInfoMax = 5;
				230	ctxt->nodeInfoMax *= 2;
				231	ctxt->nodeInfoTab = (htmlParserNodeInfo *)
				232	xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
				233	ctxt->nodeInfoMax *
				234	sizeof(ctxt->nodeInfoTab[0]));
				235	if (ctxt->nodeInfoTab == NULL) {
				236	htmlErrMemory(ctxt, NULL);
				237	return (0);
				238	}
				239	}
				240	ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
				241	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
				242	return (ctxt->nodeInfoNr++);
				243	}
				244
				245	/**
				246	* htmlNodeInfoPop:
				247	* @ctxt: an HTML parser context
				248	*
				249	* Pops the top element name from the node info stack
				250	*
				251	* Returns 0 in case of error, the pointer to NodeInfo otherwise
				252	*/
				253	static htmlParserNodeInfo *
				254	htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
				255	{
				256	if (ctxt->nodeInfoNr <= 0)
				257	return (NULL);
				258	ctxt->nodeInfoNr--;
				259	if (ctxt->nodeInfoNr < 0)
				260	return (NULL);
				261	if (ctxt->nodeInfoNr > 0)
				262	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
				263	else
				264	ctxt->nodeInfo = NULL;
				265	return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
				266	}
				267
				268	/*
				269	* Macros for accessing the content. Those should be used only by the parser,
				270	* and not exported.
				271	*
				272	* Dirty macros, i.e. one need to make assumption on the context to use them
				273	*
				274	* CUR_PTR return the current pointer to the xmlChar to be parsed.
				275	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
				276	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				277	* in UNICODE mode. This should be used internally by the parser
				278	* only to compare to ASCII values otherwise it would break when
				279	* running with UTF-8 encoding.
				280	* NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
				281	* to compare on ASCII based substring.
				282	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
				283	* it should be used only to compare on ASCII based substring.
				284	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				285	* strings without newlines within the parser.
				286	*
				287	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				288	*
				289	* CURRENT Returns the current char value, with the full decoding of
				290	* UTF-8 if we are using this mode. It returns an int.
				291	* NEXT Skip to the next character, this does the proper decoding
				292	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
				293	* NEXTL(l) Skip the current unicode character of l xmlChars long.
				294	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				295	*/
				296
				297	#define UPPER (toupper(*ctxt->input->cur))
				298
Haibo Huang	f0a546b	2020-09-01 20:28:19 -0700	[diff] [blame^]	299	#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	300
				301	#define NXT(val) ctxt->input->cur[(val)]
				302
				303	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				304
				305	#define CUR_PTR ctxt->input->cur
				306	#define BASE_PTR ctxt->input->base
				307
				308	#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
				309	(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
				310	xmlParserInputShrink(ctxt->input)
				311
				312	#define GROW if ((ctxt->progressive == 0) && \
				313	(ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
				314	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				315
				316	#define CURRENT ((int) (*ctxt->input->cur))
				317
				318	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				319
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	320	/* Imported from XML */
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	321
				322	/* #define CUR (ctxt->token ? ctxt->token : (int) (ctxt->input->cur)) /
				323	#define CUR ((int) (*ctxt->input->cur))
				324	#define NEXT xmlNextChar(ctxt)
				325
				326	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
				327
				328
				329	#define NEXTL(l) do { \
				330	if (*(ctxt->input->cur) == '\n') { \
				331	ctxt->input->line++; ctxt->input->col = 1; \
				332	} else ctxt->input->col++; \
Haibo Huang	f0a546b	2020-09-01 20:28:19 -0700	[diff] [blame^]	333	ctxt->token = 0; ctxt->input->cur += l; \
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	334	} while (0)
				335
				336	/************
				337	\
				338	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				339	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				340	************/
				341
				342	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				343	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				344
				345	#define COPY_BUF(l,b,i,v) \
				346	if (l == 1) b[i++] = (xmlChar) v; \
				347	else i += xmlCopyChar(l,&b[i],v)
				348
				349	/**
				350	* htmlFindEncoding:
				351	* @the HTML parser context
				352	*
				353	* Ty to find and encoding in the current data available in the input
				354	* buffer this is needed to try to switch to the proper encoding when
				355	* one face a character error.
				356	* That's an heuristic, since it's operating outside of parsing it could
				357	* try to use a meta which had been commented out, that's the reason it
				358	* should only be used in case of error, not as a default.
				359	*
				360	* Returns an encoding string or NULL if not found, the string need to
				361	* be freed
				362	*/
				363	static xmlChar *
				364	htmlFindEncoding(xmlParserCtxtPtr ctxt) {
				365	const xmlChar start, cur, *end;
				366
				367	if ((ctxt == NULL) \|\| (ctxt->input == NULL) \|\|
				368	(ctxt->input->encoding != NULL) \|\| (ctxt->input->buf == NULL) \|\|
				369	(ctxt->input->buf->encoder != NULL))
				370	return(NULL);
				371	if ((ctxt->input->cur == NULL) \|\| (ctxt->input->end == NULL))
				372	return(NULL);
				373
				374	start = ctxt->input->cur;
				375	end = ctxt->input->end;
				376	/* we also expect the input buffer to be zero terminated */
				377	if (*end != 0)
				378	return(NULL);
				379
				380	cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
				381	if (cur == NULL)
				382	return(NULL);
				383	cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
				384	if (cur == NULL)
				385	return(NULL);
				386	cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
				387	if (cur == NULL)
				388	return(NULL);
				389	cur += 8;
				390	start = cur;
				391	while (((cur >= 'A') && (cur <= 'Z')) \|\|
				392	((cur >= 'a') && (cur <= 'z')) \|\|
				393	((cur >= '0') && (cur <= '9')) \|\|
				394	(cur == '-') \|\| (cur == '_') \|\| (cur == ':') \|\| (cur == '/'))
				395	cur++;
				396	if (cur == start)
				397	return(NULL);
				398	return(xmlStrndup(start, cur - start));
				399	}
				400
				401	/**
				402	* htmlCurrentChar:
				403	* @ctxt: the HTML parser context
				404	* @len: pointer to the length of the char read
				405	*
				406	* The current char value, if using UTF-8 this may actually span multiple
				407	* bytes in the input buffer. Implement the end of line normalization:
				408	* 2.11 End-of-Line Handling
				409	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				410	* char, then the encoding converter is plugged in automatically.
				411	*
				412	* Returns the current char value and its length
				413	*/
				414
				415	static int
				416	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	417	const unsigned char *cur;
				418	unsigned char c;
				419	unsigned int val;
				420
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	421	if (ctxt->instate == XML_PARSER_EOF)
				422	return(0);
				423
				424	if (ctxt->token != 0) {
				425	*len = 0;
				426	return(ctxt->token);
				427	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	428	if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	429	xmlChar * guess;
				430	xmlCharEncodingHandlerPtr handler;
				431
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	432	/*
				433	* Assume it's a fixed length encoding (1) with
				434	* a compatible encoding for the ASCII set, since
				435	* HTML constructs only use < 128 chars
				436	*/
				437	if ((int) *ctxt->input->cur < 0x80) {
				438	*len = 1;
				439	if ((*ctxt->input->cur == 0) &&
				440	(ctxt->input->cur < ctxt->input->end)) {
				441	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				442	"Char 0x%X out of allowed range\n", 0);
				443	return(' ');
				444	}
				445	return((int) *ctxt->input->cur);
				446	}
				447
				448	/*
				449	* Humm this is bad, do an automatic flow conversion
				450	*/
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	451	guess = htmlFindEncoding(ctxt);
				452	if (guess == NULL) {
				453	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				454	} else {
				455	if (ctxt->input->encoding != NULL)
				456	xmlFree((xmlChar *) ctxt->input->encoding);
				457	ctxt->input->encoding = guess;
				458	handler = xmlFindCharEncodingHandler((const char *) guess);
				459	if (handler != NULL) {
				460	xmlSwitchToEncoding(ctxt, handler);
				461	} else {
				462	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				463	"Unsupported encoding %s", guess, NULL);
				464	}
				465	}
				466	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				467	}
				468
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	469	/*
				470	* We are supposed to handle UTF8, check it's valid
				471	* From rfc2044: encoding of the Unicode values on UTF-8:
				472	*
				473	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				474	* 0000 0000-0000 007F 0xxxxxxx
				475	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				476	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				477	*
				478	* Check for the 0x110000 limit too
				479	*/
				480	cur = ctxt->input->cur;
				481	c = *cur;
				482	if (c & 0x80) {
				483	if ((c & 0x40) == 0)
				484	goto encoding_error;
				485	if (cur[1] == 0) {
				486	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				487	cur = ctxt->input->cur;
				488	}
				489	if ((cur[1] & 0xc0) != 0x80)
				490	goto encoding_error;
				491	if ((c & 0xe0) == 0xe0) {
				492
				493	if (cur[2] == 0) {
				494	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				495	cur = ctxt->input->cur;
				496	}
				497	if ((cur[2] & 0xc0) != 0x80)
				498	goto encoding_error;
				499	if ((c & 0xf0) == 0xf0) {
				500	if (cur[3] == 0) {
				501	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				502	cur = ctxt->input->cur;
				503	}
				504	if (((c & 0xf8) != 0xf0) \|\|
				505	((cur[3] & 0xc0) != 0x80))
				506	goto encoding_error;
				507	/* 4-byte code */
				508	*len = 4;
				509	val = (cur[0] & 0x7) << 18;
				510	val \|= (cur[1] & 0x3f) << 12;
				511	val \|= (cur[2] & 0x3f) << 6;
				512	val \|= cur[3] & 0x3f;
				513	if (val < 0x10000)
				514	goto encoding_error;
				515	} else {
				516	/* 3-byte code */
				517	*len = 3;
				518	val = (cur[0] & 0xf) << 12;
				519	val \|= (cur[1] & 0x3f) << 6;
				520	val \|= cur[2] & 0x3f;
				521	if (val < 0x800)
				522	goto encoding_error;
				523	}
				524	} else {
				525	/* 2-byte code */
				526	*len = 2;
				527	val = (cur[0] & 0x1f) << 6;
				528	val \|= cur[1] & 0x3f;
				529	if (val < 0x80)
				530	goto encoding_error;
				531	}
				532	if (!IS_CHAR(val)) {
				533	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				534	"Char 0x%X out of allowed range\n", val);
				535	}
				536	return(val);
				537	} else {
				538	if ((*ctxt->input->cur == 0) &&
				539	(ctxt->input->cur < ctxt->input->end)) {
				540	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				541	"Char 0x%X out of allowed range\n", 0);
				542	*len = 1;
				543	return(' ');
				544	}
				545	/* 1-byte code */
				546	*len = 1;
				547	return((int) *ctxt->input->cur);
				548	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	549
				550	encoding_error:
				551	/*
				552	* If we detect an UTF8 error that probably mean that the
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	553	* input encoding didn't get properly advertised in the
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	554	* declaration header. Report the error and switch the encoding
				555	* to ISO-Latin-1 (if you don't like this policy, just declare the
				556	* encoding !)
				557	*/
				558	{
				559	char buffer[150];
				560
				561	if (ctxt->input->end - ctxt->input->cur >= 4) {
				562	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				563	ctxt->input->cur[0], ctxt->input->cur[1],
				564	ctxt->input->cur[2], ctxt->input->cur[3]);
				565	} else {
				566	snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
				567	}
				568	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				569	"Input is not proper UTF-8, indicate encoding !\n",
				570	BAD_CAST buffer, NULL);
				571	}
				572
				573	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				574	*len = 1;
				575	return((int) *ctxt->input->cur);
				576	}
				577
				578	/**
				579	* htmlSkipBlankChars:
				580	* @ctxt: the HTML parser context
				581	*
				582	* skip all blanks character found at that point in the input streams.
				583	*
				584	* Returns the number of space chars skipped
				585	*/
				586
				587	static int
				588	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				589	int res = 0;
				590
				591	while (IS_BLANK_CH(*(ctxt->input->cur))) {
				592	if ((*ctxt->input->cur == 0) &&
				593	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				594	xmlPopInput(ctxt);
				595	} else {
				596	if (*(ctxt->input->cur) == '\n') {
				597	ctxt->input->line++; ctxt->input->col = 1;
				598	} else ctxt->input->col++;
				599	ctxt->input->cur++;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	600	if (*ctxt->input->cur == 0)
				601	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				602	}
				603	res++;
				604	}
				605	return(res);
				606	}
				607
				608
				609
				610	/************************************************************************
				611	* *
				612	* The list of HTML elements and their properties *
				613	* *
				614	************************************************************************/
				615
				616	/*
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	617	* Start Tag: 1 means the start tag can be omitted
				618	* End Tag: 1 means the end tag can be omitted
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	619	* 2 means it's forbidden (empty elements)
				620	* 3 means the tag is stylistic and should be closed easily
				621	* Depr: this element is deprecated
				622	* DTD: 1 means that this element is valid only in the Loose DTD
				623	* 2 means that this element is valid only in the Frameset DTD
				624	*
				625	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
				626	, subElements , impliedsubelt , Attributes, userdata
				627	*/
				628
				629	/* Definitions and a couple of vars for HTML Elements */
				630
				631	#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
				632	#define NB_FONTSTYLE 8
				633	#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
				634	#define NB_PHRASE 10
				635	#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
				636	#define NB_SPECIAL 16
				637	#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
				638	#define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
				639	#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
				640	#define NB_BLOCK NB_HEADING + NB_LIST + 14
				641	#define FORMCTRL "input", "select", "textarea", "label", "button"
				642	#define NB_FORMCTRL 5
				643	#define PCDATA
				644	#define NB_PCDATA 0
				645	#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
				646	#define NB_HEADING 6
				647	#define LIST "ul", "ol", "dir", "menu"
				648	#define NB_LIST 4
				649	#define MODIFIER
				650	#define NB_MODIFIER 0
				651	#define FLOW BLOCK,INLINE
				652	#define NB_FLOW NB_BLOCK + NB_INLINE
				653	#define EMPTY NULL
				654
				655
				656	static const char* const html_flow[] = { FLOW, NULL } ;
				657	static const char* const html_inline[] = { INLINE, NULL } ;
				658
				659	/* placeholders: elts with content but no subelements */
				660	static const char* const html_pcdata[] = { NULL } ;
				661	#define html_cdata html_pcdata
				662
				663
				664	/* ... and for HTML Attributes */
				665
				666	#define COREATTRS "id", "class", "style", "title"
				667	#define NB_COREATTRS 4
				668	#define I18N "lang", "dir"
				669	#define NB_I18N 2
				670	#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
				671	#define NB_EVENTS 9
				672	#define ATTRS COREATTRS,I18N,EVENTS
				673	#define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
				674	#define CELLHALIGN "align", "char", "charoff"
				675	#define NB_CELLHALIGN 3
				676	#define CELLVALIGN "valign"
				677	#define NB_CELLVALIGN 1
				678
				679	static const char* const html_attrs[] = { ATTRS, NULL } ;
				680	static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
				681	static const char* const core_attrs[] = { COREATTRS, NULL } ;
				682	static const char* const i18n_attrs[] = { I18N, NULL } ;
				683
				684
				685	/* Other declarations that should go inline ... */
				686	static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
				687	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
				688	"tabindex", "onfocus", "onblur", NULL } ;
				689	static const char* const target_attr[] = { "target", NULL } ;
				690	static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
				691	static const char* const alt_attr[] = { "alt", NULL } ;
				692	static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
				693	static const char* const href_attrs[] = { "href", NULL } ;
				694	static const char* const clear_attrs[] = { "clear", NULL } ;
				695	static const char* const inline_p[] = { INLINE, "p", NULL } ;
				696
				697	static const char* const flow_param[] = { FLOW, "param", NULL } ;
				698	static const char* const applet_attrs[] = { COREATTRS , "codebase",
				699	"archive", "alt", "name", "height", "width", "align",
				700	"hspace", "vspace", NULL } ;
				701	static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
				702	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
				703	static const char* const basefont_attrs[] =
				704	{ "id", "size", "color", "face", NULL } ;
				705	static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
				706	static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
				707	static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
				708	static const char* const body_depr[] = { "background", "bgcolor", "text",
				709	"link", "vlink", "alink", NULL } ;
				710	static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
				711	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
				712
				713
				714	static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
				715	static const char* const col_elt[] = { "col", NULL } ;
				716	static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
				717	static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
				718	static const char* const dl_contents[] = { "dt", "dd", NULL } ;
				719	static const char* const compact_attr[] = { "compact", NULL } ;
				720	static const char* const label_attr[] = { "label", NULL } ;
				721	static const char* const fieldset_contents[] = { FLOW, "legend" } ;
				722	static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
				723	static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
				724	static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
				725	static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
				726	static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
				727	static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
				728	static const char* const head_attrs[] = { I18N, "profile", NULL } ;
				729	static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
				730	static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
				731	static const char* const version_attr[] = { "version", NULL } ;
				732	static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
				733	static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
				734	static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
				735	static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
				736	static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
				737	static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
				738	static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
				739	static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
				740	static const char* const align_attr[] = { "align", NULL } ;
				741	static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
				742	static const char* const map_contents[] = { BLOCK, "area", NULL } ;
				743	static const char* const name_attr[] = { "name", NULL } ;
				744	static const char* const action_attr[] = { "action", NULL } ;
				745	static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
				746	static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
				747	static const char* const content_attr[] = { "content", NULL } ;
				748	static const char* const type_attr[] = { "type", NULL } ;
				749	static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
				750	static const char* const object_contents[] = { FLOW, "param", NULL } ;
				751	static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
				752	static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
				753	static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
				754	static const char* const option_elt[] = { "option", NULL } ;
				755	static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
				756	static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
				757	static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
				758	static const char* const width_attr[] = { "width", NULL } ;
				759	static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
				760	static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
				761	static const char* const language_attr[] = { "language", NULL } ;
				762	static const char* const select_content[] = { "optgroup", "option", NULL } ;
				763	static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
				764	static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
				765	static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
				766	static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
				767	static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
				768	static const char* const tr_elt[] = { "tr", NULL } ;
				769	static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
				770	static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
				771	static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
				772	static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
				773	static const char* const tr_contents[] = { "th", "td", NULL } ;
				774	static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
				775	static const char* const li_elt[] = { "li", NULL } ;
				776	static const char* const ul_depr[] = { "type", "compact", NULL} ;
				777	static const char* const dir_attr[] = { "dir", NULL} ;
				778
				779	#define DECL (const char**)
				780
				781	static const htmlElemDesc
				782	html40ElementTable[] = {
				783	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
				784	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
				785	},
				786	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
				787	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				788	},
				789	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
				790	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				791	},
				792	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
				793	DECL inline_p , NULL , DECL html_attrs, NULL, NULL
				794	},
				795	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
				796	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
				797	},
				798	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
				799	EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
				800	},
				801	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
				802	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				803	},
				804	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
				805	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
				806	},
				807	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
				808	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
				809	},
				810	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
				811	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
				812	},
				813	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
				814	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				815	},
				816	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
				817	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
				818	},
				819	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
				820	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
				821	},
				822	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
				823	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
				824	},
				825	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
				826	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
				827	},
				828	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
				829	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				830	},
				831	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
				832	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
				833	},
				834	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
				835	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				836	},
				837	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
				838	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				839	},
				840	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
				841	EMPTY , NULL , DECL col_attrs , NULL, NULL
				842	},
				843	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
				844	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
				845	},
				846	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
				847	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
				848	},
				849	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
				850	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
				851	},
				852	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
				853	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				854	},
				855	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
				856	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
				857	},
				858	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
				859	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
				860	},
				861	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
				862	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
				863	},
				864	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
				865	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				866	},
				867	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
				868	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				869	},
				870	{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
				871	EMPTY, NULL, DECL embed_attrs, NULL, NULL
				872	},
				873	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
				874	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
				875	},
				876	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
				877	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
				878	},
				879	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
				880	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
				881	},
				882	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
				883	EMPTY, NULL, NULL, DECL frame_attrs, NULL
				884	},
				885	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
				886	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
				887	},
				888	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
				889	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				890	},
				891	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
				892	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				893	},
				894	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
				895	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				896	},
				897	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
				898	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				899	},
				900	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
				901	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				902	},
				903	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
				904	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				905	},
				906	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
				907	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
				908	},
				909	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
				910	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
				911	},
				912	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
				913	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
				914	},
				915	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
				916	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				917	},
				918	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
				919	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
				920	},
				921	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
				922	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
				923	},
				924	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
				925	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
				926	},
				927	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
				928	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
				929	},
				930	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
				931	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
				932	},
				933	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
				934	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				935	},
				936	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
				937	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
				938	},
				939	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
				940	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
				941	},
				942	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
				943	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
				944	},
				945	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
				946	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
				947	},
				948	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
				949	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
				950	},
				951	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
				952	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
				953	},
				954	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
				955	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
				956	},
				957	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
				958	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
				959	},
				960	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
				961	DECL html_flow, "div", DECL html_attrs, NULL, NULL
				962	},
				963	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
				964	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
				965	},
				966	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
				967	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
				968	},
				969	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
				970	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
				971	},
				972	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
				973	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
				974	},
				975	{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
				976	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				977	},
				978	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
				979	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
				980	},
				981	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
				982	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
				983	},
				984	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
				985	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
				986	},
				987	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
				988	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
				989	},
				990	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
				991	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				992	},
				993	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
				994	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
				995	},
				996	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
				997	DECL select_content, NULL, DECL select_attrs, NULL, NULL
				998	},
				999	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
				1000	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1001	},
				1002	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
				1003	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1004	},
				1005	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
				1006	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
				1007	},
				1008	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
				1009	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1010	},
				1011	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
				1012	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
				1013	},
				1014	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
				1015	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1016	},
				1017	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
				1018	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1019	},
				1020	{ "table", 0, 0, 0, 0, 0, 0, 0, "",
				1021	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
				1022	},
				1023	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
				1024	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
				1025	},
				1026	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
				1027	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
				1028	},
				1029	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
				1030	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
				1031	},
				1032	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
				1033	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
				1034	},
				1035	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
				1036	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
				1037	},
				1038	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
				1039	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
				1040	},
				1041	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
				1042	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
				1043	},
				1044	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
				1045	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
				1046	},
				1047	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
				1048	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1049	},
				1050	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
				1051	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
				1052	},
				1053	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
				1054	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
				1055	},
				1056	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
				1057	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1058	}
				1059	};
				1060
				1061	/*
				1062	* start tags that imply the end of current element
				1063	*/
				1064	static const char * const htmlStartClose[] = {
				1065	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				1066	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				1067	"listing", "xmp", "head", NULL,
				1068	"head", "p", NULL,
				1069	"title", "p", NULL,
				1070	"body", "head", "style", "link", "title", "p", NULL,
				1071	"frameset", "head", "style", "link", "title", "p", NULL,
				1072	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				1073	"pre", "listing", "xmp", "head", "li", NULL,
				1074	"hr", "p", "head", NULL,
				1075	"h1", "p", "head", NULL,
				1076	"h2", "p", "head", NULL,
				1077	"h3", "p", "head", NULL,
				1078	"h4", "p", "head", NULL,
				1079	"h5", "p", "head", NULL,
				1080	"h6", "p", "head", NULL,
				1081	"dir", "p", "head", NULL,
				1082	"address", "p", "head", "ul", NULL,
				1083	"pre", "p", "head", "ul", NULL,
				1084	"listing", "p", "head", NULL,
				1085	"xmp", "p", "head", NULL,
				1086	"blockquote", "p", "head", NULL,
				1087	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				1088	"xmp", "head", NULL,
				1089	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				1090	"head", "dd", NULL,
				1091	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				1092	"head", "dt", NULL,
				1093	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				1094	"listing", "xmp", NULL,
				1095	"ol", "p", "head", "ul", NULL,
				1096	"menu", "p", "head", "ul", NULL,
				1097	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
				1098	"div", "p", "head", NULL,
				1099	"noscript", "script", NULL,
				1100	"center", "font", "b", "i", "p", "head", NULL,
				1101	"a", "a", "head", NULL,
				1102	"caption", "p", NULL,
				1103	"colgroup", "caption", "colgroup", "col", "p", NULL,
				1104	"col", "caption", "col", "p", NULL,
				1105	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				1106	"listing", "xmp", "a", NULL,
				1107	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				1108	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				1109	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				1110	"thead", "caption", "col", "colgroup", NULL,
				1111	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				1112	"tbody", "p", NULL,
				1113	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				1114	"tfoot", "tbody", "p", NULL,
				1115	"optgroup", "option", NULL,
				1116	"option", "option", NULL,
				1117	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				1118	"pre", "listing", "xmp", "a", NULL,
				1119	/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
				1120	"tt", "head", NULL,
				1121	"i", "head", NULL,
				1122	"b", "head", NULL,
				1123	"u", "head", NULL,
				1124	"s", "head", NULL,
				1125	"strike", "head", NULL,
				1126	"big", "head", NULL,
				1127	"small", "head", NULL,
				1128
				1129	"em", "head", NULL,
				1130	"strong", "head", NULL,
				1131	"dfn", "head", NULL,
				1132	"code", "head", NULL,
				1133	"samp", "head", NULL,
				1134	"kbd", "head", NULL,
				1135	"var", "head", NULL,
				1136	"cite", "head", NULL,
				1137	"abbr", "head", NULL,
				1138	"acronym", "head", NULL,
				1139
				1140	/* "a" */
				1141	"img", "head", NULL,
				1142	/* "applet" */
				1143	/* "embed" */
				1144	/* "object" */
				1145	"font", "head", NULL,
				1146	/* "basefont" */
				1147	"br", "head", NULL,
				1148	/* "script" */
				1149	"map", "head", NULL,
				1150	"q", "head", NULL,
				1151	"sub", "head", NULL,
				1152	"sup", "head", NULL,
				1153	"span", "head", NULL,
				1154	"bdo", "head", NULL,
				1155	"iframe", "head", NULL,
				1156	NULL
				1157	};
				1158
				1159	/*
				1160	* The list of HTML elements which are supposed not to have
				1161	* CDATA content and where a p element will be implied
				1162	*
				1163	* TODO: extend that list by reading the HTML SGML DTD on
				1164	* implied paragraph
				1165	*/
				1166	static const char *const htmlNoContentElements[] = {
				1167	"html",
				1168	"head",
				1169	NULL
				1170	};
				1171
				1172	/*
				1173	* The list of HTML attributes which are of content %Script;
				1174	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				1175	* it assumes the name starts with 'on'
				1176	*/
				1177	static const char *const htmlScriptAttributes[] = {
				1178	"onclick",
				1179	"ondblclick",
				1180	"onmousedown",
				1181	"onmouseup",
				1182	"onmouseover",
				1183	"onmousemove",
				1184	"onmouseout",
				1185	"onkeypress",
				1186	"onkeydown",
				1187	"onkeyup",
				1188	"onload",
				1189	"onunload",
				1190	"onfocus",
				1191	"onblur",
				1192	"onsubmit",
				1193	"onreset",
				1194	"onchange",
				1195	"onselect"
				1196	};
				1197
				1198	/*
				1199	* This table is used by the htmlparser to know what to do with
				1200	* broken html pages. By assigning different priorities to different
				1201	* elements the parser can decide how to handle extra endtags.
				1202	* Endtags are only allowed to close elements with lower or equal
				1203	* priority.
				1204	*/
				1205
				1206	typedef struct {
				1207	const char *name;
				1208	int priority;
				1209	} elementPriority;
				1210
				1211	static const elementPriority htmlEndPriority[] = {
				1212	{"div", 150},
				1213	{"td", 160},
				1214	{"th", 160},
				1215	{"tr", 170},
				1216	{"thead", 180},
				1217	{"tbody", 180},
				1218	{"tfoot", 180},
				1219	{"table", 190},
				1220	{"head", 200},
				1221	{"body", 200},
				1222	{"html", 220},
				1223	{NULL, 100} /* Default priority */
				1224	};
				1225
				1226	static const char** htmlStartCloseIndex[100];
				1227	static int htmlStartCloseIndexinitialized = 0;
				1228
				1229	/************************************************************************
				1230	* *
				1231	* functions to handle HTML specific data *
				1232	* *
				1233	************************************************************************/
				1234
				1235	/**
				1236	* htmlInitAutoClose:
				1237	*
				1238	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				1239	* This is not reentrant. Call xmlInitParser() once before processing in
				1240	* case of use in multithreaded programs.
				1241	*/
				1242	void
				1243	htmlInitAutoClose(void) {
				1244	int indx, i = 0;
				1245
				1246	if (htmlStartCloseIndexinitialized) return;
				1247
				1248	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				1249	indx = 0;
				1250	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				1251	htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
				1252	while (htmlStartClose[i] != NULL) i++;
				1253	i++;
				1254	}
				1255	htmlStartCloseIndexinitialized = 1;
				1256	}
				1257
				1258	/**
				1259	* htmlTagLookup:
				1260	* @tag: The tag name in lowercase
				1261	*
				1262	* Lookup the HTML tag in the ElementTable
				1263	*
				1264	* Returns the related htmlElemDescPtr or NULL if not found.
				1265	*/
				1266	const htmlElemDesc *
				1267	htmlTagLookup(const xmlChar *tag) {
				1268	unsigned int i;
				1269
				1270	for (i = 0; i < (sizeof(html40ElementTable) /
				1271	sizeof(html40ElementTable[0]));i++) {
				1272	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
				1273	return((htmlElemDescPtr) &html40ElementTable[i]);
				1274	}
				1275	return(NULL);
				1276	}
				1277
				1278	/**
				1279	* htmlGetEndPriority:
				1280	* @name: The name of the element to look up the priority for.
				1281	*
				1282	* Return value: The "endtag" priority.
				1283	**/
				1284	static int
				1285	htmlGetEndPriority (const xmlChar *name) {
				1286	int i = 0;
				1287
				1288	while ((htmlEndPriority[i].name != NULL) &&
				1289	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				1290	i++;
				1291
				1292	return(htmlEndPriority[i].priority);
				1293	}
				1294
				1295
				1296	/**
				1297	* htmlCheckAutoClose:
				1298	* @newtag: The new tag name
				1299	* @oldtag: The old tag name
				1300	*
				1301	* Checks whether the new tag is one of the registered valid tags for
				1302	* closing old.
				1303	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				1304	*
				1305	* Returns 0 if no, 1 if yes.
				1306	*/
				1307	static int
				1308	htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
				1309	{
				1310	int i, indx;
				1311	const char **closed = NULL;
				1312
				1313	if (htmlStartCloseIndexinitialized == 0)
				1314	htmlInitAutoClose();
				1315
				1316	/* inefficient, but not a big deal */
				1317	for (indx = 0; indx < 100; indx++) {
				1318	closed = htmlStartCloseIndex[indx];
				1319	if (closed == NULL)
				1320	return (0);
				1321	if (xmlStrEqual(BAD_CAST * closed, newtag))
				1322	break;
				1323	}
				1324
				1325	i = closed - htmlStartClose;
				1326	i++;
				1327	while (htmlStartClose[i] != NULL) {
				1328	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				1329	return (1);
				1330	}
				1331	i++;
				1332	}
				1333	return (0);
				1334	}
				1335
				1336	/**
				1337	* htmlAutoCloseOnClose:
				1338	* @ctxt: an HTML parser context
				1339	* @newtag: The new tag name
				1340	* @force: force the tag closure
				1341	*
				1342	* The HTML DTD allows an ending tag to implicitly close other tags.
				1343	*/
				1344	static void
				1345	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
				1346	{
				1347	const htmlElemDesc *info;
				1348	int i, priority;
				1349
				1350	priority = htmlGetEndPriority(newtag);
				1351
				1352	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
				1353
				1354	if (xmlStrEqual(newtag, ctxt->nameTab[i]))
				1355	break;
				1356	/*
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	1357	* A misplaced endtag can only close elements with lower
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	1358	* or equal priority, so if we find an element with higher
				1359	* priority before we find an element with
				1360	* matching name, we just ignore this endtag
				1361	*/
				1362	if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
				1363	return;
				1364	}
				1365	if (i < 0)
				1366	return;
				1367
				1368	while (!xmlStrEqual(newtag, ctxt->name)) {
				1369	info = htmlTagLookup(ctxt->name);
				1370	if ((info != NULL) && (info->endTag == 3)) {
				1371	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
				1372	"Opening and ending tag mismatch: %s and %s\n",
				1373	newtag, ctxt->name);
				1374	}
				1375	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				1376	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				1377	htmlnamePop(ctxt);
				1378	}
				1379	}
				1380
				1381	/**
				1382	* htmlAutoCloseOnEnd:
				1383	* @ctxt: an HTML parser context
				1384	*
				1385	* Close all remaining tags at the end of the stream
				1386	*/
				1387	static void
				1388	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
				1389	{
				1390	int i;
				1391
				1392	if (ctxt->nameNr == 0)
				1393	return;
				1394	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
				1395	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				1396	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				1397	htmlnamePop(ctxt);
				1398	}
				1399	}
				1400
				1401	/**
				1402	* htmlAutoClose:
				1403	* @ctxt: an HTML parser context
				1404	* @newtag: The new tag name or NULL
				1405	*
				1406	* The HTML DTD allows a tag to implicitly close other tags.
				1407	* The list is kept in htmlStartClose array. This function is
				1408	* called when a new tag has been detected and generates the
				1409	* appropriates closes if possible/needed.
				1410	* If newtag is NULL this mean we are at the end of the resource
				1411	* and we should check
				1412	*/
				1413	static void
				1414	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
				1415	{
				1416	while ((newtag != NULL) && (ctxt->name != NULL) &&
				1417	(htmlCheckAutoClose(newtag, ctxt->name))) {
				1418	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				1419	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				1420	htmlnamePop(ctxt);
				1421	}
				1422	if (newtag == NULL) {
				1423	htmlAutoCloseOnEnd(ctxt);
				1424	return;
				1425	}
				1426	while ((newtag == NULL) && (ctxt->name != NULL) &&
				1427	((xmlStrEqual(ctxt->name, BAD_CAST "head")) \|\|
				1428	(xmlStrEqual(ctxt->name, BAD_CAST "body")) \|\|
				1429	(xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
				1430	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				1431	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				1432	htmlnamePop(ctxt);
				1433	}
				1434	}
				1435
				1436	/**
				1437	* htmlAutoCloseTag:
				1438	* @doc: the HTML document
				1439	* @name: The tag name
				1440	* @elem: the HTML element
				1441	*
				1442	* The HTML DTD allows a tag to implicitly close other tags.
				1443	* The list is kept in htmlStartClose array. This function checks
				1444	* if the element or one of it's children would autoclose the
				1445	* given tag.
				1446	*
				1447	* Returns 1 if autoclose, 0 otherwise
				1448	*/
				1449	int
				1450	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				1451	htmlNodePtr child;
				1452
				1453	if (elem == NULL) return(1);
				1454	if (xmlStrEqual(name, elem->name)) return(0);
				1455	if (htmlCheckAutoClose(elem->name, name)) return(1);
				1456	child = elem->children;
				1457	while (child != NULL) {
				1458	if (htmlAutoCloseTag(doc, name, child)) return(1);
				1459	child = child->next;
				1460	}
				1461	return(0);
				1462	}
				1463
				1464	/**
				1465	* htmlIsAutoClosed:
				1466	* @doc: the HTML document
				1467	* @elem: the HTML element
				1468	*
				1469	* The HTML DTD allows a tag to implicitly close other tags.
				1470	* The list is kept in htmlStartClose array. This function checks
				1471	* if a tag is autoclosed by one of it's child
				1472	*
				1473	* Returns 1 if autoclosed, 0 otherwise
				1474	*/
				1475	int
				1476	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				1477	htmlNodePtr child;
				1478
				1479	if (elem == NULL) return(1);
				1480	child = elem->children;
				1481	while (child != NULL) {
				1482	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				1483	child = child->next;
				1484	}
				1485	return(0);
				1486	}
				1487
				1488	/**
				1489	* htmlCheckImplied:
				1490	* @ctxt: an HTML parser context
				1491	* @newtag: The new tag name
				1492	*
				1493	* The HTML DTD allows a tag to exists only implicitly
				1494	* called when a new tag has been detected and generates the
				1495	* appropriates implicit tags if missing
				1496	*/
				1497	static void
				1498	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				1499	int i;
				1500
				1501	if (ctxt->options & HTML_PARSE_NOIMPLIED)
				1502	return;
				1503	if (!htmlOmittedDefaultValue)
				1504	return;
				1505	if (xmlStrEqual(newtag, BAD_CAST"html"))
				1506	return;
				1507	if (ctxt->nameNr <= 0) {
				1508	htmlnamePush(ctxt, BAD_CAST"html");
				1509	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1510	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				1511	}
				1512	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				1513	return;
				1514	if ((ctxt->nameNr <= 1) &&
				1515	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				1516	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				1517	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				1518	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				1519	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				1520	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				1521	if (ctxt->html >= 3) {
				1522	/* we already saw or generated an <head> before */
				1523	return;
				1524	}
				1525	/*
				1526	* dropped OBJECT ... i you put it first BODY will be
				1527	* assumed !
				1528	*/
				1529	htmlnamePush(ctxt, BAD_CAST"head");
				1530	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1531	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				1532	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				1533	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				1534	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				1535	if (ctxt->html >= 10) {
				1536	/* we already saw or generated a <body> before */
				1537	return;
				1538	}
				1539	for (i = 0;i < ctxt->nameNr;i++) {
				1540	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				1541	return;
				1542	}
				1543	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				1544	return;
				1545	}
				1546	}
				1547
				1548	htmlnamePush(ctxt, BAD_CAST"body");
				1549	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1550	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				1551	}
				1552	}
				1553
				1554	/**
				1555	* htmlCheckParagraph
				1556	* @ctxt: an HTML parser context
				1557	*
				1558	* Check whether a p element need to be implied before inserting
				1559	* characters in the current element.
				1560	*
				1561	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				1562	* in case of error.
				1563	*/
				1564
				1565	static int
				1566	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				1567	const xmlChar *tag;
				1568	int i;
				1569
				1570	if (ctxt == NULL)
				1571	return(-1);
				1572	tag = ctxt->name;
				1573	if (tag == NULL) {
				1574	htmlAutoClose(ctxt, BAD_CAST"p");
				1575	htmlCheckImplied(ctxt, BAD_CAST"p");
				1576	htmlnamePush(ctxt, BAD_CAST"p");
				1577	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1578	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				1579	return(1);
				1580	}
				1581	if (!htmlOmittedDefaultValue)
				1582	return(0);
				1583	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				1584	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				1585	htmlAutoClose(ctxt, BAD_CAST"p");
				1586	htmlCheckImplied(ctxt, BAD_CAST"p");
				1587	htmlnamePush(ctxt, BAD_CAST"p");
				1588	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1589	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				1590	return(1);
				1591	}
				1592	}
				1593	return(0);
				1594	}
				1595
				1596	/**
				1597	* htmlIsScriptAttribute:
				1598	* @name: an attribute name
				1599	*
				1600	* Check if an attribute is of content type Script
				1601	*
				1602	* Returns 1 is the attribute is a script 0 otherwise
				1603	*/
				1604	int
				1605	htmlIsScriptAttribute(const xmlChar *name) {
				1606	unsigned int i;
				1607
				1608	if (name == NULL)
				1609	return(0);
				1610	/*
				1611	* all script attributes start with 'on'
				1612	*/
				1613	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1614	return(0);
				1615	for (i = 0;
				1616	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1617	i++) {
				1618	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1619	return(1);
				1620	}
				1621	return(0);
				1622	}
				1623
				1624	/************************************************************************
				1625	* *
				1626	* The list of HTML predefined entities *
				1627	* *
				1628	************************************************************************/
				1629
				1630
				1631	static const htmlEntityDesc html40EntitiesTable[] = {
				1632	/*
				1633	* the 4 absolute ones, plus apostrophe.
				1634	*/
				1635	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1636	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1637	{ 39, "apos", "single quote" },
				1638	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1639	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1640
				1641	/*
				1642	* A bunch still in the 128-255 range
				1643	* Replacing them depend really on the charset used.
				1644	*/
				1645	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1646	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1647	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1648	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1649	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1650	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1651	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1652	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1653	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1654	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1655	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1656	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1657	{ 172, "not", "not sign, U+00AC ISOnum" },
				1658	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1659	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1660	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1661	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1662	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1663	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1664	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1665	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1666	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1667	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1668	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1669	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1670	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1671	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1672	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1673	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1674	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1675	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1676	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1677	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1678	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1679	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1680	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1681	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1682	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1683	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1684	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1685	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1686	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1687	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1688	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1689	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1690	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1691	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1692	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1693	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1694	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1695	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1696	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1697	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1698	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1699	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1700	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1701	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1702	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1703	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1704	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1705	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1706	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1707	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1708	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1709	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1710	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1711	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1712	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1713	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1714	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1715	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1716	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1717	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1718	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1719	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1720	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1721	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1722	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1723	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1724	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1725	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1726	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1727	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1728	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1729	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1730	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1731	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1732	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1733	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1734	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1735	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1736	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1737	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1738	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1739	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1740	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1741
				1742	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1743	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1744	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1745	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1746	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1747
				1748	/*
				1749	* Anything below should really be kept as entities references
				1750	*/
				1751	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1752
				1753	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1754	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1755
				1756	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1757	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1758	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1759	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1760	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1761	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1762	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1763	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1764	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1765	{ 922, "Kappa","greek capital letter kappa, U+039A" },
				1766	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
				1767	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1768	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1769	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1770	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1771	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1772	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1773	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1774	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1775	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1776	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1777	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1778	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1779	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1780
				1781	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1782	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1783	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1784	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1785	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1786	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1787	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1788	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1789	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1790	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1791	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1792	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1793	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1794	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1795	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1796	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1797	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1798	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1799	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1800	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1801	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1802	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1803	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1804	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1805	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1806	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1807	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1808	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1809
				1810	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1811	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1812	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1813	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1814	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1815	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1816	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1817	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1818	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1819	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1820	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1821	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1822	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1823	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1824	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1825	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1826	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1827
				1828	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1829	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1830
				1831	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1832
				1833	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1834	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1835
				1836	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1837	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1838
				1839	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1840	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1841
				1842	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1843
				1844	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1845	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1846	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1847	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1848	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1849	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1850	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1851	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1852	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1853	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1854	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1855	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1856	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1857	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1858	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1859	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1860
				1861	{ 8704, "forall","for all, U+2200 ISOtech" },
				1862	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1863	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1864	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1865	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1866	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1867	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1868	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1869	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1870	{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
				1871	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1872	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1873	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1874	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1875	{ 8734, "infin","infinity, U+221E ISOtech" },
				1876	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1877	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1878	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1879	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1880	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1881	{ 8747, "int", "integral, U+222B ISOtech" },
				1882	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1883	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1884	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1885	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1886	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1887	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1888	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1889	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1890	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1891	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1892	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1893	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1894	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1895	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1896	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1897	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1898	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1899	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1900	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1901	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1902	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1903	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1904	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1905	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1906
				1907	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1908	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1909	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1910	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1911
				1912	};
				1913
				1914	/************************************************************************
				1915	* *
				1916	* Commodity functions to handle entities *
				1917	* *
				1918	************************************************************************/
				1919
				1920	/*
				1921	* Macro used to grow the current buffer.
				1922	*/
				1923	#define growBuffer(buffer) { \
				1924	xmlChar *tmp; \
				1925	buffer##_size *= 2; \
				1926	tmp = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1927	if (tmp == NULL) { \
				1928	htmlErrMemory(ctxt, "growing buffer\n"); \
				1929	xmlFree(buffer); \
				1930	return(NULL); \
				1931	} \
				1932	buffer = tmp; \
				1933	}
				1934
				1935	/**
				1936	* htmlEntityLookup:
				1937	* @name: the entity name
				1938	*
				1939	* Lookup the given entity in EntitiesTable
				1940	*
				1941	* TODO: the linear scan is really ugly, an hash table is really needed.
				1942	*
				1943	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1944	*/
				1945	const htmlEntityDesc *
				1946	htmlEntityLookup(const xmlChar *name) {
				1947	unsigned int i;
				1948
				1949	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1950	sizeof(html40EntitiesTable[0]));i++) {
				1951	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1952	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
				1953	}
				1954	}
				1955	return(NULL);
				1956	}
				1957
				1958	/**
				1959	* htmlEntityValueLookup:
				1960	* @value: the entity's unicode value
				1961	*
				1962	* Lookup the given entity in EntitiesTable
				1963	*
				1964	* TODO: the linear scan is really ugly, an hash table is really needed.
				1965	*
				1966	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1967	*/
				1968	const htmlEntityDesc *
				1969	htmlEntityValueLookup(unsigned int value) {
				1970	unsigned int i;
				1971
				1972	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1973	sizeof(html40EntitiesTable[0]));i++) {
				1974	if (html40EntitiesTable[i].value >= value) {
				1975	if (html40EntitiesTable[i].value > value)
				1976	break;
				1977	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
				1978	}
				1979	}
				1980	return(NULL);
				1981	}
				1982
				1983	/**
				1984	* UTF8ToHtml:
				1985	* @out: a pointer to an array of bytes to store the result
				1986	* @outlen: the length of @out
				1987	* @in: a pointer to an array of UTF-8 chars
				1988	* @inlen: the length of @in
				1989	*
				1990	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1991	* plus HTML entities block of chars out.
				1992	*
				1993	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1994	* The value of @inlen after return is the number of octets consumed
				1995	* as the return value is positive, else unpredictable.
				1996	* The value of @outlen after return is the number of octets consumed.
				1997	*/
				1998	int
				1999	UTF8ToHtml(unsigned char* out, int *outlen,
				2000	const unsigned char* in, int *inlen) {
				2001	const unsigned char* processed = in;
				2002	const unsigned char* outend;
				2003	const unsigned char* outstart = out;
				2004	const unsigned char* instart = in;
				2005	const unsigned char* inend;
				2006	unsigned int c, d;
				2007	int trailing;
				2008
				2009	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL)) return(-1);
				2010	if (in == NULL) {
				2011	/*
				2012	* initialization nothing to do
				2013	*/
				2014	*outlen = 0;
				2015	*inlen = 0;
				2016	return(0);
				2017	}
				2018	inend = in + (*inlen);
				2019	outend = out + (*outlen);
				2020	while (in < inend) {
				2021	d = *in++;
				2022	if (d < 0x80) { c= d; trailing= 0; }
				2023	else if (d < 0xC0) {
				2024	/* trailing byte in leading position */
				2025	*outlen = out - outstart;
				2026	*inlen = processed - instart;
				2027	return(-2);
				2028	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				2029	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				2030	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				2031	else {
				2032	/* no chance for this in Ascii */
				2033	*outlen = out - outstart;
				2034	*inlen = processed - instart;
				2035	return(-2);
				2036	}
				2037
				2038	if (inend - in < trailing) {
				2039	break;
				2040	}
				2041
				2042	for ( ; trailing; trailing--) {
				2043	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				2044	break;
				2045	c <<= 6;
				2046	c \|= d & 0x3F;
				2047	}
				2048
				2049	/* assertion: c is a single UTF-4 value */
				2050	if (c < 0x80) {
				2051	if (out + 1 >= outend)
				2052	break;
				2053	*out++ = c;
				2054	} else {
				2055	int len;
				2056	const htmlEntityDesc * ent;
				2057	const char *cp;
				2058	char nbuf[16];
				2059
				2060	/*
				2061	* Try to lookup a predefined HTML entity for it
				2062	*/
				2063
				2064	ent = htmlEntityValueLookup(c);
				2065	if (ent == NULL) {
				2066	snprintf(nbuf, sizeof(nbuf), "#%u", c);
				2067	cp = nbuf;
				2068	}
				2069	else
				2070	cp = ent->name;
				2071	len = strlen(cp);
				2072	if (out + 2 + len >= outend)
				2073	break;
				2074	*out++ = '&';
				2075	memcpy(out, cp, len);
				2076	out += len;
				2077	*out++ = ';';
				2078	}
				2079	processed = in;
				2080	}
				2081	*outlen = out - outstart;
				2082	*inlen = processed - instart;
				2083	return(0);
				2084	}
				2085
				2086	/**
				2087	* htmlEncodeEntities:
				2088	* @out: a pointer to an array of bytes to store the result
				2089	* @outlen: the length of @out
				2090	* @in: a pointer to an array of UTF-8 chars
				2091	* @inlen: the length of @in
				2092	* @quoteChar: the quote character to escape (' or ") or zero.
				2093	*
				2094	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				2095	* plus HTML entities block of chars out.
				2096	*
				2097	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				2098	* The value of @inlen after return is the number of octets consumed
				2099	* as the return value is positive, else unpredictable.
				2100	* The value of @outlen after return is the number of octets consumed.
				2101	*/
				2102	int
				2103	htmlEncodeEntities(unsigned char* out, int *outlen,
				2104	const unsigned char* in, int *inlen, int quoteChar) {
				2105	const unsigned char* processed = in;
				2106	const unsigned char* outend;
				2107	const unsigned char* outstart = out;
				2108	const unsigned char* instart = in;
				2109	const unsigned char* inend;
				2110	unsigned int c, d;
				2111	int trailing;
				2112
				2113	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL) \|\| (in == NULL))
				2114	return(-1);
				2115	outend = out + (*outlen);
				2116	inend = in + (*inlen);
				2117	while (in < inend) {
				2118	d = *in++;
				2119	if (d < 0x80) { c= d; trailing= 0; }
				2120	else if (d < 0xC0) {
				2121	/* trailing byte in leading position */
				2122	*outlen = out - outstart;
				2123	*inlen = processed - instart;
				2124	return(-2);
				2125	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				2126	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				2127	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				2128	else {
				2129	/* no chance for this in Ascii */
				2130	*outlen = out - outstart;
				2131	*inlen = processed - instart;
				2132	return(-2);
				2133	}
				2134
				2135	if (inend - in < trailing)
				2136	break;
				2137
				2138	while (trailing--) {
				2139	if (((d= *in++) & 0xC0) != 0x80) {
				2140	*outlen = out - outstart;
				2141	*inlen = processed - instart;
				2142	return(-2);
				2143	}
				2144	c <<= 6;
				2145	c \|= d & 0x3F;
				2146	}
				2147
				2148	/* assertion: c is a single UTF-4 value */
				2149	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				2150	(c != '&') && (c != '<') && (c != '>')) {
				2151	if (out >= outend)
				2152	break;
				2153	*out++ = c;
				2154	} else {
				2155	const htmlEntityDesc * ent;
				2156	const char *cp;
				2157	char nbuf[16];
				2158	int len;
				2159
				2160	/*
				2161	* Try to lookup a predefined HTML entity for it
				2162	*/
				2163	ent = htmlEntityValueLookup(c);
				2164	if (ent == NULL) {
				2165	snprintf(nbuf, sizeof(nbuf), "#%u", c);
				2166	cp = nbuf;
				2167	}
				2168	else
				2169	cp = ent->name;
				2170	len = strlen(cp);
				2171	if (out + 2 + len > outend)
				2172	break;
				2173	*out++ = '&';
				2174	memcpy(out, cp, len);
				2175	out += len;
				2176	*out++ = ';';
				2177	}
				2178	processed = in;
				2179	}
				2180	*outlen = out - outstart;
				2181	*inlen = processed - instart;
				2182	return(0);
				2183	}
				2184
				2185	/************************************************************************
				2186	* *
				2187	* Commodity functions to handle streams *
				2188	* *
				2189	************************************************************************/
				2190
#ifdef LIBXML_PUSH_ENABLED
/**
 * htmlNewInputStream:
 * @ctxt:  an HTML parser context
 *
 * Allocate a new input stream structure and reset it: every field is
 * zero/NULL except the position, which starts at line 1, column 1.
 * An allocation failure is reported through htmlErrMemory().
 *
 * Returns the new input stream or NULL
 */
static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
    htmlParserInputPtr stream;

    stream = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
    if (stream == NULL) {
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
        return(NULL);
    }
    memset(stream, 0, sizeof(htmlParserInput));

    /* pointer members, spelled out on top of the memset */
    stream->filename = NULL;
    stream->directory = NULL;
    stream->version = NULL;
    stream->buf = NULL;
    stream->base = NULL;
    stream->cur = NULL;
    stream->free = NULL;

    /* counters and position */
    stream->length = 0;
    stream->consumed = 0;
    stream->line = 1;
    stream->col = 1;

    return(stream);
}
#endif
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2224
				2225
				2226	/************************************************************************
				2227	* *
				2228	* Commodity functions, cleanup needed ? *
				2229	* *
				2230	************************************************************************/
				2231	/*
				2232	* all tags allowing pc data from the html 4.01 loose dtd
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2233	* NOTE: it might be more appropriate to integrate this information
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2234	* into the html40ElementTable array but I don't want to risk any
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2235	* binary incompatibility
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2236	*/
				2237	static const char *allowPCData[] = {
				2238	"a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
				2239	"blockquote", "body", "button", "caption", "center", "cite", "code",
				2240	"dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
				2241	"h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
				2242	"li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
				2243	"small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
				2244	};
				2245
				2246	/**
				2247	* areBlanks:
				2248	* @ctxt: an HTML parser context
				2249	* @str: a xmlChar *
				2250	* @len: the size of @str
				2251	*
				2252	* Is this a sequence of blank chars that one can ignore ?
				2253	*
				2254	* Returns 1 if ignorable 0 otherwise.
				2255	*/
				2256
				2257	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				2258	unsigned int i;
				2259	int j;
				2260	xmlNodePtr lastChild;
				2261	xmlDtdPtr dtd;
				2262
				2263	for (j = 0;j < len;j++)
				2264	if (!(IS_BLANK_CH(str[j]))) return(0);
				2265
				2266	if (CUR == 0) return(1);
				2267	if (CUR != '<') return(0);
				2268	if (ctxt->name == NULL)
				2269	return(1);
				2270	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				2271	return(1);
				2272	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				2273	return(1);
				2274
				2275	/* Only strip CDATA children of the body tag for strict HTML DTDs */
				2276	if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
				2277	dtd = xmlGetIntSubset(ctxt->myDoc);
				2278	if (dtd != NULL && dtd->ExternalID != NULL) {
				2279	if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") \|\|
				2280	!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
				2281	return(1);
				2282	}
				2283	}
				2284
				2285	if (ctxt->node == NULL) return(0);
				2286	lastChild = xmlGetLastChild(ctxt->node);
				2287	while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
				2288	lastChild = lastChild->prev;
				2289	if (lastChild == NULL) {
				2290	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
				2291	(ctxt->node->content != NULL)) return(0);
				2292	/* keep ws in constructs like ...<b> </b>...
				2293	for all tags "b" allowing PCDATA */
				2294	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
				2295	if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
				2296	return(0);
				2297	}
				2298	}
				2299	} else if (xmlNodeIsText(lastChild)) {
				2300	return(0);
				2301	} else {
				2302	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
				2303	for all tags "p" allowing PCDATA */
				2304	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
				2305	if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
				2306	return(0);
				2307	}
				2308	}
				2309	}
				2310	return(1);
				2311	}
				2312
				2313	/**
				2314	* htmlNewDocNoDtD:
				2315	* @URI: URI for the dtd, or NULL
				2316	* @ExternalID: the external ID of the DTD, or NULL
				2317	*
				2318	* Creates a new HTML document without a DTD node if @URI and @ExternalID
				2319	* are NULL
				2320	*
				2321	* Returns a new document, do not initialize the DTD if not provided
				2322	*/
				2323	htmlDocPtr
				2324	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				2325	xmlDocPtr cur;
				2326
				2327	/*
				2328	* Allocate a new document and fill the fields.
				2329	*/
				2330	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				2331	if (cur == NULL) {
				2332	htmlErrMemory(NULL, "HTML document creation failed\n");
				2333	return(NULL);
				2334	}
				2335	memset(cur, 0, sizeof(xmlDoc));
				2336
				2337	cur->type = XML_HTML_DOCUMENT_NODE;
				2338	cur->version = NULL;
				2339	cur->intSubset = NULL;
				2340	cur->doc = cur;
				2341	cur->name = NULL;
				2342	cur->children = NULL;
				2343	cur->extSubset = NULL;
				2344	cur->oldNs = NULL;
				2345	cur->encoding = NULL;
				2346	cur->standalone = 1;
				2347	cur->compression = 0;
				2348	cur->ids = NULL;
				2349	cur->refs = NULL;
				2350	cur->_private = NULL;
				2351	cur->charset = XML_CHAR_ENCODING_UTF8;
				2352	cur->properties = XML_DOC_HTML \| XML_DOC_USERBUILT;
				2353	if ((ExternalID != NULL) \|\|
				2354	(URI != NULL))
				2355	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
				2356	return(cur);
				2357	}
				2358
				2359	/**
				2360	* htmlNewDoc:
				2361	* @URI: URI for the dtd, or NULL
				2362	* @ExternalID: the external ID of the DTD, or NULL
				2363	*
				2364	* Creates a new HTML document
				2365	*
				2366	* Returns a new document
				2367	*/
				2368	htmlDocPtr
				2369	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				2370	if ((URI == NULL) && (ExternalID == NULL))
				2371	return(htmlNewDocNoDtD(
				2372	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				2373	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
				2374
				2375	return(htmlNewDocNoDtD(URI, ExternalID));
				2376	}
				2377
				2378
				2379	/************************************************************************
				2380	* *
				2381	* The parser itself *
				2382	* Relates to http://www.w3.org/TR/html40 *
				2383	* *
				2384	************************************************************************/
				2385
				2386	/************************************************************************
				2387	* *
				2388	* The parser itself *
				2389	* *
				2390	************************************************************************/
				2391
				2392	static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
				2393
				2394	/**
				2395	* htmlParseHTMLName:
				2396	* @ctxt: an HTML parser context
				2397	*
				2398	* parse an HTML tag or attribute name, note that we convert it to lowercase
				2399	* since HTML names are not case-sensitive.
				2400	*
				2401	* Returns the Tag Name parsed or NULL
				2402	*/
				2403
				2404	static const xmlChar *
				2405	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				2406	int i = 0;
				2407	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				2408
				2409	if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
				2410	(CUR != ':') && (CUR != '.')) return(NULL);
				2411
				2412	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				2413	((IS_ASCII_LETTER(CUR)) \|\| (IS_ASCII_DIGIT(CUR)) \|\|
				2414	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_') \|\|
				2415	(CUR == '.'))) {
				2416	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				2417	else loc[i] = CUR;
				2418	i++;
				2419
				2420	NEXT;
				2421	}
				2422
				2423	return(xmlDictLookup(ctxt->dict, loc, i));
				2424	}
				2425
				2426
				2427	/**
				2428	* htmlParseHTMLName_nonInvasive:
				2429	* @ctxt: an HTML parser context
				2430	*
				2431	* parse an HTML tag or attribute name, note that we convert it to lowercase
				2432	* since HTML names are not case-sensitive, this doesn't consume the data
				2433	* from the stream, it's a look-ahead
				2434	*
				2435	* Returns the Tag Name parsed or NULL
				2436	*/
				2437
				2438	static const xmlChar *
				2439	htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
				2440	int i = 0;
				2441	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				2442
				2443	if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
				2444	(NXT(1) != ':')) return(NULL);
				2445
				2446	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				2447	((IS_ASCII_LETTER(NXT(1+i))) \|\| (IS_ASCII_DIGIT(NXT(1+i))) \|\|
				2448	(NXT(1+i) == ':') \|\| (NXT(1+i) == '-') \|\| (NXT(1+i) == '_'))) {
				2449	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
				2450	else loc[i] = NXT(1+i);
				2451	i++;
				2452	}
				2453
				2454	return(xmlDictLookup(ctxt->dict, loc, i));
				2455	}
				2456
				2457
				2458	/**
				2459	* htmlParseName:
				2460	* @ctxt: an HTML parser context
				2461	*
				2462	* parse an HTML name, this routine is case sensitive.
				2463	*
				2464	* Returns the Name parsed or NULL
				2465	*/
				2466
				2467	static const xmlChar *
				2468	htmlParseName(htmlParserCtxtPtr ctxt) {
				2469	const xmlChar *in;
				2470	const xmlChar *ret;
				2471	int count = 0;
				2472
				2473	GROW;
				2474
				2475	/*
				2476	* Accelerator for simple ASCII names
				2477	*/
				2478	in = ctxt->input->cur;
				2479	if (((in >= 0x61) && (in <= 0x7A)) \|\|
				2480	((in >= 0x41) && (in <= 0x5A)) \|\|
				2481	(in == '_') \|\| (in == ':')) {
				2482	in++;
				2483	while (((in >= 0x61) && (in <= 0x7A)) \|\|
				2484	((in >= 0x41) && (in <= 0x5A)) \|\|
				2485	((in >= 0x30) && (in <= 0x39)) \|\|
				2486	(in == '_') \|\| (in == '-') \|\|
				2487	(in == ':') \|\| (in == '.'))
				2488	in++;
				2489
				2490	if (in == ctxt->input->end)
				2491	return(NULL);
				2492
				2493	if ((in > 0) && (in < 0x80)) {
				2494	count = in - ctxt->input->cur;
				2495	ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
				2496	ctxt->input->cur = in;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2497	ctxt->input->col += count;
				2498	return(ret);
				2499	}
				2500	}
				2501	return(htmlParseNameComplex(ctxt));
				2502	}
				2503
				2504	static const xmlChar *
				2505	htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
				2506	int len = 0, l;
				2507	int c;
				2508	int count = 0;
				2509	const xmlChar *base = ctxt->input->base;
				2510
				2511	/*
				2512	* Handler for more complex cases
				2513	*/
				2514	GROW;
				2515	c = CUR_CHAR(l);
				2516	if ((c == ' ') \|\| (c == '>') \|\| (c == '/') \|\| /* accelerators */
				2517	(!IS_LETTER(c) && (c != '_') &&
				2518	(c != ':'))) {
				2519	return(NULL);
				2520	}
				2521
				2522	while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
				2523	((IS_LETTER(c)) \|\| (IS_DIGIT(c)) \|\|
				2524	(c == '.') \|\| (c == '-') \|\|
				2525	(c == '_') \|\| (c == ':') \|\|
				2526	(IS_COMBINING(c)) \|\|
				2527	(IS_EXTENDER(c)))) {
				2528	if (count++ > 100) {
				2529	count = 0;
				2530	GROW;
				2531	}
				2532	len += l;
				2533	NEXTL(l);
				2534	c = CUR_CHAR(l);
				2535	if (ctxt->input->base != base) {
				2536	/*
				2537	* We changed encoding from an unknown encoding
				2538	* Input buffer changed location, so we better start again
				2539	*/
				2540	return(htmlParseNameComplex(ctxt));
				2541	}
				2542	}
				2543
				2544	if (ctxt->input->cur - ctxt->input->base < len) {
				2545	/* Sanity check */
				2546	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				2547	"unexpected change of input buffer", NULL, NULL);
				2548	return (NULL);
				2549	}
				2550
				2551	return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
				2552	}
				2553
				2554
				2555	/**
				2556	* htmlParseHTMLAttribute:
				2557	* @ctxt: an HTML parser context
				2558	* @stop: a char stop value
				2559	*
				2560	* parse an HTML attribute value till the stop (quote), if
				2561	* stop is 0 then it stops at the first space
				2562	*
				2563	* Returns the attribute parsed or NULL
				2564	*/
				2565
				2566	static xmlChar *
				2567	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				2568	xmlChar *buffer = NULL;
				2569	int buffer_size = 0;
				2570	xmlChar *out = NULL;
				2571	const xmlChar *name = NULL;
				2572	const xmlChar *cur = NULL;
				2573	const htmlEntityDesc * ent;
				2574
				2575	/*
				2576	* allocate a translation buffer.
				2577	*/
				2578	buffer_size = HTML_PARSER_BUFFER_SIZE;
				2579	buffer = (xmlChar ) xmlMallocAtomic(buffer_size sizeof(xmlChar));
				2580	if (buffer == NULL) {
				2581	htmlErrMemory(ctxt, "buffer allocation failed\n");
				2582	return(NULL);
				2583	}
				2584	out = buffer;
				2585
				2586	/*
				2587	* Ok loop until we reach one of the ending chars
				2588	*/
				2589	while ((CUR != 0) && (CUR != stop)) {
				2590	if ((stop == 0) && (CUR == '>')) break;
				2591	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
				2592	if (CUR == '&') {
				2593	if (NXT(1) == '#') {
				2594	unsigned int c;
				2595	int bits;
				2596
				2597	c = htmlParseCharRef(ctxt);
				2598	if (c < 0x80)
				2599	{ *out++ = c; bits= -6; }
				2600	else if (c < 0x800)
				2601	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2602	else if (c < 0x10000)
				2603	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2604	else
				2605	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2606
				2607	for ( ; bits >= 0; bits-= 6) {
				2608	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2609	}
				2610
				2611	if (out - buffer > buffer_size - 100) {
				2612	int indx = out - buffer;
				2613
				2614	growBuffer(buffer);
				2615	out = &buffer[indx];
				2616	}
				2617	} else {
				2618	ent = htmlParseEntityRef(ctxt, &name);
				2619	if (name == NULL) {
				2620	*out++ = '&';
				2621	if (out - buffer > buffer_size - 100) {
				2622	int indx = out - buffer;
				2623
				2624	growBuffer(buffer);
				2625	out = &buffer[indx];
				2626	}
				2627	} else if (ent == NULL) {
				2628	*out++ = '&';
				2629	cur = name;
				2630	while (*cur != 0) {
				2631	if (out - buffer > buffer_size - 100) {
				2632	int indx = out - buffer;
				2633
				2634	growBuffer(buffer);
				2635	out = &buffer[indx];
				2636	}
				2637	out++ = cur++;
				2638	}
				2639	} else {
				2640	unsigned int c;
				2641	int bits;
				2642
				2643	if (out - buffer > buffer_size - 100) {
				2644	int indx = out - buffer;
				2645
				2646	growBuffer(buffer);
				2647	out = &buffer[indx];
				2648	}
				2649	c = ent->value;
				2650	if (c < 0x80)
				2651	{ *out++ = c; bits= -6; }
				2652	else if (c < 0x800)
				2653	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2654	else if (c < 0x10000)
				2655	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2656	else
				2657	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2658
				2659	for ( ; bits >= 0; bits-= 6) {
				2660	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2661	}
				2662	}
				2663	}
				2664	} else {
				2665	unsigned int c;
				2666	int bits, l;
				2667
				2668	if (out - buffer > buffer_size - 100) {
				2669	int indx = out - buffer;
				2670
				2671	growBuffer(buffer);
				2672	out = &buffer[indx];
				2673	}
				2674	c = CUR_CHAR(l);
				2675	if (c < 0x80)
				2676	{ *out++ = c; bits= -6; }
				2677	else if (c < 0x800)
				2678	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2679	else if (c < 0x10000)
				2680	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2681	else
				2682	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2683
				2684	for ( ; bits >= 0; bits-= 6) {
				2685	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2686	}
				2687	NEXT;
				2688	}
				2689	}
				2690	*out = 0;
				2691	return(buffer);
				2692	}
				2693
				2694	/**
				2695	* htmlParseEntityRef:
				2696	* @ctxt: an HTML parser context
				2697	* @str: location to store the entity name
				2698	*
				2699	* parse an HTML ENTITY references
				2700	*
				2701	* [68] EntityRef ::= '&' Name ';'
				2702	*
				2703	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2704	* if non-NULL *str will have to be freed by the caller.
				2705	*/
				2706	const htmlEntityDesc *
				2707	htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
				2708	const xmlChar *name;
				2709	const htmlEntityDesc * ent = NULL;
				2710
				2711	if (str != NULL) *str = NULL;
				2712	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) return(NULL);
				2713
				2714	if (CUR == '&') {
				2715	NEXT;
				2716	name = htmlParseName(ctxt);
				2717	if (name == NULL) {
				2718	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				2719	"htmlParseEntityRef: no name\n", NULL, NULL);
				2720	} else {
				2721	GROW;
				2722	if (CUR == ';') {
				2723	if (str != NULL)
				2724	*str = name;
				2725
				2726	/*
				2727	* Lookup the entity in the table.
				2728	*/
				2729	ent = htmlEntityLookup(name);
				2730	if (ent != NULL) /* OK that's ugly !!! */
				2731	NEXT;
				2732	} else {
				2733	htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
				2734	"htmlParseEntityRef: expecting ';'\n",
				2735	NULL, NULL);
				2736	if (str != NULL)
				2737	*str = name;
				2738	}
				2739	}
				2740	}
				2741	return(ent);
				2742	}
				2743
				2744	/**
				2745	* htmlParseAttValue:
				2746	* @ctxt: an HTML parser context
				2747	*
				2748	* parse a value for an attribute
				2749	* Note: the parser won't do substitution of entities here, this
				2750	* will be handled later in xmlStringGetNodeList, unless it was
				2751	* asked for ctxt->replaceEntities != 0
				2752	*
				2753	* Returns the AttValue parsed or NULL.
				2754	*/
				2755
				2756	static xmlChar *
				2757	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2758	xmlChar *ret = NULL;
				2759
				2760	if (CUR == '"') {
				2761	NEXT;
				2762	ret = htmlParseHTMLAttribute(ctxt, '"');
				2763	if (CUR != '"') {
				2764	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
				2765	"AttValue: \" expected\n", NULL, NULL);
				2766	} else
				2767	NEXT;
				2768	} else if (CUR == '\'') {
				2769	NEXT;
				2770	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2771	if (CUR != '\'') {
				2772	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
				2773	"AttValue: ' expected\n", NULL, NULL);
				2774	} else
				2775	NEXT;
				2776	} else {
				2777	/*
				2778	* That's an HTMLism, the attribute value may not be quoted
				2779	*/
				2780	ret = htmlParseHTMLAttribute(ctxt, 0);
				2781	if (ret == NULL) {
				2782	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
				2783	"AttValue: no value found\n", NULL, NULL);
				2784	}
				2785	}
				2786	return(ret);
				2787	}
				2788
				2789	/**
				2790	* htmlParseSystemLiteral:
				2791	* @ctxt: an HTML parser context
				2792	*
				2793	* parse an HTML Literal
				2794	*
				2795	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2796	*
				2797	* Returns the SystemLiteral parsed or NULL
				2798	*/
				2799
				2800	static xmlChar *
				2801	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2802	size_t len = 0, startPosition = 0;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2803	int err = 0;
				2804	int quote;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2805	xmlChar *ret = NULL;
				2806
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2807	if ((CUR != '"') && (CUR != '\'')) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2808	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2809	"SystemLiteral \" or ' expected\n", NULL, NULL);
				2810	return(NULL);
				2811	}
				2812	quote = CUR;
				2813	NEXT;
				2814
				2815	if (CUR_PTR < BASE_PTR)
				2816	return(ret);
				2817	startPosition = CUR_PTR - BASE_PTR;
				2818
				2819	while ((CUR != 0) && (CUR != quote)) {
				2820	/* TODO: Handle UTF-8 */
				2821	if (!IS_CHAR_CH(CUR)) {
				2822	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				2823	"Invalid char in SystemLiteral 0x%X\n", CUR);
				2824	err = 1;
				2825	}
				2826	NEXT;
				2827	len++;
				2828	}
				2829	if (CUR != quote) {
				2830	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
				2831	"Unfinished SystemLiteral\n", NULL, NULL);
				2832	} else {
				2833	NEXT;
				2834	if (err == 0)
				2835	ret = xmlStrndup((BASE_PTR+startPosition), len);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2836	}
				2837
				2838	return(ret);
				2839	}
				2840
				2841	/**
				2842	* htmlParsePubidLiteral:
				2843	* @ctxt: an HTML parser context
				2844	*
				2845	* parse an HTML public literal
				2846	*
				2847	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2848	*
				2849	* Returns the PubidLiteral parsed or NULL.
				2850	*/
				2851
				2852	static xmlChar *
				2853	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2854	size_t len = 0, startPosition = 0;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2855	int err = 0;
				2856	int quote;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2857	xmlChar *ret = NULL;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2858
				2859	if ((CUR != '"') && (CUR != '\'')) {
				2860	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
				2861	"PubidLiteral \" or ' expected\n", NULL, NULL);
				2862	return(NULL);
				2863	}
				2864	quote = CUR;
				2865	NEXT;
				2866
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2867	/*
				2868	* Name ::= (Letter \| '_') (NameChar)*
				2869	*/
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2870	if (CUR_PTR < BASE_PTR)
				2871	return(ret);
				2872	startPosition = CUR_PTR - BASE_PTR;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2873
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2874	while ((CUR != 0) && (CUR != quote)) {
				2875	if (!IS_PUBIDCHAR_CH(CUR)) {
				2876	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				2877	"Invalid char in PubidLiteral 0x%X\n", CUR);
				2878	err = 1;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2879	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2880	len++;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2881	NEXT;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2882	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2883
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2884	if (CUR != '"') {
				2885	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
				2886	"Unfinished PubidLiteral\n", NULL, NULL);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2887	} else {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2888	NEXT;
				2889	if (err == 0)
				2890	ret = xmlStrndup((BASE_PTR + startPosition), len);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2891	}
				2892
				2893	return(ret);
				2894	}
				2895
				2896	/**
				2897	* htmlParseScript:
				2898	* @ctxt: an HTML parser context
				2899	*
				2900	* parse the content of an HTML SCRIPT or STYLE element
				2901	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2902	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2903	* http://www.w3.org/TR/html4/types.html#type-script
				2904	* http://www.w3.org/TR/html4/types.html#h-6.15
				2905	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2906	*
				2907	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2908	* element and the value of intrinsic event attributes. User agents must
				2909	* not evaluate script data as HTML markup but instead must pass it on as
				2910	* data to a script engine.
				2911	* NOTES:
				2912	* - The content is passed like CDATA
				2913	* - the attributes for style and scripting "onXXX" are also described
				2914	* as CDATA but SGML allows entities references in attributes so their
				2915	* processing is identical as other attributes
				2916	*/
				2917	static void
				2918	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2919	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2920	int nbchar = 0;
				2921	int cur,l;
				2922
				2923	SHRINK;
				2924	cur = CUR_CHAR(l);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2925	while (cur != 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2926	if ((cur == '<') && (NXT(1) == '/')) {
				2927	/*
				2928	* One should break here, the specification is clear:
				2929	* Authors should therefore escape "</" within the content.
				2930	* Escape mechanisms are specific to each scripting or
				2931	* style sheet language.
				2932	*
				2933	* In recovery mode, only break if end tag match the
				2934	* current tag, effectively ignoring all tags inside the
				2935	* script/style block and treating the entire block as
				2936	* CDATA.
				2937	*/
				2938	if (ctxt->recovery) {
				2939	if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
				2940	xmlStrlen(ctxt->name)) == 0)
				2941	{
				2942	break; /* while */
				2943	} else {
				2944	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
				2945	"Element %s embeds close tag\n",
				2946	ctxt->name, NULL);
				2947	}
				2948	} else {
				2949	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2950	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2951	{
				2952	break; /* while */
				2953	}
				2954	}
				2955	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2956	if (IS_CHAR(cur)) {
				2957	COPY_BUF(l,buf,nbchar,cur);
				2958	} else {
				2959	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				2960	"Invalid char in CDATA 0x%X\n", cur);
				2961	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2962	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2963	buf[nbchar] = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2964	if (ctxt->sax->cdataBlock!= NULL) {
				2965	/*
				2966	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2967	*/
				2968	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2969	} else if (ctxt->sax->characters != NULL) {
				2970	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2971	}
				2972	nbchar = 0;
				2973	}
				2974	GROW;
				2975	NEXTL(l);
				2976	cur = CUR_CHAR(l);
				2977	}
				2978
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2979	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	2980	buf[nbchar] = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2981	if (ctxt->sax->cdataBlock!= NULL) {
				2982	/*
				2983	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2984	*/
				2985	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2986	} else if (ctxt->sax->characters != NULL) {
				2987	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2988	}
				2989	}
				2990	}
				2991
				2992
				2993	/**
				2994	* htmlParseCharDataInternal:
				2995	* @ctxt: an HTML parser context
				2996	* @readahead: optional read ahead character in ascii range
				2997	*
				2998	* parse a CharData section.
				2999	* if we are within a CDATA section ']]>' marks an end of section.
				3000	*
				3001	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				3002	*/
				3003
				3004	static void
				3005	htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
				3006	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
				3007	int nbchar = 0;
				3008	int cur, l;
				3009	int chunk = 0;
				3010
				3011	if (readahead)
				3012	buf[nbchar++] = readahead;
				3013
				3014	SHRINK;
				3015	cur = CUR_CHAR(l);
				3016	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				3017	((cur != '&') \|\| (ctxt->token == '&')) &&
				3018	(cur != 0)) {
				3019	if (!(IS_CHAR(cur))) {
				3020	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				3021	"Invalid char in CDATA 0x%X\n", cur);
				3022	} else {
				3023	COPY_BUF(l,buf,nbchar,cur);
				3024	}
				3025	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3026	buf[nbchar] = 0;
				3027
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3028	/*
				3029	* Ok the segment is to be consumed as chars.
				3030	*/
				3031	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				3032	if (areBlanks(ctxt, buf, nbchar)) {
				3033	if (ctxt->keepBlanks) {
				3034	if (ctxt->sax->characters != NULL)
				3035	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3036	} else {
				3037	if (ctxt->sax->ignorableWhitespace != NULL)
				3038	ctxt->sax->ignorableWhitespace(ctxt->userData,
				3039	buf, nbchar);
				3040	}
				3041	} else {
				3042	htmlCheckParagraph(ctxt);
				3043	if (ctxt->sax->characters != NULL)
				3044	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3045	}
				3046	}
				3047	nbchar = 0;
				3048	}
				3049	NEXTL(l);
				3050	chunk++;
				3051	if (chunk > HTML_PARSER_BUFFER_SIZE) {
				3052	chunk = 0;
				3053	SHRINK;
				3054	GROW;
				3055	}
				3056	cur = CUR_CHAR(l);
				3057	if (cur == 0) {
				3058	SHRINK;
				3059	GROW;
				3060	cur = CUR_CHAR(l);
				3061	}
				3062	}
				3063	if (nbchar != 0) {
				3064	buf[nbchar] = 0;
				3065
				3066	/*
				3067	* Ok the segment is to be consumed as chars.
				3068	*/
				3069	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				3070	if (areBlanks(ctxt, buf, nbchar)) {
				3071	if (ctxt->keepBlanks) {
				3072	if (ctxt->sax->characters != NULL)
				3073	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3074	} else {
				3075	if (ctxt->sax->ignorableWhitespace != NULL)
				3076	ctxt->sax->ignorableWhitespace(ctxt->userData,
				3077	buf, nbchar);
				3078	}
				3079	} else {
				3080	htmlCheckParagraph(ctxt);
				3081	if (ctxt->sax->characters != NULL)
				3082	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3083	}
				3084	}
				3085	} else {
				3086	/*
				3087	* Loop detection
				3088	*/
				3089	if (cur == 0)
				3090	ctxt->instate = XML_PARSER_EOF;
				3091	}
				3092	}
				3093
				3094	/**
				3095	* htmlParseCharData:
				3096	* @ctxt: an HTML parser context
				3097	*
				3098	* parse a CharData section.
				3099	* if we are within a CDATA section ']]>' marks an end of section.
				3100	*
				3101	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				3102	*/
				3103
				3104	static void
				3105	htmlParseCharData(htmlParserCtxtPtr ctxt) {
				3106	htmlParseCharDataInternal(ctxt, 0);
				3107	}
				3108
				3109	/**
				3110	* htmlParseExternalID:
				3111	* @ctxt: an HTML parser context
				3112	* @publicID: a xmlChar** receiving PubidLiteral
				3113	*
				3114	* Parse an External ID or a Public ID
				3115	*
				3116	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				3117	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				3118	*
				3119	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				3120	*
				3121	* Returns the function returns SystemLiteral and in the second
				3122	* case publicID receives PubidLiteral, is strict is off
				3123	* it is possible to return NULL and have publicID set.
				3124	*/
				3125
				3126	static xmlChar *
				3127	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
				3128	xmlChar *URI = NULL;
				3129
				3130	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				3131	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				3132	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				3133	SKIP(6);
				3134	if (!IS_BLANK_CH(CUR)) {
				3135	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
				3136	"Space required after 'SYSTEM'\n", NULL, NULL);
				3137	}
				3138	SKIP_BLANKS;
				3139	URI = htmlParseSystemLiteral(ctxt);
				3140	if (URI == NULL) {
				3141	htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
				3142	"htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
				3143	}
				3144	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				3145	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				3146	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				3147	SKIP(6);
				3148	if (!IS_BLANK_CH(CUR)) {
				3149	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
				3150	"Space required after 'PUBLIC'\n", NULL, NULL);
				3151	}
				3152	SKIP_BLANKS;
				3153	*publicID = htmlParsePubidLiteral(ctxt);
				3154	if (*publicID == NULL) {
				3155	htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
				3156	"htmlParseExternalID: PUBLIC, no Public Identifier\n",
				3157	NULL, NULL);
				3158	}
				3159	SKIP_BLANKS;
				3160	if ((CUR == '"') \|\| (CUR == '\'')) {
				3161	URI = htmlParseSystemLiteral(ctxt);
				3162	}
				3163	}
				3164	return(URI);
				3165	}
				3166
				3167	/**
 * htmlParsePI:
 * @ctxt:  an HTML parser context
				3170	*
				3171	* parse an XML Processing Instruction.
				3172	*
				3173	* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
				3174	*/
				3175	static void
				3176	htmlParsePI(htmlParserCtxtPtr ctxt) {
				3177	xmlChar *buf = NULL;
				3178	int len = 0;
				3179	int size = HTML_PARSER_BUFFER_SIZE;
				3180	int cur, l;
				3181	const xmlChar *target;
				3182	xmlParserInputState state;
				3183	int count = 0;
				3184
				3185	if ((RAW == '<') && (NXT(1) == '?')) {
				3186	state = ctxt->instate;
				3187	ctxt->instate = XML_PARSER_PI;
				3188	/*
				3189	* this is a Processing Instruction.
				3190	*/
				3191	SKIP(2);
				3192	SHRINK;
				3193
				3194	/*
				3195	* Parse the target name and check for special support like
				3196	* namespace.
				3197	*/
				3198	target = htmlParseName(ctxt);
				3199	if (target != NULL) {
				3200	if (RAW == '>') {
				3201	SKIP(1);
				3202
				3203	/*
				3204	* SAX: PI detected.
				3205	*/
				3206	if ((ctxt->sax) && (!ctxt->disableSAX) &&
				3207	(ctxt->sax->processingInstruction != NULL))
				3208	ctxt->sax->processingInstruction(ctxt->userData,
				3209	target, NULL);
				3210	ctxt->instate = state;
				3211	return;
				3212	}
				3213	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
				3214	if (buf == NULL) {
				3215	htmlErrMemory(ctxt, NULL);
				3216	ctxt->instate = state;
				3217	return;
				3218	}
				3219	cur = CUR;
				3220	if (!IS_BLANK(cur)) {
				3221	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
				3222	"ParsePI: PI %s space expected\n", target, NULL);
				3223	}
				3224	SKIP_BLANKS;
				3225	cur = CUR_CHAR(l);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3226	while ((cur != 0) && (cur != '>')) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3227	if (len + 5 >= size) {
				3228	xmlChar *tmp;
				3229
				3230	size *= 2;
				3231	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				3232	if (tmp == NULL) {
				3233	htmlErrMemory(ctxt, NULL);
				3234	xmlFree(buf);
				3235	ctxt->instate = state;
				3236	return;
				3237	}
				3238	buf = tmp;
				3239	}
				3240	count++;
				3241	if (count > 50) {
				3242	GROW;
				3243	count = 0;
				3244	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3245	if (IS_CHAR(cur)) {
				3246	COPY_BUF(l,buf,len,cur);
				3247	} else {
				3248	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				3249	"Invalid char in processing instruction "
				3250	"0x%X\n", cur);
				3251	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3252	NEXTL(l);
				3253	cur = CUR_CHAR(l);
				3254	if (cur == 0) {
				3255	SHRINK;
				3256	GROW;
				3257	cur = CUR_CHAR(l);
				3258	}
				3259	}
				3260	buf[len] = 0;
				3261	if (cur != '>') {
				3262	htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
				3263	"ParsePI: PI %s never end ...\n", target, NULL);
				3264	} else {
				3265	SKIP(1);
				3266
				3267	/*
				3268	* SAX: PI detected.
				3269	*/
				3270	if ((ctxt->sax) && (!ctxt->disableSAX) &&
				3271	(ctxt->sax->processingInstruction != NULL))
				3272	ctxt->sax->processingInstruction(ctxt->userData,
				3273	target, buf);
				3274	}
				3275	xmlFree(buf);
				3276	} else {
				3277	htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
				3278	"PI is not started correctly", NULL, NULL);
				3279	}
				3280	ctxt->instate = state;
				3281	}
				3282	}
				3283
				3284	/**
				3285	* htmlParseComment:
				3286	* @ctxt: an HTML parser context
				3287	*
				3288	* Parse an XML (SGML) comment <!-- .... -->
				3289	*
				3290	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				3291	*/
				3292	static void
				3293	htmlParseComment(htmlParserCtxtPtr ctxt) {
				3294	xmlChar *buf = NULL;
				3295	int len;
				3296	int size = HTML_PARSER_BUFFER_SIZE;
				3297	int q, ql;
				3298	int r, rl;
				3299	int cur, l;
				3300	xmlParserInputState state;
				3301
				3302	/*
				3303	* Check that there is a comment right here.
				3304	*/
				3305	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				3306	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				3307
				3308	state = ctxt->instate;
				3309	ctxt->instate = XML_PARSER_COMMENT;
				3310	SHRINK;
				3311	SKIP(4);
				3312	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
				3313	if (buf == NULL) {
				3314	htmlErrMemory(ctxt, "buffer allocation failed\n");
				3315	ctxt->instate = state;
				3316	return;
				3317	}
				3318	len = 0;
				3319	buf[len] = 0;
				3320	q = CUR_CHAR(ql);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3321	if (q == 0)
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3322	goto unfinished;
				3323	NEXTL(ql);
				3324	r = CUR_CHAR(rl);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3325	if (r == 0)
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3326	goto unfinished;
				3327	NEXTL(rl);
				3328	cur = CUR_CHAR(l);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3329	while ((cur != 0) &&
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3330	((cur != '>') \|\|
				3331	(r != '-') \|\| (q != '-'))) {
				3332	if (len + 5 >= size) {
				3333	xmlChar *tmp;
				3334
				3335	size *= 2;
				3336	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				3337	if (tmp == NULL) {
				3338	xmlFree(buf);
				3339	htmlErrMemory(ctxt, "growing buffer failed\n");
				3340	ctxt->instate = state;
				3341	return;
				3342	}
				3343	buf = tmp;
				3344	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3345	if (IS_CHAR(q)) {
				3346	COPY_BUF(ql,buf,len,q);
				3347	} else {
				3348	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				3349	"Invalid char in comment 0x%X\n", q);
				3350	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3351	q = r;
				3352	ql = rl;
				3353	r = cur;
				3354	rl = l;
				3355	NEXTL(l);
				3356	cur = CUR_CHAR(l);
				3357	if (cur == 0) {
				3358	SHRINK;
				3359	GROW;
				3360	cur = CUR_CHAR(l);
				3361	}
				3362	}
				3363	buf[len] = 0;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3364	if (cur == '>') {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3365	NEXT;
				3366	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				3367	(!ctxt->disableSAX))
				3368	ctxt->sax->comment(ctxt->userData, buf);
				3369	xmlFree(buf);
				3370	ctxt->instate = state;
				3371	return;
				3372	}
				3373
				3374	unfinished:
				3375	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
				3376	"Comment not terminated \n<!--%.50s\n", buf, NULL);
				3377	xmlFree(buf);
				3378	}
				3379
				3380	/**
				3381	* htmlParseCharRef:
				3382	* @ctxt: an HTML parser context
				3383	*
				3384	* parse Reference declarations
				3385	*
				3386	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				3387	* '&#x' [0-9a-fA-F]+ ';'
				3388	*
				3389	* Returns the value parsed (as an int)
				3390	*/
				3391	int
				3392	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				3393	int val = 0;
				3394
				3395	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				3396	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				3397	"htmlParseCharRef: context error\n",
				3398	NULL, NULL);
				3399	return(0);
				3400	}
				3401	if ((CUR == '&') && (NXT(1) == '#') &&
				3402	((NXT(2) == 'x') \|\| NXT(2) == 'X')) {
				3403	SKIP(3);
				3404	while (CUR != ';') {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3405	if ((CUR >= '0') && (CUR <= '9')) {
				3406	if (val < 0x110000)
				3407	val = val * 16 + (CUR - '0');
				3408	} else if ((CUR >= 'a') && (CUR <= 'f')) {
				3409	if (val < 0x110000)
				3410	val = val * 16 + (CUR - 'a') + 10;
				3411	} else if ((CUR >= 'A') && (CUR <= 'F')) {
				3412	if (val < 0x110000)
				3413	val = val * 16 + (CUR - 'A') + 10;
				3414	} else {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3415	htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
				3416	"htmlParseCharRef: missing semicolon\n",
				3417	NULL, NULL);
				3418	break;
				3419	}
				3420	NEXT;
				3421	}
				3422	if (CUR == ';')
				3423	NEXT;
				3424	} else if ((CUR == '&') && (NXT(1) == '#')) {
				3425	SKIP(2);
				3426	while (CUR != ';') {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3427	if ((CUR >= '0') && (CUR <= '9')) {
				3428	if (val < 0x110000)
				3429	val = val * 10 + (CUR - '0');
				3430	} else {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3431	htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
				3432	"htmlParseCharRef: missing semicolon\n",
				3433	NULL, NULL);
				3434	break;
				3435	}
				3436	NEXT;
				3437	}
				3438	if (CUR == ';')
				3439	NEXT;
				3440	} else {
				3441	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
				3442	"htmlParseCharRef: invalid value\n", NULL, NULL);
				3443	}
				3444	/*
				3445	* Check the value IS_CHAR ...
				3446	*/
				3447	if (IS_CHAR(val)) {
				3448	return(val);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3449	} else if (val >= 0x110000) {
				3450	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
				3451	"htmlParseCharRef: value too large\n", NULL, NULL);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3452	} else {
				3453	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				3454	"htmlParseCharRef: invalid xmlChar value %d\n",
				3455	val);
				3456	}
				3457	return(0);
				3458	}
				3459
				3460
				3461	/**
				3462	* htmlParseDocTypeDecl:
				3463	* @ctxt: an HTML parser context
				3464	*
				3465	* parse a DOCTYPE declaration
				3466	*
				3467	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				3468	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				3469	*/
				3470
				3471	static void
				3472	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				3473	const xmlChar *name;
				3474	xmlChar *ExternalID = NULL;
				3475	xmlChar *URI = NULL;
				3476
				3477	/*
				3478	* We know that '<!DOCTYPE' has been detected.
				3479	*/
				3480	SKIP(9);
				3481
				3482	SKIP_BLANKS;
				3483
				3484	/*
				3485	* Parse the DOCTYPE name.
				3486	*/
				3487	name = htmlParseName(ctxt);
				3488	if (name == NULL) {
				3489	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				3490	"htmlParseDocTypeDecl : no DOCTYPE name !\n",
				3491	NULL, NULL);
				3492	}
				3493	/*
				3494	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				3495	*/
				3496
				3497	SKIP_BLANKS;
				3498
				3499	/*
				3500	* Check for SystemID and ExternalID
				3501	*/
				3502	URI = htmlParseExternalID(ctxt, &ExternalID);
				3503	SKIP_BLANKS;
				3504
				3505	/*
				3506	* We should be at the end of the DOCTYPE declaration.
				3507	*/
				3508	if (CUR != '>') {
				3509	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
				3510	"DOCTYPE improperly terminated\n", NULL, NULL);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3511	/* Ignore bogus content */
				3512	while ((CUR != 0) && (CUR != '>'))
				3513	NEXT;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3514	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3515	if (CUR == '>')
				3516	NEXT;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3517
				3518	/*
				3519	* Create or update the document accordingly to the DOCTYPE
				3520	*/
				3521	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				3522	(!ctxt->disableSAX))
				3523	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				3524
				3525	/*
				3526	* Cleanup, since we don't use all those identifiers
				3527	*/
				3528	if (URI != NULL) xmlFree(URI);
				3529	if (ExternalID != NULL) xmlFree(ExternalID);
				3530	}
				3531
				3532	/**
				3533	* htmlParseAttribute:
				3534	* @ctxt: an HTML parser context
				3535	* @value: a xmlChar ** used to store the value of the attribute
				3536	*
				3537	* parse an attribute
				3538	*
				3539	* [41] Attribute ::= Name Eq AttValue
				3540	*
				3541	* [25] Eq ::= S? '=' S?
				3542	*
				3543	* With namespace:
				3544	*
				3545	* [NS 11] Attribute ::= QName Eq AttValue
				3546	*
				3547	* Also the case QName == xmlns:??? is handled independently as a namespace
				3548	* definition.
				3549	*
				3550	* Returns the attribute name, and the value in *value.
				3551	*/
				3552
				3553	static const xmlChar *
				3554	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				3555	const xmlChar *name;
				3556	xmlChar *val = NULL;
				3557
				3558	*value = NULL;
				3559	name = htmlParseHTMLName(ctxt);
				3560	if (name == NULL) {
				3561	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				3562	"error parsing attribute name\n", NULL, NULL);
				3563	return(NULL);
				3564	}
				3565
				3566	/*
				3567	* read the value
				3568	*/
				3569	SKIP_BLANKS;
				3570	if (CUR == '=') {
				3571	NEXT;
				3572	SKIP_BLANKS;
				3573	val = htmlParseAttValue(ctxt);
				3574	}
				3575
				3576	*value = val;
				3577	return(name);
				3578	}
				3579
				3580	/**
				3581	* htmlCheckEncodingDirect:
				3582	* @ctxt: an HTML parser context
				3583	* @attvalue: the attribute value
				3584	*
				3585	* Checks an attribute value to detect
				3586	* the encoding
				3587	* If a new encoding is detected the parser is switched to decode
				3588	* it and pass UTF8
				3589	*/
				3590	static void
				3591	htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
				3592
				3593	if ((ctxt == NULL) \|\| (encoding == NULL) \|\|
				3594	(ctxt->options & HTML_PARSE_IGNORE_ENC))
				3595	return;
				3596
				3597	/* do not change encoding */
				3598	if (ctxt->input->encoding != NULL)
				3599	return;
				3600
				3601	if (encoding != NULL) {
				3602	xmlCharEncoding enc;
				3603	xmlCharEncodingHandlerPtr handler;
				3604
				3605	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				3606
				3607	if (ctxt->input->encoding != NULL)
				3608	xmlFree((xmlChar *) ctxt->input->encoding);
				3609	ctxt->input->encoding = xmlStrdup(encoding);
				3610
				3611	enc = xmlParseCharEncoding((const char *) encoding);
				3612	/*
				3613	* registered set of known encodings
				3614	*/
				3615	if (enc != XML_CHAR_ENCODING_ERROR) {
				3616	if (((enc == XML_CHAR_ENCODING_UTF16LE) \|\|
				3617	(enc == XML_CHAR_ENCODING_UTF16BE) \|\|
				3618	(enc == XML_CHAR_ENCODING_UCS4LE) \|\|
				3619	(enc == XML_CHAR_ENCODING_UCS4BE)) &&
				3620	(ctxt->input->buf != NULL) &&
				3621	(ctxt->input->buf->encoder == NULL)) {
				3622	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				3623	"htmlCheckEncoding: wrong encoding meta\n",
				3624	NULL, NULL);
				3625	} else {
				3626	xmlSwitchEncoding(ctxt, enc);
				3627	}
				3628	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				3629	} else {
				3630	/*
				3631	* fallback for unknown encodings
				3632	*/
				3633	handler = xmlFindCharEncodingHandler((const char *) encoding);
				3634	if (handler != NULL) {
				3635	xmlSwitchToEncoding(ctxt, handler);
				3636	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				3637	} else {
				3638	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
				3639	"htmlCheckEncoding: unknown encoding %s\n",
				3640	encoding, NULL);
				3641	}
				3642	}
				3643
				3644	if ((ctxt->input->buf != NULL) &&
				3645	(ctxt->input->buf->encoder != NULL) &&
				3646	(ctxt->input->buf->raw != NULL) &&
				3647	(ctxt->input->buf->buffer != NULL)) {
				3648	int nbchars;
				3649	int processed;
				3650
				3651	/*
				3652	* convert as much as possible to the parser reading buffer.
				3653	*/
				3654	processed = ctxt->input->cur - ctxt->input->base;
				3655	xmlBufShrink(ctxt->input->buf->buffer, processed);
				3656	nbchars = xmlCharEncInput(ctxt->input->buf, 1);
				3657	xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
				3658	if (nbchars < 0) {
				3659	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				3660	"htmlCheckEncoding: encoder error\n",
				3661	NULL, NULL);
				3662	}
				3663	}
				3664	}
				3665	}
				3666
				3667	/**
				3668	* htmlCheckEncoding:
				3669	* @ctxt: an HTML parser context
				3670	* @attvalue: the attribute value
				3671	*
				3672	* Checks an http-equiv attribute from a Meta tag to detect
				3673	* the encoding
				3674	* If a new encoding is detected the parser is switched to decode
				3675	* it and pass UTF8
				3676	*/
				3677	static void
				3678	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				3679	const xmlChar *encoding;
				3680
				3681	if (!attvalue)
				3682	return;
				3683
				3684	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
				3685	if (encoding != NULL) {
				3686	encoding += 7;
				3687	}
				3688	/*
				3689	* skip blank
				3690	*/
				3691	if (encoding && IS_BLANK_CH(*encoding))
				3692	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
				3693	if (encoding && *encoding == '=') {
				3694	encoding ++;
				3695	htmlCheckEncodingDirect(ctxt, encoding);
				3696	}
				3697	}
				3698
				3699	/**
				3700	* htmlCheckMeta:
				3701	* @ctxt: an HTML parser context
				3702	* @atts: the attributes values
				3703	*
				3704	* Checks an attributes from a Meta tag
				3705	*/
				3706	static void
				3707	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				3708	int i;
				3709	const xmlChar att, value;
				3710	int http = 0;
				3711	const xmlChar *content = NULL;
				3712
				3713	if ((ctxt == NULL) \|\| (atts == NULL))
				3714	return;
				3715
				3716	i = 0;
				3717	att = atts[i++];
				3718	while (att != NULL) {
				3719	value = atts[i++];
				3720	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				3721	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				3722	http = 1;
				3723	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
				3724	htmlCheckEncodingDirect(ctxt, value);
				3725	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				3726	content = value;
				3727	att = atts[i++];
				3728	}
				3729	if ((http) && (content != NULL))
				3730	htmlCheckEncoding(ctxt, content);
				3731
				3732	}
				3733
				3734	/**
				3735	* htmlParseStartTag:
				3736	* @ctxt: an HTML parser context
				3737	*
				3738	* parse a start of tag either for rule element or
				3739	* EmptyElement. In both case we don't parse the tag closing chars.
				3740	*
				3741	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				3742	*
				3743	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				3744	*
				3745	* With namespace:
				3746	*
				3747	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				3748	*
				3749	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				3750	*
				3751	* Returns 0 in case of success, -1 in case of error and 1 if discarded
				3752	*/
				3753
				3754	static int
				3755	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				3756	const xmlChar *name;
				3757	const xmlChar *attname;
				3758	xmlChar *attvalue;
				3759	const xmlChar **atts;
				3760	int nbatts = 0;
				3761	int maxatts;
				3762	int meta = 0;
				3763	int i;
				3764	int discardtag = 0;
				3765
				3766	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				3767	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				3768	"htmlParseStartTag: context error\n", NULL, NULL);
				3769	return -1;
				3770	}
				3771	if (ctxt->instate == XML_PARSER_EOF)
				3772	return(-1);
				3773	if (CUR != '<') return -1;
				3774	NEXT;
				3775
				3776	atts = ctxt->atts;
				3777	maxatts = ctxt->maxatts;
				3778
				3779	GROW;
				3780	name = htmlParseHTMLName(ctxt);
				3781	if (name == NULL) {
				3782	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				3783	"htmlParseStartTag: invalid element name\n",
				3784	NULL, NULL);
				3785	/* if recover preserve text on classic misconstructs */
				3786	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) \|\| (CUR == '<') \|\|
				3787	(CUR == '=') \|\| (CUR == '>') \|\| (((CUR >= '0') && (CUR <= '9'))))) {
				3788	htmlParseCharDataInternal(ctxt, '<');
				3789	return(-1);
				3790	}
				3791
				3792
				3793	/* Dump the bogus tag like browsers do */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3794	while ((CUR != 0) && (CUR != '>') &&
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3795	(ctxt->instate != XML_PARSER_EOF))
				3796	NEXT;
				3797	return -1;
				3798	}
				3799	if (xmlStrEqual(name, BAD_CAST"meta"))
				3800	meta = 1;
				3801
				3802	/*
				3803	* Check for auto-closure of HTML elements.
				3804	*/
				3805	htmlAutoClose(ctxt, name);
				3806
				3807	/*
				3808	* Check for implied HTML elements.
				3809	*/
				3810	htmlCheckImplied(ctxt, name);
				3811
				3812	/*
				3813	* Avoid html at any level > 0, head at any level != 1
				3814	* or any attempt to recurse body
				3815	*/
				3816	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				3817	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				3818	"htmlParseStartTag: misplaced <html> tag\n",
				3819	name, NULL);
				3820	discardtag = 1;
				3821	ctxt->depth++;
				3822	}
				3823	if ((ctxt->nameNr != 1) &&
				3824	(xmlStrEqual(name, BAD_CAST"head"))) {
				3825	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				3826	"htmlParseStartTag: misplaced <head> tag\n",
				3827	name, NULL);
				3828	discardtag = 1;
				3829	ctxt->depth++;
				3830	}
				3831	if (xmlStrEqual(name, BAD_CAST"body")) {
				3832	int indx;
				3833	for (indx = 0;indx < ctxt->nameNr;indx++) {
				3834	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
				3835	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				3836	"htmlParseStartTag: misplaced <body> tag\n",
				3837	name, NULL);
				3838	discardtag = 1;
				3839	ctxt->depth++;
				3840	}
				3841	}
				3842	}
				3843
				3844	/*
				3845	* Now parse the attributes, it ends up with the ending
				3846	*
				3847	* (S Attribute)* S?
				3848	*/
				3849	SKIP_BLANKS;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3850	while ((CUR != 0) &&
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3851	(CUR != '>') &&
				3852	((CUR != '/') \|\| (NXT(1) != '>'))) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3853	GROW;
				3854	attname = htmlParseAttribute(ctxt, &attvalue);
				3855	if (attname != NULL) {
				3856
				3857	/*
				3858	* Well formedness requires at most one declaration of an attribute
				3859	*/
				3860	for (i = 0; i < nbatts;i += 2) {
				3861	if (xmlStrEqual(atts[i], attname)) {
				3862	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
				3863	"Attribute %s redefined\n", attname, NULL);
				3864	if (attvalue != NULL)
				3865	xmlFree(attvalue);
				3866	goto failed;
				3867	}
				3868	}
				3869
				3870	/*
				3871	* Add the pair to atts
				3872	*/
				3873	if (atts == NULL) {
				3874	maxatts = 22; /* allow for 10 attrs by default */
				3875	atts = (const xmlChar **)
				3876	xmlMalloc(maxatts * sizeof(xmlChar *));
				3877	if (atts == NULL) {
				3878	htmlErrMemory(ctxt, NULL);
				3879	if (attvalue != NULL)
				3880	xmlFree(attvalue);
				3881	goto failed;
				3882	}
				3883	ctxt->atts = atts;
				3884	ctxt->maxatts = maxatts;
				3885	} else if (nbatts + 4 > maxatts) {
				3886	const xmlChar **n;
				3887
				3888	maxatts *= 2;
				3889	n = (const xmlChar *) xmlRealloc((void ) atts,
				3890	maxatts * sizeof(const xmlChar *));
				3891	if (n == NULL) {
				3892	htmlErrMemory(ctxt, NULL);
				3893	if (attvalue != NULL)
				3894	xmlFree(attvalue);
				3895	goto failed;
				3896	}
				3897	atts = n;
				3898	ctxt->atts = atts;
				3899	ctxt->maxatts = maxatts;
				3900	}
				3901	atts[nbatts++] = attname;
				3902	atts[nbatts++] = attvalue;
				3903	atts[nbatts] = NULL;
				3904	atts[nbatts + 1] = NULL;
				3905	}
				3906	else {
				3907	if (attvalue != NULL)
				3908	xmlFree(attvalue);
				3909	/* Dump the bogus attribute string up to the next blank or
				3910	* the end of the tag. */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3911	while ((CUR != 0) &&
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3912	!(IS_BLANK_CH(CUR)) && (CUR != '>') &&
				3913	((CUR != '/') \|\| (NXT(1) != '>')))
				3914	NEXT;
				3915	}
				3916
				3917	failed:
				3918	SKIP_BLANKS;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3919	}
				3920
				3921	/*
				3922	* Handle specific association to the META tag
				3923	*/
				3924	if (meta && (nbatts != 0))
				3925	htmlCheckMeta(ctxt, atts);
				3926
				3927	/*
				3928	* SAX: Start of Element !
				3929	*/
				3930	if (!discardtag) {
				3931	htmlnamePush(ctxt, name);
				3932	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
				3933	if (nbatts != 0)
				3934	ctxt->sax->startElement(ctxt->userData, name, atts);
				3935	else
				3936	ctxt->sax->startElement(ctxt->userData, name, NULL);
				3937	}
				3938	}
				3939
				3940	if (atts != NULL) {
				3941	for (i = 1;i < nbatts;i += 2) {
				3942	if (atts[i] != NULL)
				3943	xmlFree((xmlChar *) atts[i]);
				3944	}
				3945	}
				3946
				3947	return(discardtag);
				3948	}
				3949
				3950	/**
				3951	* htmlParseEndTag:
				3952	* @ctxt: an HTML parser context
				3953	*
				3954	* parse an end of tag
				3955	*
				3956	* [42] ETag ::= '</' Name S? '>'
				3957	*
				3958	* With namespace
				3959	*
				3960	* [NS 9] ETag ::= '</' QName S? '>'
				3961	*
				3962	* Returns 1 if the current level should be closed.
				3963	*/
				3964
				3965	static int
				3966	htmlParseEndTag(htmlParserCtxtPtr ctxt)
				3967	{
				3968	const xmlChar *name;
				3969	const xmlChar *oldname;
				3970	int i, ret;
				3971
				3972	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3973	htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
				3974	"htmlParseEndTag: '</' not found\n", NULL, NULL);
				3975	return (0);
				3976	}
				3977	SKIP(2);
				3978
				3979	name = htmlParseHTMLName(ctxt);
				3980	if (name == NULL)
				3981	return (0);
				3982	/*
				3983	* We should definitely be at the ending "S? '>'" part
				3984	*/
				3985	SKIP_BLANKS;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3986	if (CUR != '>') {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3987	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
				3988	"End tag : expected '>'\n", NULL, NULL);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	3989	/* Skip to next '>' */
				3990	while ((CUR != 0) && (CUR != '>'))
				3991	NEXT;
				3992	}
				3993	if (CUR == '>')
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3994	NEXT;
				3995
				3996	/*
				3997	* if we ignored misplaced tags in htmlParseStartTag don't pop them
				3998	* out now.
				3999	*/
				4000	if ((ctxt->depth > 0) &&
				4001	(xmlStrEqual(name, BAD_CAST "html") \|\|
				4002	xmlStrEqual(name, BAD_CAST "body") \|\|
				4003	xmlStrEqual(name, BAD_CAST "head"))) {
				4004	ctxt->depth--;
				4005	return (0);
				4006	}
				4007
				4008	/*
				4009	* If the name read is not one of the element in the parsing stack
				4010	* then return, it's just an error.
				4011	*/
				4012	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
				4013	if (xmlStrEqual(name, ctxt->nameTab[i]))
				4014	break;
				4015	}
				4016	if (i < 0) {
				4017	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
				4018	"Unexpected end tag : %s\n", name, NULL);
				4019	return (0);
				4020	}
				4021
				4022
				4023	/*
				4024	* Check for auto-closure of HTML elements.
				4025	*/
				4026
				4027	htmlAutoCloseOnClose(ctxt, name);
				4028
				4029	/*
				4030	* Well formedness constraints, opening and closing must match.
				4031	* With the exception that the autoclose may have popped stuff out
				4032	* of the stack.
				4033	*/
				4034	if (!xmlStrEqual(name, ctxt->name)) {
				4035	if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
				4036	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
				4037	"Opening and ending tag mismatch: %s and %s\n",
				4038	name, ctxt->name);
				4039	}
				4040	}
				4041
				4042	/*
				4043	* SAX: End of Tag
				4044	*/
				4045	oldname = ctxt->name;
				4046	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				4047	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4048	ctxt->sax->endElement(ctxt->userData, name);
				4049	htmlNodeInfoPop(ctxt);
				4050	htmlnamePop(ctxt);
				4051	ret = 1;
				4052	} else {
				4053	ret = 0;
				4054	}
				4055
				4056	return (ret);
				4057	}
				4058
				4059
				4060	/**
				4061	* htmlParseReference:
				4062	* @ctxt: an HTML parser context
				4063	*
				4064	* parse and handle entity references in content,
				4065	* this will end-up in a call to character() since this is either a
				4066	* CharRef, or a predefined entity.
				4067	*/
				4068	static void
				4069	htmlParseReference(htmlParserCtxtPtr ctxt) {
				4070	const htmlEntityDesc * ent;
				4071	xmlChar out[6];
				4072	const xmlChar *name;
				4073	if (CUR != '&') return;
				4074
				4075	if (NXT(1) == '#') {
				4076	unsigned int c;
				4077	int bits, i = 0;
				4078
				4079	c = htmlParseCharRef(ctxt);
				4080	if (c == 0)
				4081	return;
				4082
				4083	if (c < 0x80) { out[i++]= c; bits= -6; }
				4084	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				4085	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				4086	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				4087
				4088	for ( ; bits >= 0; bits-= 6) {
				4089	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				4090	}
				4091	out[i] = 0;
				4092
				4093	htmlCheckParagraph(ctxt);
				4094	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4095	ctxt->sax->characters(ctxt->userData, out, i);
				4096	} else {
				4097	ent = htmlParseEntityRef(ctxt, &name);
				4098	if (name == NULL) {
				4099	htmlCheckParagraph(ctxt);
				4100	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4101	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				4102	return;
				4103	}
				4104	if ((ent == NULL) \|\| !(ent->value > 0)) {
				4105	htmlCheckParagraph(ctxt);
				4106	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				4107	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				4108	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				4109	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				4110	}
				4111	} else {
				4112	unsigned int c;
				4113	int bits, i = 0;
				4114
				4115	c = ent->value;
				4116	if (c < 0x80)
				4117	{ out[i++]= c; bits= -6; }
				4118	else if (c < 0x800)
				4119	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				4120	else if (c < 0x10000)
				4121	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				4122	else
				4123	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				4124
				4125	for ( ; bits >= 0; bits-= 6) {
				4126	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				4127	}
				4128	out[i] = 0;
				4129
				4130	htmlCheckParagraph(ctxt);
				4131	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4132	ctxt->sax->characters(ctxt->userData, out, i);
				4133	}
				4134	}
				4135	}
				4136
				4137	/**
				4138	* htmlParseContent:
				4139	* @ctxt: an HTML parser context
				4140	*
				4141	* Parse a content: comment, sub-element, reference or text.
				4142	* Kept for compatibility with old code
				4143	*/
				4144
				4145	static void
				4146	htmlParseContent(htmlParserCtxtPtr ctxt) {
				4147	xmlChar *currentNode;
				4148	int depth;
				4149	const xmlChar *name;
				4150
				4151	currentNode = xmlStrdup(ctxt->name);
				4152	depth = ctxt->nameNr;
				4153	while (1) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4154	GROW;
				4155
				4156	if (ctxt->instate == XML_PARSER_EOF)
				4157	break;
				4158
				4159	/*
				4160	* Our tag or one of it's parent or children is ending.
				4161	*/
				4162	if ((CUR == '<') && (NXT(1) == '/')) {
				4163	if (htmlParseEndTag(ctxt) &&
				4164	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				4165	if (currentNode != NULL)
				4166	xmlFree(currentNode);
				4167	return;
				4168	}
				4169	continue; /* while */
				4170	}
				4171
				4172	else if ((CUR == '<') &&
				4173	((IS_ASCII_LETTER(NXT(1))) \|\|
				4174	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
				4175	name = htmlParseHTMLName_nonInvasive(ctxt);
				4176	if (name == NULL) {
				4177	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				4178	"htmlParseStartTag: invalid element name\n",
				4179	NULL, NULL);
				4180	/* Dump the bogus tag like browsers do */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	4181	while ((CUR != 0) && (CUR != '>'))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4182	NEXT;
				4183
				4184	if (currentNode != NULL)
				4185	xmlFree(currentNode);
				4186	return;
				4187	}
				4188
				4189	if (ctxt->name != NULL) {
				4190	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
				4191	htmlAutoClose(ctxt, name);
				4192	continue;
				4193	}
				4194	}
				4195	}
				4196
				4197	/*
				4198	* Has this node been popped out during parsing of
				4199	* the next element
				4200	*/
				4201	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				4202	(!xmlStrEqual(currentNode, ctxt->name)))
				4203	{
				4204	if (currentNode != NULL) xmlFree(currentNode);
				4205	return;
				4206	}
				4207
				4208	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				4209	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
				4210	/*
				4211	* Handle SCRIPT/STYLE separately
				4212	*/
				4213	htmlParseScript(ctxt);
				4214	} else {
				4215	/*
				4216	* Sometimes DOCTYPE arrives in the middle of the document
				4217	*/
				4218	if ((CUR == '<') && (NXT(1) == '!') &&
				4219	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4220	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4221	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4222	(UPP(8) == 'E')) {
				4223	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				4224	"Misplaced DOCTYPE declaration\n",
				4225	BAD_CAST "DOCTYPE" , NULL);
				4226	htmlParseDocTypeDecl(ctxt);
				4227	}
				4228
				4229	/*
				4230	* First case : a comment
				4231	*/
				4232	if ((CUR == '<') && (NXT(1) == '!') &&
				4233	(NXT(2) == '-') && (NXT(3) == '-')) {
				4234	htmlParseComment(ctxt);
				4235	}
				4236
				4237	/*
				4238	* Second case : a Processing Instruction.
				4239	*/
				4240	else if ((CUR == '<') && (NXT(1) == '?')) {
				4241	htmlParsePI(ctxt);
				4242	}
				4243
				4244	/*
				4245	* Third case : a sub-element.
				4246	*/
				4247	else if (CUR == '<') {
				4248	htmlParseElement(ctxt);
				4249	}
				4250
				4251	/*
				4252	* Fourth case : a reference. If if has not been resolved,
				4253	* parsing returns it's Name, create the node
				4254	*/
				4255	else if (CUR == '&') {
				4256	htmlParseReference(ctxt);
				4257	}
				4258
				4259	/*
				4260	* Fifth case : end of the resource
				4261	*/
				4262	else if (CUR == 0) {
				4263	htmlAutoCloseOnEnd(ctxt);
				4264	break;
				4265	}
				4266
				4267	/*
				4268	* Last case, text. Note that References are handled directly.
				4269	*/
				4270	else {
				4271	htmlParseCharData(ctxt);
				4272	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4273	}
				4274	GROW;
				4275	}
				4276	if (currentNode != NULL) xmlFree(currentNode);
				4277	}
				4278
				4279	/**
				4280	* htmlParseElement:
				4281	* @ctxt: an HTML parser context
				4282	*
				4283	* parse an HTML element, this is highly recursive
				4284	* this is kept for compatibility with previous code versions
				4285	*
				4286	* [39] element ::= EmptyElemTag \| STag content ETag
				4287	*
				4288	* [41] Attribute ::= Name Eq AttValue
				4289	*/
				4290
				4291	void
				4292	htmlParseElement(htmlParserCtxtPtr ctxt) {
				4293	const xmlChar *name;
				4294	xmlChar *currentNode = NULL;
				4295	const htmlElemDesc * info;
				4296	htmlParserNodeInfo node_info;
				4297	int failed;
				4298	int depth;
				4299	const xmlChar *oldptr;
				4300
				4301	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				4302	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				4303	"htmlParseElement: context error\n", NULL, NULL);
				4304	return;
				4305	}
				4306
				4307	if (ctxt->instate == XML_PARSER_EOF)
				4308	return;
				4309
				4310	/* Capture start position */
				4311	if (ctxt->record_info) {
				4312	node_info.begin_pos = ctxt->input->consumed +
				4313	(CUR_PTR - ctxt->input->base);
				4314	node_info.begin_line = ctxt->input->line;
				4315	}
				4316
				4317	failed = htmlParseStartTag(ctxt);
				4318	name = ctxt->name;
				4319	if ((failed == -1) \|\| (name == NULL)) {
				4320	if (CUR == '>')
				4321	NEXT;
				4322	return;
				4323	}
				4324
				4325	/*
				4326	* Lookup the info for that element.
				4327	*/
				4328	info = htmlTagLookup(name);
				4329	if (info == NULL) {
				4330	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
				4331	"Tag %s invalid\n", name, NULL);
				4332	}
				4333
				4334	/*
				4335	* Check for an Empty Element labeled the XML/SGML way
				4336	*/
				4337	if ((CUR == '/') && (NXT(1) == '>')) {
				4338	SKIP(2);
				4339	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4340	ctxt->sax->endElement(ctxt->userData, name);
				4341	htmlnamePop(ctxt);
				4342	return;
				4343	}
				4344
				4345	if (CUR == '>') {
				4346	NEXT;
				4347	} else {
				4348	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
				4349	"Couldn't find end of Start Tag %s\n", name, NULL);
				4350
				4351	/*
				4352	* end of parsing of this node.
				4353	*/
				4354	if (xmlStrEqual(name, ctxt->name)) {
				4355	nodePop(ctxt);
				4356	htmlnamePop(ctxt);
				4357	}
				4358
				4359	/*
				4360	* Capture end position and add node
				4361	*/
				4362	if (ctxt->record_info) {
				4363	node_info.end_pos = ctxt->input->consumed +
				4364	(CUR_PTR - ctxt->input->base);
				4365	node_info.end_line = ctxt->input->line;
				4366	node_info.node = ctxt->node;
				4367	xmlParserAddNodeInfo(ctxt, &node_info);
				4368	}
				4369	return;
				4370	}
				4371
				4372	/*
				4373	* Check for an Empty Element from DTD definition
				4374	*/
				4375	if ((info != NULL) && (info->empty)) {
				4376	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4377	ctxt->sax->endElement(ctxt->userData, name);
				4378	htmlnamePop(ctxt);
				4379	return;
				4380	}
				4381
				4382	/*
				4383	* Parse the content of the element:
				4384	*/
				4385	currentNode = xmlStrdup(ctxt->name);
				4386	depth = ctxt->nameNr;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	4387	while (CUR != 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4388	oldptr = ctxt->input->cur;
				4389	htmlParseContent(ctxt);
				4390	if (oldptr==ctxt->input->cur) break;
				4391	if (ctxt->nameNr < depth) break;
				4392	}
				4393
				4394	/*
				4395	* Capture end position and add node
				4396	*/
				4397	if ( currentNode != NULL && ctxt->record_info ) {
				4398	node_info.end_pos = ctxt->input->consumed +
				4399	(CUR_PTR - ctxt->input->base);
				4400	node_info.end_line = ctxt->input->line;
				4401	node_info.node = ctxt->node;
				4402	xmlParserAddNodeInfo(ctxt, &node_info);
				4403	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	4404	if (CUR == 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4405	htmlAutoCloseOnEnd(ctxt);
				4406	}
				4407
				4408	if (currentNode != NULL)
				4409	xmlFree(currentNode);
				4410	}
				4411
				4412	static void
				4413	htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
				4414	/*
				4415	* Capture end position and add node
				4416	*/
				4417	if ( ctxt->node != NULL && ctxt->record_info ) {
				4418	ctxt->nodeInfo->end_pos = ctxt->input->consumed +
				4419	(CUR_PTR - ctxt->input->base);
				4420	ctxt->nodeInfo->end_line = ctxt->input->line;
				4421	ctxt->nodeInfo->node = ctxt->node;
				4422	xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
				4423	htmlNodeInfoPop(ctxt);
				4424	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	4425	if (CUR == 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4426	htmlAutoCloseOnEnd(ctxt);
				4427	}
				4428	}
				4429
				4430	/**
				4431	* htmlParseElementInternal:
				4432	* @ctxt: an HTML parser context
				4433	*
				4434	* parse an HTML element, new version, non recursive
				4435	*
				4436	* [39] element ::= EmptyElemTag \| STag content ETag
				4437	*
				4438	* [41] Attribute ::= Name Eq AttValue
				4439	*/
				4440
				4441	static void
				4442	htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
				4443	const xmlChar *name;
				4444	const htmlElemDesc * info;
				4445	htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
				4446	int failed;
				4447
				4448	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				4449	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				4450	"htmlParseElementInternal: context error\n", NULL, NULL);
				4451	return;
				4452	}
				4453
				4454	if (ctxt->instate == XML_PARSER_EOF)
				4455	return;
				4456
				4457	/* Capture start position */
				4458	if (ctxt->record_info) {
				4459	node_info.begin_pos = ctxt->input->consumed +
				4460	(CUR_PTR - ctxt->input->base);
				4461	node_info.begin_line = ctxt->input->line;
				4462	}
				4463
				4464	failed = htmlParseStartTag(ctxt);
				4465	name = ctxt->name;
				4466	if ((failed == -1) \|\| (name == NULL)) {
				4467	if (CUR == '>')
				4468	NEXT;
				4469	return;
				4470	}
				4471
				4472	/*
				4473	* Lookup the info for that element.
				4474	*/
				4475	info = htmlTagLookup(name);
				4476	if (info == NULL) {
				4477	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
				4478	"Tag %s invalid\n", name, NULL);
				4479	}
				4480
				4481	/*
				4482	* Check for an Empty Element labeled the XML/SGML way
				4483	*/
				4484	if ((CUR == '/') && (NXT(1) == '>')) {
				4485	SKIP(2);
				4486	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4487	ctxt->sax->endElement(ctxt->userData, name);
				4488	htmlnamePop(ctxt);
				4489	return;
				4490	}
				4491
				4492	if (CUR == '>') {
				4493	NEXT;
				4494	} else {
				4495	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
				4496	"Couldn't find end of Start Tag %s\n", name, NULL);
				4497
				4498	/*
				4499	* end of parsing of this node.
				4500	*/
				4501	if (xmlStrEqual(name, ctxt->name)) {
				4502	nodePop(ctxt);
				4503	htmlnamePop(ctxt);
				4504	}
				4505
				4506	if (ctxt->record_info)
				4507	htmlNodeInfoPush(ctxt, &node_info);
				4508	htmlParserFinishElementParsing(ctxt);
				4509	return;
				4510	}
				4511
				4512	/*
				4513	* Check for an Empty Element from DTD definition
				4514	*/
				4515	if ((info != NULL) && (info->empty)) {
				4516	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4517	ctxt->sax->endElement(ctxt->userData, name);
				4518	htmlnamePop(ctxt);
				4519	return;
				4520	}
				4521
				4522	if (ctxt->record_info)
				4523	htmlNodeInfoPush(ctxt, &node_info);
				4524	}
				4525
				4526	/**
				4527	* htmlParseContentInternal:
				4528	* @ctxt: an HTML parser context
				4529	*
				4530	* Parse a content: comment, sub-element, reference or text.
				4531	* New version for non recursive htmlParseElementInternal
				4532	*/
				4533
				4534	static void
				4535	htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
				4536	xmlChar *currentNode;
				4537	int depth;
				4538	const xmlChar *name;
				4539
				4540	currentNode = xmlStrdup(ctxt->name);
				4541	depth = ctxt->nameNr;
				4542	while (1) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4543	GROW;
				4544
				4545	if (ctxt->instate == XML_PARSER_EOF)
				4546	break;
				4547
				4548	/*
				4549	* Our tag or one of it's parent or children is ending.
				4550	*/
				4551	if ((CUR == '<') && (NXT(1) == '/')) {
				4552	if (htmlParseEndTag(ctxt) &&
				4553	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				4554	if (currentNode != NULL)
				4555	xmlFree(currentNode);
				4556
				4557	currentNode = xmlStrdup(ctxt->name);
				4558	depth = ctxt->nameNr;
				4559	}
				4560	continue; /* while */
				4561	}
				4562
				4563	else if ((CUR == '<') &&
				4564	((IS_ASCII_LETTER(NXT(1))) \|\|
				4565	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
				4566	name = htmlParseHTMLName_nonInvasive(ctxt);
				4567	if (name == NULL) {
				4568	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				4569	"htmlParseStartTag: invalid element name\n",
				4570	NULL, NULL);
				4571	/* Dump the bogus tag like browsers do */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	4572	while ((CUR == 0) && (CUR != '>'))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4573	NEXT;
				4574
				4575	htmlParserFinishElementParsing(ctxt);
				4576	if (currentNode != NULL)
				4577	xmlFree(currentNode);
				4578
				4579	currentNode = xmlStrdup(ctxt->name);
				4580	depth = ctxt->nameNr;
				4581	continue;
				4582	}
				4583
				4584	if (ctxt->name != NULL) {
				4585	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
				4586	htmlAutoClose(ctxt, name);
				4587	continue;
				4588	}
				4589	}
				4590	}
				4591
				4592	/*
				4593	* Has this node been popped out during parsing of
				4594	* the next element
				4595	*/
				4596	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				4597	(!xmlStrEqual(currentNode, ctxt->name)))
				4598	{
				4599	htmlParserFinishElementParsing(ctxt);
				4600	if (currentNode != NULL) xmlFree(currentNode);
				4601
				4602	currentNode = xmlStrdup(ctxt->name);
				4603	depth = ctxt->nameNr;
				4604	continue;
				4605	}
				4606
				4607	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				4608	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
				4609	/*
				4610	* Handle SCRIPT/STYLE separately
				4611	*/
				4612	htmlParseScript(ctxt);
				4613	} else {
				4614	/*
				4615	* Sometimes DOCTYPE arrives in the middle of the document
				4616	*/
				4617	if ((CUR == '<') && (NXT(1) == '!') &&
				4618	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4619	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4620	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4621	(UPP(8) == 'E')) {
				4622	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				4623	"Misplaced DOCTYPE declaration\n",
				4624	BAD_CAST "DOCTYPE" , NULL);
				4625	htmlParseDocTypeDecl(ctxt);
				4626	}
				4627
				4628	/*
				4629	* First case : a comment
				4630	*/
				4631	if ((CUR == '<') && (NXT(1) == '!') &&
				4632	(NXT(2) == '-') && (NXT(3) == '-')) {
				4633	htmlParseComment(ctxt);
				4634	}
				4635
				4636	/*
				4637	* Second case : a Processing Instruction.
				4638	*/
				4639	else if ((CUR == '<') && (NXT(1) == '?')) {
				4640	htmlParsePI(ctxt);
				4641	}
				4642
				4643	/*
				4644	* Third case : a sub-element.
				4645	*/
				4646	else if (CUR == '<') {
				4647	htmlParseElementInternal(ctxt);
				4648	if (currentNode != NULL) xmlFree(currentNode);
				4649
				4650	currentNode = xmlStrdup(ctxt->name);
				4651	depth = ctxt->nameNr;
				4652	}
				4653
				4654	/*
				4655	* Fourth case : a reference. If if has not been resolved,
				4656	* parsing returns it's Name, create the node
				4657	*/
				4658	else if (CUR == '&') {
				4659	htmlParseReference(ctxt);
				4660	}
				4661
				4662	/*
				4663	* Fifth case : end of the resource
				4664	*/
				4665	else if (CUR == 0) {
				4666	htmlAutoCloseOnEnd(ctxt);
				4667	break;
				4668	}
				4669
				4670	/*
				4671	* Last case, text. Note that References are handled directly.
				4672	*/
				4673	else {
				4674	htmlParseCharData(ctxt);
				4675	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4676	}
				4677	GROW;
				4678	}
				4679	if (currentNode != NULL) xmlFree(currentNode);
				4680	}
				4681
				4682	/**
				4683	* htmlParseContent:
				4684	* @ctxt: an HTML parser context
				4685	*
				4686	* Parse a content: comment, sub-element, reference or text.
				4687	* This is the entry point when called from parser.c
				4688	*/
				4689
				4690	void
				4691	__htmlParseContent(void *ctxt) {
				4692	if (ctxt != NULL)
				4693	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
				4694	}
				4695
				4696	/**
				4697	* htmlParseDocument:
				4698	* @ctxt: an HTML parser context
				4699	*
				4700	* parse an HTML document (and build a tree if using the standard SAX
				4701	* interface).
				4702	*
				4703	* Returns 0, -1 in case of error. the parser context is augmented
				4704	* as a result of the parsing.
				4705	*/
				4706
				4707	int
				4708	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				4709	xmlChar start[4];
				4710	xmlCharEncoding enc;
				4711	xmlDtdPtr dtd;
				4712
				4713	xmlInitParser();
				4714
				4715	htmlDefaultSAXHandlerInit();
				4716
				4717	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				4718	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				4719	"htmlParseDocument: context error\n", NULL, NULL);
				4720	return(XML_ERR_INTERNAL_ERROR);
				4721	}
				4722	ctxt->html = 1;
				4723	ctxt->linenumbers = 1;
				4724	GROW;
				4725	/*
				4726	* SAX: beginning of the document processing.
				4727	*/
				4728	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				4729	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				4730
				4731	if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
				4732	((ctxt->input->end - ctxt->input->cur) >= 4)) {
				4733	/*
				4734	* Get the 4 first bytes and decode the charset
				4735	* if enc != XML_CHAR_ENCODING_NONE
				4736	* plug some encoding conversion routines.
				4737	*/
				4738	start[0] = RAW;
				4739	start[1] = NXT(1);
				4740	start[2] = NXT(2);
				4741	start[3] = NXT(3);
				4742	enc = xmlDetectCharEncoding(&start[0], 4);
				4743	if (enc != XML_CHAR_ENCODING_NONE) {
				4744	xmlSwitchEncoding(ctxt, enc);
				4745	}
				4746	}
				4747
				4748	/*
				4749	* Wipe out everything which is before the first '<'
				4750	*/
				4751	SKIP_BLANKS;
				4752	if (CUR == 0) {
				4753	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
				4754	"Document is empty\n", NULL, NULL);
				4755	}
				4756
				4757	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				4758	ctxt->sax->startDocument(ctxt->userData);
				4759
				4760
				4761	/*
				4762	* Parse possible comments and PIs before any content
				4763	*/
				4764	while (((CUR == '<') && (NXT(1) == '!') &&
				4765	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
				4766	((CUR == '<') && (NXT(1) == '?'))) {
				4767	htmlParseComment(ctxt);
				4768	htmlParsePI(ctxt);
				4769	SKIP_BLANKS;
				4770	}
				4771
				4772
				4773	/*
				4774	* Then possibly doc type declaration(s) and more Misc
				4775	* (doctypedecl Misc*)?
				4776	*/
				4777	if ((CUR == '<') && (NXT(1) == '!') &&
				4778	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4779	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4780	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4781	(UPP(8) == 'E')) {
				4782	htmlParseDocTypeDecl(ctxt);
				4783	}
				4784	SKIP_BLANKS;
				4785
				4786	/*
				4787	* Parse possible comments and PIs before any content
				4788	*/
				4789	while (((CUR == '<') && (NXT(1) == '!') &&
				4790	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
				4791	((CUR == '<') && (NXT(1) == '?'))) {
				4792	htmlParseComment(ctxt);
				4793	htmlParsePI(ctxt);
				4794	SKIP_BLANKS;
				4795	}
				4796
				4797	/*
				4798	* Time to start parsing the tree itself
				4799	*/
				4800	htmlParseContentInternal(ctxt);
				4801
				4802	/*
				4803	* autoclose
				4804	*/
				4805	if (CUR == 0)
				4806	htmlAutoCloseOnEnd(ctxt);
				4807
				4808
				4809	/*
				4810	* SAX: end of the document processing.
				4811	*/
				4812	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4813	ctxt->sax->endDocument(ctxt->userData);
				4814
				4815	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
				4816	dtd = xmlGetIntSubset(ctxt->myDoc);
				4817	if (dtd == NULL)
				4818	ctxt->myDoc->intSubset =
				4819	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
				4820	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4821	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4822	}
				4823	if (! ctxt->wellFormed) return(-1);
				4824	return(0);
				4825	}
				4826
				4827
				4828	/************************************************************************
				4829	* *
				4830	* Parser contexts handling *
				4831	* *
				4832	************************************************************************/
				4833
				4834	/**
				4835	* htmlInitParserCtxt:
				4836	* @ctxt: an HTML parser context
				4837	*
				4838	* Initialize a parser context
				4839	*
				4840	* Returns 0 in case of success and -1 in case of error
				4841	*/
				4842
				4843	static int
				4844	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				4845	{
				4846	htmlSAXHandler *sax;
				4847
				4848	if (ctxt == NULL) return(-1);
				4849	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4850
				4851	ctxt->dict = xmlDictCreate();
				4852	if (ctxt->dict == NULL) {
				4853	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4854	return(-1);
				4855	}
				4856	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				4857	if (sax == NULL) {
				4858	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4859	return(-1);
				4860	}
				4861	else
				4862	memset(sax, 0, sizeof(htmlSAXHandler));
				4863
				4864	/* Allocate the Input stack */
				4865	ctxt->inputTab = (htmlParserInputPtr *)
				4866	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				4867	if (ctxt->inputTab == NULL) {
				4868	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4869	ctxt->inputNr = 0;
				4870	ctxt->inputMax = 0;
				4871	ctxt->input = NULL;
				4872	return(-1);
				4873	}
				4874	ctxt->inputNr = 0;
				4875	ctxt->inputMax = 5;
				4876	ctxt->input = NULL;
				4877	ctxt->version = NULL;
				4878	ctxt->encoding = NULL;
				4879	ctxt->standalone = -1;
				4880	ctxt->instate = XML_PARSER_START;
				4881
				4882	/* Allocate the Node stack */
				4883	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				4884	if (ctxt->nodeTab == NULL) {
				4885	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4886	ctxt->nodeNr = 0;
				4887	ctxt->nodeMax = 0;
				4888	ctxt->node = NULL;
				4889	ctxt->inputNr = 0;
				4890	ctxt->inputMax = 0;
				4891	ctxt->input = NULL;
				4892	return(-1);
				4893	}
				4894	ctxt->nodeNr = 0;
				4895	ctxt->nodeMax = 10;
				4896	ctxt->node = NULL;
				4897
				4898	/* Allocate the Name stack */
				4899	ctxt->nameTab = (const xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				4900	if (ctxt->nameTab == NULL) {
				4901	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4902	ctxt->nameNr = 0;
				4903	ctxt->nameMax = 0;
				4904	ctxt->name = NULL;
				4905	ctxt->nodeNr = 0;
				4906	ctxt->nodeMax = 0;
				4907	ctxt->node = NULL;
				4908	ctxt->inputNr = 0;
				4909	ctxt->inputMax = 0;
				4910	ctxt->input = NULL;
				4911	return(-1);
				4912	}
				4913	ctxt->nameNr = 0;
				4914	ctxt->nameMax = 10;
				4915	ctxt->name = NULL;
				4916
				4917	ctxt->nodeInfoTab = NULL;
				4918	ctxt->nodeInfoNr = 0;
				4919	ctxt->nodeInfoMax = 0;
				4920
				4921	if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
				4922	else {
				4923	ctxt->sax = sax;
				4924	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
				4925	}
				4926	ctxt->userData = ctxt;
				4927	ctxt->myDoc = NULL;
				4928	ctxt->wellFormed = 1;
				4929	ctxt->replaceEntities = 0;
				4930	ctxt->linenumbers = xmlLineNumbersDefaultValue;
				4931	ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
				4932	ctxt->html = 1;
				4933	ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
				4934	ctxt->vctxt.userData = ctxt;
				4935	ctxt->vctxt.error = xmlParserValidityError;
				4936	ctxt->vctxt.warning = xmlParserValidityWarning;
				4937	ctxt->record_info = 0;
				4938	ctxt->validate = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4939	ctxt->checkIndex = 0;
				4940	ctxt->catalogs = NULL;
				4941	xmlInitNodeInfoSeq(&ctxt->node_seq);
				4942	return(0);
				4943	}
				4944
				4945	/**
				4946	* htmlFreeParserCtxt:
				4947	* @ctxt: an HTML parser context
				4948	*
				4949	* Free all the memory used by a parser context. However the parsed
				4950	* document in ctxt->myDoc is not freed.
				4951	*/
				4952
				4953	void
				4954	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				4955	{
				4956	xmlFreeParserCtxt(ctxt);
				4957	}
				4958
				4959	/**
				4960	* htmlNewParserCtxt:
				4961	*
				4962	* Allocate and initialize a new parser context.
				4963	*
				4964	* Returns the htmlParserCtxtPtr or NULL in case of allocation error
				4965	*/
				4966
				4967	htmlParserCtxtPtr
				4968	htmlNewParserCtxt(void)
				4969	{
				4970	xmlParserCtxtPtr ctxt;
				4971
				4972	ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
				4973	if (ctxt == NULL) {
				4974	htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
				4975	return(NULL);
				4976	}
				4977	memset(ctxt, 0, sizeof(xmlParserCtxt));
				4978	if (htmlInitParserCtxt(ctxt) < 0) {
				4979	htmlFreeParserCtxt(ctxt);
				4980	return(NULL);
				4981	}
				4982	return(ctxt);
				4983	}
				4984
				4985	/**
				4986	* htmlCreateMemoryParserCtxt:
				4987	* @buffer: a pointer to a char array
				4988	* @size: the size of the array
				4989	*
				4990	* Create a parser context for an HTML in-memory document.
				4991	*
				4992	* Returns the new parser context or NULL
				4993	*/
				4994	htmlParserCtxtPtr
				4995	htmlCreateMemoryParserCtxt(const char *buffer, int size) {
				4996	xmlParserCtxtPtr ctxt;
				4997	xmlParserInputPtr input;
				4998	xmlParserInputBufferPtr buf;
				4999
				5000	if (buffer == NULL)
				5001	return(NULL);
				5002	if (size <= 0)
				5003	return(NULL);
				5004
				5005	ctxt = htmlNewParserCtxt();
				5006	if (ctxt == NULL)
				5007	return(NULL);
				5008
				5009	buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
				5010	if (buf == NULL) return(NULL);
				5011
				5012	input = xmlNewInputStream(ctxt);
				5013	if (input == NULL) {
				5014	xmlFreeParserCtxt(ctxt);
				5015	return(NULL);
				5016	}
				5017
				5018	input->filename = NULL;
				5019	input->buf = buf;
				5020	xmlBufResetInput(buf->buffer, input);
				5021
				5022	inputPush(ctxt, input);
				5023	return(ctxt);
				5024	}
				5025
				5026	/**
				5027	* htmlCreateDocParserCtxt:
				5028	* @cur: a pointer to an array of xmlChar
				5029	* @encoding: a free form C string describing the HTML document encoding, or NULL
				5030	*
				5031	* Create a parser context for an HTML document.
				5032	*
				5033	* TODO: check the need to add encoding handling there
				5034	*
				5035	* Returns the new parser context or NULL
				5036	*/
				5037	static htmlParserCtxtPtr
				5038	htmlCreateDocParserCtxt(const xmlChar cur, const char encoding) {
				5039	int len;
				5040	htmlParserCtxtPtr ctxt;
				5041
				5042	if (cur == NULL)
				5043	return(NULL);
				5044	len = xmlStrlen(cur);
				5045	ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
				5046	if (ctxt == NULL)
				5047	return(NULL);
				5048
				5049	if (encoding != NULL) {
				5050	xmlCharEncoding enc;
				5051	xmlCharEncodingHandlerPtr handler;
				5052
				5053	if (ctxt->input->encoding != NULL)
				5054	xmlFree((xmlChar *) ctxt->input->encoding);
				5055	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
				5056
				5057	enc = xmlParseCharEncoding(encoding);
				5058	/*
				5059	* registered set of known encodings
				5060	*/
				5061	if (enc != XML_CHAR_ENCODING_ERROR) {
				5062	xmlSwitchEncoding(ctxt, enc);
				5063	if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
				5064	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
				5065	"Unsupported encoding %s\n",
				5066	(const xmlChar *) encoding, NULL);
				5067	}
				5068	} else {
				5069	/*
				5070	* fallback for unknown encodings
				5071	*/
				5072	handler = xmlFindCharEncodingHandler((const char *) encoding);
				5073	if (handler != NULL) {
				5074	xmlSwitchToEncoding(ctxt, handler);
				5075	} else {
				5076	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
				5077	"Unsupported encoding %s\n",
				5078	(const xmlChar *) encoding, NULL);
				5079	}
				5080	}
				5081	}
				5082	return(ctxt);
				5083	}
				5084
				5085	#ifdef LIBXML_PUSH_ENABLED
				5086	/************************************************************************
				5087	* *
				5088	* Progressive parsing interfaces *
				5089	* *
				5090	************************************************************************/
				5091
				5092	/**
				5093	* htmlParseLookupSequence:
				5094	* @ctxt: an HTML parser context
				5095	* @first: the first char to lookup
				5096	* @next: the next char to lookup or zero
				5097	* @third: the next char to lookup or zero
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5098	* @ignoreattrval: skip over attribute values
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5099	*
				5100	* Try to find if a sequence (first, next, third) or just (first next) or
				5101	* (first) is available in the input stream.
				5102	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				5103	* to avoid rescanning sequences of bytes, it DOES change the state of the
				5104	* parser, do not use liberally.
				5105	* This is basically similar to xmlParseLookupSequence()
				5106	*
				5107	* Returns the index to the current parsing point if the full sequence
				5108	* is available, -1 otherwise.
				5109	*/
				5110	static int
				5111	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5112	xmlChar next, xmlChar third, int ignoreattrval)
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5113	{
				5114	int base, len;
				5115	htmlParserInputPtr in;
				5116	const xmlChar *buf;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5117	int invalue = 0;
				5118	char valdellim = 0x0;
				5119
				5120	in = ctxt->input;
				5121	if (in == NULL)
				5122	return (-1);
				5123
				5124	base = in->cur - in->base;
				5125	if (base < 0)
				5126	return (-1);
				5127
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5128	if (ctxt->checkIndex > base) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5129	base = ctxt->checkIndex;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5130	/* Abuse hasPErefs member to restore current state. */
				5131	invalue = ctxt->hasPErefs & 1 ? 1 : 0;
				5132	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5133
				5134	if (in->buf == NULL) {
				5135	buf = in->base;
				5136	len = in->length;
				5137	} else {
				5138	buf = xmlBufContent(in->buf->buffer);
				5139	len = xmlBufUse(in->buf->buffer);
				5140	}
				5141
				5142	/* take into account the sequence length */
				5143	if (third)
				5144	len -= 2;
				5145	else if (next)
				5146	len--;
				5147	for (; base < len; base++) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5148	if (ignoreattrval) {
				5149	if (buf[base] == '"' \|\| buf[base] == '\'') {
				5150	if (invalue) {
				5151	if (buf[base] == valdellim) {
				5152	invalue = 0;
				5153	continue;
				5154	}
				5155	} else {
				5156	valdellim = buf[base];
				5157	invalue = 1;
				5158	continue;
				5159	}
				5160	} else if (invalue) {
				5161	continue;
				5162	}
				5163	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5164	if (buf[base] == first) {
				5165	if (third != 0) {
				5166	if ((buf[base + 1] != next) \|\| (buf[base + 2] != third))
				5167	continue;
				5168	} else if (next != 0) {
				5169	if (buf[base + 1] != next)
				5170	continue;
				5171	}
				5172	ctxt->checkIndex = 0;
				5173	#ifdef DEBUG_PUSH
				5174	if (next == 0)
				5175	xmlGenericError(xmlGenericErrorContext,
				5176	"HPP: lookup '%c' found at %d\n",
				5177	first, base);
				5178	else if (third == 0)
				5179	xmlGenericError(xmlGenericErrorContext,
				5180	"HPP: lookup '%c%c' found at %d\n",
				5181	first, next, base);
				5182	else
				5183	xmlGenericError(xmlGenericErrorContext,
				5184	"HPP: lookup '%c%c%c' found at %d\n",
				5185	first, next, third, base);
				5186	#endif
				5187	return (base - (in->cur - in->base));
				5188	}
				5189	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5190	ctxt->checkIndex = base;
				5191	/* Abuse hasPErefs member to track current state. */
				5192	if (invalue)
				5193	ctxt->hasPErefs \|= 1;
				5194	else
				5195	ctxt->hasPErefs &= ~1;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5196	#ifdef DEBUG_PUSH
				5197	if (next == 0)
				5198	xmlGenericError(xmlGenericErrorContext,
				5199	"HPP: lookup '%c' failed\n", first);
				5200	else if (third == 0)
				5201	xmlGenericError(xmlGenericErrorContext,
				5202	"HPP: lookup '%c%c' failed\n", first, next);
				5203	else
				5204	xmlGenericError(xmlGenericErrorContext,
				5205	"HPP: lookup '%c%c%c' failed\n", first, next,
				5206	third);
				5207	#endif
				5208	return (-1);
				5209	}
				5210
				5211	/**
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5212	* htmlParseTryOrFinish:
				5213	* @ctxt: an HTML parser context
				5214	* @terminate: last chunk indicator
				5215	*
				5216	* Try to progress on parsing
					*
					* Push-parser state machine driven by ctxt->instate. Each loop
					* iteration recomputes how many bytes are available in ctxt->input
					* and dispatches on the current state. A state jumps to "done" when
					* the buffered data is not sufficient to continue; when @terminate
					* is non-zero the htmlParseLookupSequence() completeness checks are
					* skipped and the parse routines are called on whatever is buffered.
				5217	*
				5218	* Returns zero if no parsing was possible
					*
					* NOTE(review): "ret" is initialised to 0 and never assigned again in
					* this function, so the value returned is 0 on every path.
				5219	*/
				5220	static int
				5221	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				5222	int ret = 0;
				5223	htmlParserInputPtr in;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5224	ptrdiff_t avail = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5225	xmlChar cur, next;
				5226
					/* Only written and pushed when ctxt->record_info is set (START_TAG state). */
				5227	htmlParserNodeInfo node_info;
				5228
				5229	#ifdef DEBUG_PUSH
				5230	switch (ctxt->instate) {
				5231	case XML_PARSER_EOF:
				5232	xmlGenericError(xmlGenericErrorContext,
				5233	"HPP: try EOF\n"); break;
				5234	case XML_PARSER_START:
				5235	xmlGenericError(xmlGenericErrorContext,
				5236	"HPP: try START\n"); break;
				5237	case XML_PARSER_MISC:
				5238	xmlGenericError(xmlGenericErrorContext,
				5239	"HPP: try MISC\n");break;
				5240	case XML_PARSER_COMMENT:
				5241	xmlGenericError(xmlGenericErrorContext,
				5242	"HPP: try COMMENT\n");break;
				5243	case XML_PARSER_PROLOG:
				5244	xmlGenericError(xmlGenericErrorContext,
				5245	"HPP: try PROLOG\n");break;
				5246	case XML_PARSER_START_TAG:
				5247	xmlGenericError(xmlGenericErrorContext,
				5248	"HPP: try START_TAG\n");break;
				5249	case XML_PARSER_CONTENT:
				5250	xmlGenericError(xmlGenericErrorContext,
				5251	"HPP: try CONTENT\n");break;
				5252	case XML_PARSER_CDATA_SECTION:
				5253	xmlGenericError(xmlGenericErrorContext,
				5254	"HPP: try CDATA_SECTION\n");break;
				5255	case XML_PARSER_END_TAG:
				5256	xmlGenericError(xmlGenericErrorContext,
				5257	"HPP: try END_TAG\n");break;
				5258	case XML_PARSER_ENTITY_DECL:
				5259	xmlGenericError(xmlGenericErrorContext,
				5260	"HPP: try ENTITY_DECL\n");break;
				5261	case XML_PARSER_ENTITY_VALUE:
				5262	xmlGenericError(xmlGenericErrorContext,
				5263	"HPP: try ENTITY_VALUE\n");break;
				5264	case XML_PARSER_ATTRIBUTE_VALUE:
				5265	xmlGenericError(xmlGenericErrorContext,
				5266	"HPP: try ATTRIBUTE_VALUE\n");break;
				5267	case XML_PARSER_DTD:
				5268	xmlGenericError(xmlGenericErrorContext,
				5269	"HPP: try DTD\n");break;
				5270	case XML_PARSER_EPILOG:
				5271	xmlGenericError(xmlGenericErrorContext,
				5272	"HPP: try EPILOG\n");break;
				5273	case XML_PARSER_PI:
				5274	xmlGenericError(xmlGenericErrorContext,
				5275	"HPP: try PI\n");break;
				5276	case XML_PARSER_SYSTEM_LITERAL:
				5277	xmlGenericError(xmlGenericErrorContext,
				5278	"HPP: try SYSTEM_LITERAL\n");break;
				5279	}
				5280	#endif
				5281
					/*
					 * Main loop: one state-machine step per iteration. It is left with
					 * "break" when there is no current input, or with "goto done".
					 */
				5282	while (1) {
				5283
				5284	in = ctxt->input;
				5285	if (in == NULL) break;
					/*
					 * avail = bytes between in->cur and the end of the data: based on
					 * in->length when the input has no buffer object, otherwise on the
					 * used size of in->buf->buffer.
					 */
				5286	if (in->buf == NULL)
				5287	avail = in->length - (in->cur - in->base);
				5288	else
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5289	avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
				5290	(in->cur - in->base);
					/*
					 * Input exhausted on the last chunk: call htmlAutoCloseOnEnd(), then,
					 * if ctxt->nameNr is 0 and the state is not already EOF, switch to
					 * EOF and fire the SAX endDocument callback.
					 */
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5291	if ((avail == 0) && (terminate)) {
				5292	htmlAutoCloseOnEnd(ctxt);
				5293	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				5294	/*
				5295	* SAX: end of the document processing.
				5296	*/
				5297	ctxt->instate = XML_PARSER_EOF;
				5298	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				5299	ctxt->sax->endDocument(ctxt->userData);
				5300	}
				5301	}
				5302	if (avail < 1)
				5303	goto done;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5304	/*
				5305	* This is done to make progress and avoid an infinite loop
				5306	* if a parsing attempt was aborted by hitting a NUL byte. After
				5307	* changing htmlCurrentChar, this probably isn't necessary anymore.
				5308	* We should consider removing this check.
				5309	*/
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5310	cur = in->cur[0];
				5311	if (cur == 0) {
				5312	SKIP(1);
				5313	continue;
				5314	}
				5315
				5316	switch (ctxt->instate) {
				5317	case XML_PARSER_EOF:
				5318	/*
				5319	* Document parsing is done !
				5320	*/
				5321	goto done;
				5322	case XML_PARSER_START:
				5323	/*
				5324	* Very first chars read from the document flow.
				5325	*/
				5326	cur = in->cur[0];
				5327	if (IS_BLANK_CH(cur)) {
					/* SKIP_BLANKS presumably advances in->cur, so avail is recomputed. */
				5328	SKIP_BLANKS;
				5329	if (in->buf == NULL)
				5330	avail = in->length - (in->cur - in->base);
				5331	else
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5332	avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
				5333	(in->cur - in->base);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5334	}
				5335	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				5336	ctxt->sax->setDocumentLocator(ctxt->userData,
				5337	&xmlDefaultSAXLocator);
				5338	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				5339	(!ctxt->disableSAX))
				5340	ctxt->sax->startDocument(ctxt->userData);
				5341
					/*
					 * NOTE(review): in->cur[1] is read here without an avail >= 2 test;
					 * presumably this relies on the buffer being NUL-terminated - confirm.
					 */
				5342	cur = in->cur[0];
				5343	next = in->cur[1];
				5344	if ((cur == '<') && (next == '!') &&
				5345	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				5346	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				5347	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				5348	(UPP(8) == 'E')) {
					/*
					 * Unless this is the last chunk, wait until a closing '>' is
					 * buffered. The trailing 1 is presumably the "ignoreattrval" flag
					 * of htmlParseLookupSequence (skip quoted values) - confirm.
					 */
				5349	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5350	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5351	goto done;
				5352	#ifdef DEBUG_PUSH
				5353	xmlGenericError(xmlGenericErrorContext,
				5354	"HPP: Parsing internal subset\n");
				5355	#endif
				5356	htmlParseDocTypeDecl(ctxt);
				5357	ctxt->instate = XML_PARSER_PROLOG;
				5358	#ifdef DEBUG_PUSH
				5359	xmlGenericError(xmlGenericErrorContext,
				5360	"HPP: entering PROLOG\n");
				5361	#endif
				5362	} else {
				5363	ctxt->instate = XML_PARSER_MISC;
				5364	#ifdef DEBUG_PUSH
				5365	xmlGenericError(xmlGenericErrorContext,
				5366	"HPP: entering MISC\n");
				5367	#endif
				5368	}
				5369	break;
				5370	case XML_PARSER_MISC:
				5371	SKIP_BLANKS;
				5372	if (in->buf == NULL)
				5373	avail = in->length - (in->cur - in->base);
				5374	else
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5375	avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
				5376	(in->cur - in->base);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5377	/*
				5378	* no chars in buffer
				5379	*/
				5380	if (avail < 1)
				5381	goto done;
				5382	/*
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5383	* not enough chars in buffer
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5384	*/
					/*
					 * With a single byte left on the last chunk, a blank is used as the
					 * lookahead so that none of the "<!" / "<?" tests below can match.
					 */
				5385	if (avail < 2) {
				5386	if (!terminate)
				5387	goto done;
				5388	else
				5389	next = ' ';
				5390	} else {
				5391	next = in->cur[1];
				5392	}
				5393	cur = in->cur[0];
				5394	if ((cur == '<') && (next == '!') &&
				5395	(in->cur[2] == '-') && (in->cur[3] == '-')) {
					/* Comment: unless terminating, require the closing "-->" first. */
				5396	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5397	(htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5398	goto done;
				5399	#ifdef DEBUG_PUSH
				5400	xmlGenericError(xmlGenericErrorContext,
				5401	"HPP: Parsing Comment\n");
				5402	#endif
				5403	htmlParseComment(ctxt);
				5404	ctxt->instate = XML_PARSER_MISC;
				5405	} else if ((cur == '<') && (next == '?')) {
				5406	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5407	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5408	goto done;
				5409	#ifdef DEBUG_PUSH
				5410	xmlGenericError(xmlGenericErrorContext,
				5411	"HPP: Parsing PI\n");
				5412	#endif
				5413	htmlParsePI(ctxt);
				5414	ctxt->instate = XML_PARSER_MISC;
				5415	} else if ((cur == '<') && (next == '!') &&
				5416	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				5417	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				5418	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				5419	(UPP(8) == 'E')) {
				5420	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5421	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5422	goto done;
				5423	#ifdef DEBUG_PUSH
				5424	xmlGenericError(xmlGenericErrorContext,
				5425	"HPP: Parsing internal subset\n");
				5426	#endif
				5427	htmlParseDocTypeDecl(ctxt);
				5428	ctxt->instate = XML_PARSER_PROLOG;
				5429	#ifdef DEBUG_PUSH
				5430	xmlGenericError(xmlGenericErrorContext,
				5431	"HPP: entering PROLOG\n");
				5432	#endif
					/*
					 * "<!" seen with fewer than 9 bytes buffered: too short to hold
					 * the 9-character "<!DOCTYPE" tested above, so wait for more data.
					 */
				5433	} else if ((cur == '<') && (next == '!') &&
				5434	(avail < 9)) {
				5435	goto done;
				5436	} else {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5437	ctxt->instate = XML_PARSER_CONTENT;
					/*
					 * NOTE(review): the debug message below says START_TAG although the
					 * state set above is XML_PARSER_CONTENT.
					 */
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5438	#ifdef DEBUG_PUSH
				5439	xmlGenericError(xmlGenericErrorContext,
				5440	"HPP: entering START_TAG\n");
				5441	#endif
				5442	}
				5443	break;
				5444	case XML_PARSER_PROLOG:
				5445	SKIP_BLANKS;
				5446	if (in->buf == NULL)
				5447	avail = in->length - (in->cur - in->base);
				5448	else
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5449	avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
				5450	(in->cur - in->base);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5451	if (avail < 2)
				5452	goto done;
				5453	cur = in->cur[0];
				5454	next = in->cur[1];
				5455	if ((cur == '<') && (next == '!') &&
				5456	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				5457	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5458	(htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5459	goto done;
				5460	#ifdef DEBUG_PUSH
				5461	xmlGenericError(xmlGenericErrorContext,
				5462	"HPP: Parsing Comment\n");
				5463	#endif
				5464	htmlParseComment(ctxt);
				5465	ctxt->instate = XML_PARSER_PROLOG;
				5466	} else if ((cur == '<') && (next == '?')) {
				5467	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5468	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5469	goto done;
				5470	#ifdef DEBUG_PUSH
				5471	xmlGenericError(xmlGenericErrorContext,
				5472	"HPP: Parsing PI\n");
				5473	#endif
				5474	htmlParsePI(ctxt);
				5475	ctxt->instate = XML_PARSER_PROLOG;
					/* "<!" with fewer than 4 bytes: cannot yet tell whether it is "<!--". */
				5476	} else if ((cur == '<') && (next == '!') &&
				5477	(avail < 4)) {
				5478	goto done;
				5479	} else {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5480	ctxt->instate = XML_PARSER_CONTENT;
					/*
					 * NOTE(review): the debug message below says START_TAG although the
					 * state set above is XML_PARSER_CONTENT.
					 */
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5481	#ifdef DEBUG_PUSH
				5482	xmlGenericError(xmlGenericErrorContext,
				5483	"HPP: entering START_TAG\n");
				5484	#endif
				5485	}
				5486	break;
				5487	case XML_PARSER_EPILOG:
				5488	if (in->buf == NULL)
				5489	avail = in->length - (in->cur - in->base);
				5490	else
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5491	avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
				5492	(in->cur - in->base);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5493	if (avail < 1)
				5494	goto done;
				5495	cur = in->cur[0];
					/* Leading blank: hand it to htmlParseCharData() and stop this pass. */
				5496	if (IS_BLANK_CH(cur)) {
				5497	htmlParseCharData(ctxt);
				5498	goto done;
				5499	}
				5500	if (avail < 2)
				5501	goto done;
				5502	next = in->cur[1];
				5503	if ((cur == '<') && (next == '!') &&
				5504	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				5505	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5506	(htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5507	goto done;
				5508	#ifdef DEBUG_PUSH
				5509	xmlGenericError(xmlGenericErrorContext,
				5510	"HPP: Parsing Comment\n");
				5511	#endif
				5512	htmlParseComment(ctxt);
				5513	ctxt->instate = XML_PARSER_EPILOG;
				5514	} else if ((cur == '<') && (next == '?')) {
				5515	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5516	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5517	goto done;
				5518	#ifdef DEBUG_PUSH
				5519	xmlGenericError(xmlGenericErrorContext,
				5520	"HPP: Parsing PI\n");
				5521	#endif
				5522	htmlParsePI(ctxt);
				5523	ctxt->instate = XML_PARSER_EPILOG;
				5524	} else if ((cur == '<') && (next == '!') &&
				5525	(avail < 4)) {
				5526	goto done;
				5527	} else {
					/*
					 * Anything else in the epilog: record XML_ERR_DOCUMENT_END, clear
					 * wellFormed, switch to EOF and fire endDocument.
					 */
				5528	ctxt->errNo = XML_ERR_DOCUMENT_END;
				5529	ctxt->wellFormed = 0;
				5530	ctxt->instate = XML_PARSER_EOF;
				5531	#ifdef DEBUG_PUSH
				5532	xmlGenericError(xmlGenericErrorContext,
				5533	"HPP: entering EOF\n");
				5534	#endif
				5535	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				5536	ctxt->sax->endDocument(ctxt->userData);
				5537	goto done;
				5538	}
				5539	break;
				5540	case XML_PARSER_START_TAG: {
				5541	const xmlChar *name;
				5542	int failed;
				5543	const htmlElemDesc * info;
				5544
				5545	/*
				5546	* no chars in buffer
				5547	*/
				5548	if (avail < 1)
				5549	goto done;
				5550	/*
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5551	* not enough chars in buffer
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5552	*/
				5553	if (avail < 2) {
				5554	if (!terminate)
				5555	goto done;
				5556	else
				5557	next = ' ';
				5558	} else {
				5559	next = in->cur[1];
				5560	}
				5561	cur = in->cur[0];
				5562	if (cur != '<') {
				5563	ctxt->instate = XML_PARSER_CONTENT;
				5564	#ifdef DEBUG_PUSH
				5565	xmlGenericError(xmlGenericErrorContext,
				5566	"HPP: entering CONTENT\n");
				5567	#endif
				5568	break;
				5569	}
				5570	if (next == '/') {
				5571	ctxt->instate = XML_PARSER_END_TAG;
				5572	ctxt->checkIndex = 0;
				5573	#ifdef DEBUG_PUSH
				5574	xmlGenericError(xmlGenericErrorContext,
				5575	"HPP: entering END_TAG\n");
				5576	#endif
				5577	break;
				5578	}
					/* Unless terminating, wait until the tag's closing '>' is buffered. */
				5579	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5580	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5581	goto done;
				5582
				5583	/* Capture start position */
				5584	if (ctxt->record_info) {
				5585	node_info.begin_pos = ctxt->input->consumed +
				5586	(CUR_PTR - ctxt->input->base);
				5587	node_info.begin_line = ctxt->input->line;
				5588	}
				5589
				5590
				5591	failed = htmlParseStartTag(ctxt);
				5592	name = ctxt->name;
					/*
					 * Start tag could not be parsed (or no current element name):
					 * step over a pending '>' and leave instate untouched here.
					 */
				5593	if ((failed == -1) \|\|
				5594	(name == NULL)) {
				5595	if (CUR == '>')
				5596	NEXT;
				5597	break;
				5598	}
				5599
				5600	/*
				5601	* Lookup the info for that element.
				5602	*/
				5603	info = htmlTagLookup(name);
				5604	if (info == NULL) {
				5605	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
				5606	"Tag %s invalid\n", name, NULL);
				5607	}
				5608
				5609	/*
				5610	* Check for an Empty Element labeled the XML/SGML way
				5611	*/
				5612	if ((CUR == '/') && (NXT(1) == '>')) {
				5613	SKIP(2);
				5614	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				5615	ctxt->sax->endElement(ctxt->userData, name);
				5616	htmlnamePop(ctxt);
				5617	ctxt->instate = XML_PARSER_CONTENT;
				5618	#ifdef DEBUG_PUSH
				5619	xmlGenericError(xmlGenericErrorContext,
				5620	"HPP: entering CONTENT\n");
				5621	#endif
				5622	break;
				5623	}
				5624
				5625	if (CUR == '>') {
				5626	NEXT;
				5627	} else {
				5628	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
				5629	"Couldn't find end of Start Tag %s\n",
				5630	name, NULL);
				5631
				5632	/*
				5633	* end of parsing of this node.
				5634	*/
				5635	if (xmlStrEqual(name, ctxt->name)) {
				5636	nodePop(ctxt);
				5637	htmlnamePop(ctxt);
				5638	}
				5639
				5640	if (ctxt->record_info)
				5641	htmlNodeInfoPush(ctxt, &node_info);
				5642
				5643	ctxt->instate = XML_PARSER_CONTENT;
				5644	#ifdef DEBUG_PUSH
				5645	xmlGenericError(xmlGenericErrorContext,
				5646	"HPP: entering CONTENT\n");
				5647	#endif
				5648	break;
				5649	}
				5650
				5651	/*
				5652	* Check for an Empty Element from DTD definition
				5653	*/
				5654	if ((info != NULL) && (info->empty)) {
				5655	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				5656	ctxt->sax->endElement(ctxt->userData, name);
				5657	htmlnamePop(ctxt);
				5658	}
				5659
				5660	if (ctxt->record_info)
				5661	htmlNodeInfoPush(ctxt, &node_info);
				5662
				5663	ctxt->instate = XML_PARSER_CONTENT;
				5664	#ifdef DEBUG_PUSH
				5665	xmlGenericError(xmlGenericErrorContext,
				5666	"HPP: entering CONTENT\n");
				5667	#endif
				5668	break;
				5669	}
				5670	case XML_PARSER_CONTENT: {
					/* Two-byte scratch buffer: one character plus a terminating 0. */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5671	xmlChar chr[2] = { 0, 0 };
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5672
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5673	/*
				5674	* Handle preparsed entities and charRef
				5675	*/
				5676	if (ctxt->token != 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5677	chr[0] = (xmlChar) ctxt->token;
				5678	htmlCheckParagraph(ctxt);
				5679	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				5680	ctxt->sax->characters(ctxt->userData, chr, 1);
				5681	ctxt->token = 0;
				5682	ctxt->checkIndex = 0;
				5683	}
					/*
					 * Last chunk with exactly one byte left: deliver it directly unless
					 * it is '<' or '&', which are handled by the code further down.
					 */
				5684	if ((avail == 1) && (terminate)) {
				5685	cur = in->cur[0];
				5686	if ((cur != '<') && (cur != '&')) {
				5687	if (ctxt->sax != NULL) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5688	chr[0] = cur;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5689	if (IS_BLANK_CH(cur)) {
				5690	if (ctxt->keepBlanks) {
				5691	if (ctxt->sax->characters != NULL)
				5692	ctxt->sax->characters(
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5693	ctxt->userData, chr, 1);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5694	} else {
				5695	if (ctxt->sax->ignorableWhitespace != NULL)
				5696	ctxt->sax->ignorableWhitespace(
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5697	ctxt->userData, chr, 1);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5698	}
				5699	} else {
				5700	htmlCheckParagraph(ctxt);
				5701	if (ctxt->sax->characters != NULL)
				5702	ctxt->sax->characters(
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5703	ctxt->userData, chr, 1);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5704	}
				5705	}
				5706	ctxt->token = 0;
				5707	ctxt->checkIndex = 0;
				5708	in->cur++;
				5709	break;
				5710	}
				5711	}
				5712	if (avail < 2)
				5713	goto done;
				5714	cur = in->cur[0];
				5715	next = in->cur[1];
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5716	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				5717	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				5718	/*
				5719	* Handle SCRIPT/STYLE separately
				5720	*/
					/*
					 * Not the last chunk: require a buffered "</" and a non-NUL byte
					 * right after it before the script/style content is parsed.
					 */
				5721	if (!terminate) {
				5722	int idx;
				5723	xmlChar val;
				5724
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5725	idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5726	if (idx < 0)
				5727	goto done;
				5728	val = in->cur[idx + 2];
				5729	if (val == 0) /* bad cut of input */
				5730	goto done;
				5731	}
				5732	htmlParseScript(ctxt);
					/*
					 * cur/next were read before htmlParseScript() ran, so this tests the
					 * two bytes at the position where this step started.
					 */
				5733	if ((cur == '<') && (next == '/')) {
				5734	ctxt->instate = XML_PARSER_END_TAG;
				5735	ctxt->checkIndex = 0;
				5736	#ifdef DEBUG_PUSH
				5737	xmlGenericError(xmlGenericErrorContext,
				5738	"HPP: entering END_TAG\n");
				5739	#endif
				5740	break;
				5741	}
				5742	} else {
				5743	/*
				5744	* Sometimes DOCTYPE arrives in the middle of the document
				5745	*/
				5746	if ((cur == '<') && (next == '!') &&
				5747	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				5748	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				5749	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				5750	(UPP(8) == 'E')) {
				5751	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5752	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5753	goto done;
				5754	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				5755	"Misplaced DOCTYPE declaration\n",
				5756	BAD_CAST "DOCTYPE" , NULL);
				5757	htmlParseDocTypeDecl(ctxt);
				5758	} else if ((cur == '<') && (next == '!') &&
				5759	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				5760	if ((!terminate) &&
				5761	(htmlParseLookupSequence(
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5762	ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5763	goto done;
				5764	#ifdef DEBUG_PUSH
				5765	xmlGenericError(xmlGenericErrorContext,
				5766	"HPP: Parsing Comment\n");
				5767	#endif
				5768	htmlParseComment(ctxt);
				5769	ctxt->instate = XML_PARSER_CONTENT;
				5770	} else if ((cur == '<') && (next == '?')) {
				5771	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5772	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5773	goto done;
				5774	#ifdef DEBUG_PUSH
				5775	xmlGenericError(xmlGenericErrorContext,
				5776	"HPP: Parsing PI\n");
				5777	#endif
				5778	htmlParsePI(ctxt);
				5779	ctxt->instate = XML_PARSER_CONTENT;
				5780	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				5781	goto done;
				5782	} else if ((cur == '<') && (next == '/')) {
				5783	ctxt->instate = XML_PARSER_END_TAG;
				5784	ctxt->checkIndex = 0;
				5785	#ifdef DEBUG_PUSH
				5786	xmlGenericError(xmlGenericErrorContext,
				5787	"HPP: entering END_TAG\n");
				5788	#endif
				5789	break;
				5790	} else if (cur == '<') {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5791	if ((!terminate) && (next == 0))
				5792	goto done;
				5793	/*
				5794	* Only switch to START_TAG if the next character
				5795	* starts a valid name. Otherwise, htmlParseStartTag
				5796	* might return without consuming all characters
				5797	* up to the final '>'.
				5798	*/
				5799	if ((IS_ASCII_LETTER(next)) \|\|
				5800	(next == '_') \|\| (next == ':') \|\| (next == '.')) {
				5801	ctxt->instate = XML_PARSER_START_TAG;
				5802	ctxt->checkIndex = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5803	#ifdef DEBUG_PUSH
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5804	xmlGenericError(xmlGenericErrorContext,
				5805	"HPP: entering START_TAG\n");
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5806	#endif
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5807	} else {
					/* Not a tag: report the error and emit the '<' itself as text. */
				5808	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				5809	"htmlParseTryOrFinish: "
				5810	"invalid element name\n",
				5811	NULL, NULL);
				5812	htmlCheckParagraph(ctxt);
				5813	if ((ctxt->sax != NULL) &&
				5814	(ctxt->sax->characters != NULL))
				5815	ctxt->sax->characters(ctxt->userData,
				5816	in->cur, 1);
				5817	NEXT;
				5818	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5819	break;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5820	} else {
				5821	/*
				5822	* check that the text sequence is complete
				5823	* before handing out the data to the parser
				5824	* to avoid problems with erroneous end of
				5825	* data detection.
				5826	*/
				5827	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5828	(htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5829	goto done;
				5830	ctxt->checkIndex = 0;
				5831	#ifdef DEBUG_PUSH
				5832	xmlGenericError(xmlGenericErrorContext,
				5833	"HPP: Parsing char data\n");
				5834	#endif
					/* Consume references and character data up to the next '<' or NUL. */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5835	while ((cur != '<') && (cur != 0)) {
				5836	if (cur == '&') {
				5837	htmlParseReference(ctxt);
				5838	} else {
				5839	htmlParseCharData(ctxt);
				5840	}
				5841	cur = in->cur[0];
				5842	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5843	}
				5844	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5845
				5846	break;
				5847	}
				5848	case XML_PARSER_END_TAG:
				5849	if (avail < 2)
				5850	goto done;
				5851	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	5852	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5853	goto done;
				5854	htmlParseEndTag(ctxt);
					/*
					 * ctxt->nameNr == 0 after the end tag selects EPILOG, otherwise
					 * CONTENT. NOTE(review): the debug message below always says
					 * CONTENT, even when EPILOG was selected.
					 */
				5855	if (ctxt->nameNr == 0) {
				5856	ctxt->instate = XML_PARSER_EPILOG;
				5857	} else {
				5858	ctxt->instate = XML_PARSER_CONTENT;
				5859	}
				5860	ctxt->checkIndex = 0;
				5861	#ifdef DEBUG_PUSH
				5862	xmlGenericError(xmlGenericErrorContext,
				5863	"HPP: entering CONTENT\n");
				5864	#endif
				5865	break;
					/*
					 * The remaining states are treated as internal errors here: each
					 * one reports XML_ERR_INTERNAL_ERROR, resets checkIndex and falls
					 * back to CONTENT (START_TAG for ATTRIBUTE_VALUE).
					 */
				5866	case XML_PARSER_CDATA_SECTION:
				5867	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5868	"HPP: internal error, state == CDATA\n",
				5869	NULL, NULL);
				5870	ctxt->instate = XML_PARSER_CONTENT;
				5871	ctxt->checkIndex = 0;
				5872	#ifdef DEBUG_PUSH
				5873	xmlGenericError(xmlGenericErrorContext,
				5874	"HPP: entering CONTENT\n");
				5875	#endif
				5876	break;
				5877	case XML_PARSER_DTD:
				5878	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5879	"HPP: internal error, state == DTD\n",
				5880	NULL, NULL);
				5881	ctxt->instate = XML_PARSER_CONTENT;
				5882	ctxt->checkIndex = 0;
				5883	#ifdef DEBUG_PUSH
				5884	xmlGenericError(xmlGenericErrorContext,
				5885	"HPP: entering CONTENT\n");
				5886	#endif
				5887	break;
				5888	case XML_PARSER_COMMENT:
				5889	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5890	"HPP: internal error, state == COMMENT\n",
				5891	NULL, NULL);
				5892	ctxt->instate = XML_PARSER_CONTENT;
				5893	ctxt->checkIndex = 0;
				5894	#ifdef DEBUG_PUSH
				5895	xmlGenericError(xmlGenericErrorContext,
				5896	"HPP: entering CONTENT\n");
				5897	#endif
				5898	break;
				5899	case XML_PARSER_PI:
				5900	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5901	"HPP: internal error, state == PI\n",
				5902	NULL, NULL);
				5903	ctxt->instate = XML_PARSER_CONTENT;
				5904	ctxt->checkIndex = 0;
				5905	#ifdef DEBUG_PUSH
				5906	xmlGenericError(xmlGenericErrorContext,
				5907	"HPP: entering CONTENT\n");
				5908	#endif
				5909	break;
				5910	case XML_PARSER_ENTITY_DECL:
				5911	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5912	"HPP: internal error, state == ENTITY_DECL\n",
				5913	NULL, NULL);
				5914	ctxt->instate = XML_PARSER_CONTENT;
				5915	ctxt->checkIndex = 0;
				5916	#ifdef DEBUG_PUSH
				5917	xmlGenericError(xmlGenericErrorContext,
				5918	"HPP: entering CONTENT\n");
				5919	#endif
				5920	break;
				5921	case XML_PARSER_ENTITY_VALUE:
				5922	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5923	"HPP: internal error, state == ENTITY_VALUE\n",
				5924	NULL, NULL);
				5925	ctxt->instate = XML_PARSER_CONTENT;
				5926	ctxt->checkIndex = 0;
					/* NOTE(review): message says DTD but the state set above is CONTENT. */
				5927	#ifdef DEBUG_PUSH
				5928	xmlGenericError(xmlGenericErrorContext,
				5929	"HPP: entering DTD\n");
				5930	#endif
				5931	break;
				5932	case XML_PARSER_ATTRIBUTE_VALUE:
				5933	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5934	"HPP: internal error, state == ATTRIBUTE_VALUE\n",
				5935	NULL, NULL);
				5936	ctxt->instate = XML_PARSER_START_TAG;
				5937	ctxt->checkIndex = 0;
				5938	#ifdef DEBUG_PUSH
				5939	xmlGenericError(xmlGenericErrorContext,
				5940	"HPP: entering START_TAG\n");
				5941	#endif
				5942	break;
				5943	case XML_PARSER_SYSTEM_LITERAL:
				5944	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5945	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
				5946	NULL, NULL);
				5947	ctxt->instate = XML_PARSER_CONTENT;
				5948	ctxt->checkIndex = 0;
				5949	#ifdef DEBUG_PUSH
				5950	xmlGenericError(xmlGenericErrorContext,
				5951	"HPP: entering CONTENT\n");
				5952	#endif
				5953	break;
				5954	case XML_PARSER_IGNORE:
				5955	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5956	"HPP: internal error, state == XML_PARSER_IGNORE\n",
				5957	NULL, NULL);
				5958	ctxt->instate = XML_PARSER_CONTENT;
				5959	ctxt->checkIndex = 0;
				5960	#ifdef DEBUG_PUSH
				5961	xmlGenericError(xmlGenericErrorContext,
				5962	"HPP: entering CONTENT\n");
				5963	#endif
				5964	break;
					/*
					 * NOTE(review): the error text below names XML_PARSER_LITERAL while
					 * the case label is XML_PARSER_PUBLIC_LITERAL.
					 */
				5965	case XML_PARSER_PUBLIC_LITERAL:
				5966	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5967	"HPP: internal error, state == XML_PARSER_LITERAL\n",
				5968	NULL, NULL);
				5969	ctxt->instate = XML_PARSER_CONTENT;
				5970	ctxt->checkIndex = 0;
				5971	#ifdef DEBUG_PUSH
				5972	xmlGenericError(xmlGenericErrorContext,
				5973	"HPP: entering CONTENT\n");
				5974	#endif
				5975	break;
				5976
				5977	}
				5978	}
				5979	done:
					/*
					 * Same end-of-input handling as at the top of the loop, applied to
					 * the last avail value computed before leaving the loop.
					 */
				5980	if ((avail == 0) && (terminate)) {
				5981	htmlAutoCloseOnEnd(ctxt);
				5982	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				5983	/*
				5984	* SAX: end of the document processing.
				5985	*/
				5986	ctxt->instate = XML_PARSER_EOF;
				5987	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				5988	ctxt->sax->endDocument(ctxt->userData);
				5989	}
				5990	}
					/*
					 * Unless HTML_PARSE_NODEFDTD is set, attach a default HTML 4.0
					 * Transitional internal subset to a document that has none, once
					 * parsing is terminating or the state is EOF or EPILOG.
					 */
				5991	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
				5992	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				5993	(ctxt->instate == XML_PARSER_EPILOG))) {
				5994	xmlDtdPtr dtd;
				5995	dtd = xmlGetIntSubset(ctxt->myDoc);
				5996	if (dtd == NULL)
				5997	ctxt->myDoc->intSubset =
				5998	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
				5999	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				6000	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				6001	}
				6002	#ifdef DEBUG_PUSH
				6003	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				6004	#endif
				6005	return(ret);
				6006	}
				6007
				6008	/**
				6009	* htmlParseChunk:
				6010	* @ctxt: an HTML parser context
				6011	* @chunk: an char array
				6012	* @size: the size in byte of the chunk
				6013	* @terminate: last chunk indicator
				6014	*
				6015	* Parse a Chunk of memory
				6016	*
				6017	* Returns zero if no error, the xmlParserErrors otherwise.
				6018	*/
				6019	int
				6020	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				6021	int terminate) {
				6022	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				6023	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				6024	"htmlParseChunk: context error\n", NULL, NULL);
				6025	return(XML_ERR_INTERNAL_ERROR);
				6026	}
				6027	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				6028	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				6029	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
				6030	size_t cur = ctxt->input->cur - ctxt->input->base;
				6031	int res;
				6032
				6033	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	6034	xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6035	if (res < 0) {
				6036	ctxt->errNo = XML_PARSER_EOF;
				6037	ctxt->disableSAX = 1;
				6038	return (XML_PARSER_EOF);
				6039	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6040	#ifdef DEBUG_PUSH
				6041	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				6042	#endif
				6043
				6044	#if 0
				6045	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				6046	htmlParseTryOrFinish(ctxt, terminate);
				6047	#endif
				6048	} else if (ctxt->instate != XML_PARSER_EOF) {
				6049	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
				6050	xmlParserInputBufferPtr in = ctxt->input->buf;
				6051	if ((in->encoder != NULL) && (in->buffer != NULL) &&
				6052	(in->raw != NULL)) {
				6053	int nbchars;
				6054	size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
				6055	size_t current = ctxt->input->cur - ctxt->input->base;
				6056
				6057	nbchars = xmlCharEncInput(in, terminate);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame]	6058	xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6059	if (nbchars < 0) {
				6060	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				6061	"encoder error\n", NULL, NULL);
				6062	return(XML_ERR_INVALID_ENCODING);
				6063	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6064	}
				6065	}
				6066	}
				6067	htmlParseTryOrFinish(ctxt, terminate);
				6068	if (terminate) {
				6069	if ((ctxt->instate != XML_PARSER_EOF) &&
				6070	(ctxt->instate != XML_PARSER_EPILOG) &&
				6071	(ctxt->instate != XML_PARSER_MISC)) {
				6072	ctxt->errNo = XML_ERR_DOCUMENT_END;
				6073	ctxt->wellFormed = 0;
				6074	}
				6075	if (ctxt->instate != XML_PARSER_EOF) {
				6076	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				6077	ctxt->sax->endDocument(ctxt->userData);
				6078	}
				6079	ctxt->instate = XML_PARSER_EOF;
				6080	}
				6081	return((xmlParserErrors) ctxt->errNo);
				6082	}
				6083
				6084	/************************************************************************
				6085	* *
				6086	* User entry points *
				6087	* *
				6088	************************************************************************/
				6089
				6090	/**
				6091	* htmlCreatePushParserCtxt:
				6092	* @sax: a SAX handler
				6093	* @user_data: The user data returned on SAX callbacks
				6094	* @chunk: a pointer to an array of chars
				6095	* @size: number of chars in the array
				6096	* @filename: an optional file name or URI
				6097	* @enc: an optional encoding
				6098	*
				6099	* Create a parser context for using the HTML parser in push mode
				6100	* The value of @filename is used for fetching external entities
				6101	* and error/warning reports.
				6102	*
				6103	* Returns the new parser context or NULL
				6104	*/
				6105	htmlParserCtxtPtr
				6106	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				6107	const char chunk, int size, const char filename,
				6108	xmlCharEncoding enc) {
				6109	htmlParserCtxtPtr ctxt;
				6110	htmlParserInputPtr inputStream;
				6111	xmlParserInputBufferPtr buf;
				6112
				6113	xmlInitParser();
				6114
				6115	buf = xmlAllocParserInputBuffer(enc);
				6116	if (buf == NULL) return(NULL);
				6117
				6118	ctxt = htmlNewParserCtxt();
				6119	if (ctxt == NULL) {
				6120	xmlFreeParserInputBuffer(buf);
				6121	return(NULL);
				6122	}
				6123	if(enc==XML_CHAR_ENCODING_UTF8 \|\| buf->encoder)
				6124	ctxt->charset=XML_CHAR_ENCODING_UTF8;
				6125	if (sax != NULL) {
				6126	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
				6127	xmlFree(ctxt->sax);
				6128	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				6129	if (ctxt->sax == NULL) {
				6130	xmlFree(buf);
				6131	xmlFree(ctxt);
				6132	return(NULL);
				6133	}
				6134	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				6135	if (user_data != NULL)
				6136	ctxt->userData = user_data;
				6137	}
				6138	if (filename == NULL) {
				6139	ctxt->directory = NULL;
				6140	} else {
				6141	ctxt->directory = xmlParserGetDirectory(filename);
				6142	}
				6143
				6144	inputStream = htmlNewInputStream(ctxt);
				6145	if (inputStream == NULL) {
				6146	xmlFreeParserCtxt(ctxt);
				6147	xmlFree(buf);
				6148	return(NULL);
				6149	}
				6150
				6151	if (filename == NULL)
				6152	inputStream->filename = NULL;
				6153	else
				6154	inputStream->filename = (char *)
				6155	xmlCanonicPath((const xmlChar *) filename);
				6156	inputStream->buf = buf;
				6157	xmlBufResetInput(buf->buffer, inputStream);
				6158
				6159	inputPush(ctxt, inputStream);
				6160
				6161	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				6162	(ctxt->input->buf != NULL)) {
				6163	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
				6164	size_t cur = ctxt->input->cur - ctxt->input->base;
				6165
				6166	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				6167
				6168	xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
				6169	#ifdef DEBUG_PUSH
				6170	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				6171	#endif
				6172	}
				6173	ctxt->progressive = 1;
				6174
				6175	return(ctxt);
				6176	}
				6177	#endif /* LIBXML_PUSH_ENABLED */
				6178
				6179	/**
				6180	* htmlSAXParseDoc:
				6181	* @cur: a pointer to an array of xmlChar
				6182	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6183	* @sax: the SAX handler block
				6184	* @userData: if using SAX, this pointer will be provided on callbacks.
				6185	*
				6186	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
				6187	* to handle parse events. If sax is NULL, fallback to the default DOM
				6188	* behavior and return a tree.
				6189	*
				6190	* Returns the resulting document tree unless SAX is NULL or the document is
				6191	* not well formed.
				6192	*/
				6193
				6194	htmlDocPtr
				6195	htmlSAXParseDoc(const xmlChar cur, const char encoding,
				6196	htmlSAXHandlerPtr sax, void *userData) {
				6197	htmlDocPtr ret;
				6198	htmlParserCtxtPtr ctxt;
				6199
				6200	xmlInitParser();
				6201
				6202	if (cur == NULL) return(NULL);
				6203
				6204
				6205	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				6206	if (ctxt == NULL) return(NULL);
				6207	if (sax != NULL) {
				6208	if (ctxt->sax != NULL) xmlFree (ctxt->sax);
				6209	ctxt->sax = sax;
				6210	ctxt->userData = userData;
				6211	}
				6212
				6213	htmlParseDocument(ctxt);
				6214	ret = ctxt->myDoc;
				6215	if (sax != NULL) {
				6216	ctxt->sax = NULL;
				6217	ctxt->userData = NULL;
				6218	}
				6219	htmlFreeParserCtxt(ctxt);
				6220
				6221	return(ret);
				6222	}
				6223
				6224	/**
				6225	* htmlParseDoc:
				6226	* @cur: a pointer to an array of xmlChar
				6227	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6228	*
				6229	* parse an HTML in-memory document and build a tree.
				6230	*
				6231	* Returns the resulting document tree
				6232	*/
				6233
				6234	htmlDocPtr
				6235	htmlParseDoc(const xmlChar cur, const char encoding) {
				6236	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				6237	}
				6238
				6239
				6240	/**
				6241	* htmlCreateFileParserCtxt:
				6242	* @filename: the filename
				6243	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6244	*
				6245	* Create a parser context for a file content.
				6246	* Automatic support for ZLIB/Compress compressed document is provided
				6247	* by default if found at compile-time.
				6248	*
				6249	* Returns the new parser context or NULL
				6250	*/
				6251	htmlParserCtxtPtr
				6252	htmlCreateFileParserCtxt(const char filename, const char encoding)
				6253	{
				6254	htmlParserCtxtPtr ctxt;
				6255	htmlParserInputPtr inputStream;
				6256	char *canonicFilename;
				6257	/* htmlCharEncoding enc; */
				6258	xmlChar content, content_line = (xmlChar *) "charset=";
				6259
				6260	if (filename == NULL)
				6261	return(NULL);
				6262
				6263	ctxt = htmlNewParserCtxt();
				6264	if (ctxt == NULL) {
				6265	return(NULL);
				6266	}
				6267	canonicFilename = (char ) xmlCanonicPath((const xmlChar ) filename);
				6268	if (canonicFilename == NULL) {
				6269	#ifdef LIBXML_SAX1_ENABLED
				6270	if (xmlDefaultSAXHandler.error != NULL) {
				6271	xmlDefaultSAXHandler.error(NULL, "out of memory\n");
				6272	}
				6273	#endif
				6274	xmlFreeParserCtxt(ctxt);
				6275	return(NULL);
				6276	}
				6277
				6278	inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
				6279	xmlFree(canonicFilename);
				6280	if (inputStream == NULL) {
				6281	xmlFreeParserCtxt(ctxt);
				6282	return(NULL);
				6283	}
				6284
				6285	inputPush(ctxt, inputStream);
				6286
				6287	/* set encoding */
				6288	if (encoding) {
				6289	size_t l = strlen(encoding);
				6290
				6291	if (l < 1000) {
				6292	content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
				6293	if (content) {
				6294	strcpy ((char )content, (char )content_line);
				6295	strcat ((char )content, (char )encoding);
				6296	htmlCheckEncoding (ctxt, content);
				6297	xmlFree (content);
				6298	}
				6299	}
				6300	}
				6301
				6302	return(ctxt);
				6303	}
				6304
				6305	/**
				6306	* htmlSAXParseFile:
				6307	* @filename: the filename
				6308	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6309	* @sax: the SAX handler block
				6310	* @userData: if using SAX, this pointer will be provided on callbacks.
				6311	*
				6312	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				6313	* compressed document is provided by default if found at compile-time.
				6314	* It use the given SAX function block to handle the parsing callback.
				6315	* If sax is NULL, fallback to the default DOM tree building routines.
				6316	*
				6317	* Returns the resulting document tree unless SAX is NULL or the document is
				6318	* not well formed.
				6319	*/
				6320
				6321	htmlDocPtr
				6322	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				6323	void *userData) {
				6324	htmlDocPtr ret;
				6325	htmlParserCtxtPtr ctxt;
				6326	htmlSAXHandlerPtr oldsax = NULL;
				6327
				6328	xmlInitParser();
				6329
				6330	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				6331	if (ctxt == NULL) return(NULL);
				6332	if (sax != NULL) {
				6333	oldsax = ctxt->sax;
				6334	ctxt->sax = sax;
				6335	ctxt->userData = userData;
				6336	}
				6337
				6338	htmlParseDocument(ctxt);
				6339
				6340	ret = ctxt->myDoc;
				6341	if (sax != NULL) {
				6342	ctxt->sax = oldsax;
				6343	ctxt->userData = NULL;
				6344	}
				6345	htmlFreeParserCtxt(ctxt);
				6346
				6347	return(ret);
				6348	}
				6349
				6350	/**
				6351	* htmlParseFile:
				6352	* @filename: the filename
				6353	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6354	*
				6355	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				6356	* compressed document is provided by default if found at compile-time.
				6357	*
				6358	* Returns the resulting document tree
				6359	*/
				6360
				6361	htmlDocPtr
				6362	htmlParseFile(const char filename, const char encoding) {
				6363	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				6364	}
				6365
				6366	/**
				6367	* htmlHandleOmittedElem:
				6368	* @val: int 0 or 1
				6369	*
				6370	* Set and return the previous value for handling HTML omitted tags.
				6371	*
				6372	* Returns the last value for 0 for no handling, 1 for auto insertion.
				6373	*/
				6374
				6375	int
				6376	htmlHandleOmittedElem(int val) {
				6377	int old = htmlOmittedDefaultValue;
				6378
				6379	htmlOmittedDefaultValue = val;
				6380	return(old);
				6381	}
				6382
				6383	/**
				6384	* htmlElementAllowedHere:
				6385	* @parent: HTML parent element
				6386	* @elt: HTML element
				6387	*
				6388	* Checks whether an HTML element may be a direct child of a parent element.
				6389	* Note - doesn't check for deprecated elements
				6390	*
				6391	* Returns 1 if allowed; 0 otherwise.
				6392	*/
				6393	int
				6394	htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
				6395	const char** p ;
				6396
				6397	if ( ! elt \|\| ! parent \|\| ! parent->subelts )
				6398	return 0 ;
				6399
				6400	for ( p = parent->subelts; *p; ++p )
				6401	if ( !xmlStrcmp((const xmlChar )p, elt) )
				6402	return 1 ;
				6403
				6404	return 0 ;
				6405	}
				6406	/**
				6407	* htmlElementStatusHere:
				6408	* @parent: HTML parent element
				6409	* @elt: HTML element
				6410	*
				6411	* Checks whether an HTML element may be a direct child of a parent element.
				6412	* and if so whether it is valid or deprecated.
				6413	*
				6414	* Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
				6415	*/
				6416	htmlStatus
				6417	htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
				6418	if ( ! parent \|\| ! elt )
				6419	return HTML_INVALID ;
				6420	if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
				6421	return HTML_INVALID ;
				6422
				6423	return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
				6424	}
				6425	/**
				6426	* htmlAttrAllowed:
				6427	* @elt: HTML element
				6428	* @attr: HTML attribute
				6429	* @legacy: whether to allow deprecated attributes
				6430	*
				6431	* Checks whether an attribute is valid for an element
				6432	* Has full knowledge of Required and Deprecated attributes
				6433	*
				6434	* Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
				6435	*/
				6436	htmlStatus
				6437	htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
				6438	const char** p ;
				6439
				6440	if ( !elt \|\| ! attr )
				6441	return HTML_INVALID ;
				6442
				6443	if ( elt->attrs_req )
				6444	for ( p = elt->attrs_req; *p; ++p)
				6445	if ( !xmlStrcmp((const xmlChar)p, attr) )
				6446	return HTML_REQUIRED ;
				6447
				6448	if ( elt->attrs_opt )
				6449	for ( p = elt->attrs_opt; *p; ++p)
				6450	if ( !xmlStrcmp((const xmlChar)p, attr) )
				6451	return HTML_VALID ;
				6452
				6453	if ( legacy && elt->attrs_depr )
				6454	for ( p = elt->attrs_depr; *p; ++p)
				6455	if ( !xmlStrcmp((const xmlChar)p, attr) )
				6456	return HTML_DEPRECATED ;
				6457
				6458	return HTML_INVALID ;
				6459	}
				6460	/**
				6461	* htmlNodeStatus:
				6462	* @node: an htmlNodePtr in a tree
				6463	* @legacy: whether to allow deprecated elements (YES is faster here
				6464	* for Element nodes)
				6465	*
				6466	* Checks whether the tree node is valid. Experimental (the author
				6467	* only uses the HTML enhancements in a SAX parser)
				6468	*
				6469	* Return: for Element nodes, a return from htmlElementAllowedHere (if
				6470	* legacy allowed) or htmlElementStatusHere (otherwise).
				6471	* for Attribute nodes, a return from htmlAttrAllowed
				6472	* for other nodes, HTML_NA (no checks performed)
				6473	*/
				6474	htmlStatus
				6475	htmlNodeStatus(const htmlNodePtr node, int legacy) {
				6476	if ( ! node )
				6477	return HTML_INVALID ;
				6478
				6479	switch ( node->type ) {
				6480	case XML_ELEMENT_NODE:
				6481	return legacy
				6482	? ( htmlElementAllowedHere (
				6483	htmlTagLookup(node->parent->name) , node->name
				6484	) ? HTML_VALID : HTML_INVALID )
				6485	: htmlElementStatusHere(
				6486	htmlTagLookup(node->parent->name) ,
				6487	htmlTagLookup(node->name) )
				6488	;
				6489	case XML_ATTRIBUTE_NODE:
				6490	return htmlAttrAllowed(
				6491	htmlTagLookup(node->parent->name) , node->name, legacy) ;
				6492	default: return HTML_NA ;
				6493	}
				6494	}
				6495	/************************************************************************
				6496	* *
				6497	* New set (2.6.0) of simpler and more flexible APIs *
				6498	* *
				6499	************************************************************************/
				6500	/**
				6501	* DICT_FREE:
				6502	* @str: a string
				6503	*
				6504	* Free a string if it is not owned by the "dict" dictionary in the
				6505	* current scope
				6506	*/
				6507	#define DICT_FREE(str) \
				6508	if ((str) && ((!dict) \|\| \
				6509	(xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \
				6510	xmlFree((char *)(str));
				6511
				6512	/**
				6513	* htmlCtxtReset:
				6514	* @ctxt: an HTML parser context
				6515	*
				6516	* Reset a parser context
				6517	*/
				6518	void
				6519	htmlCtxtReset(htmlParserCtxtPtr ctxt)
				6520	{
				6521	xmlParserInputPtr input;
				6522	xmlDictPtr dict;
				6523
				6524	if (ctxt == NULL)
				6525	return;
				6526
				6527	xmlInitParser();
				6528	dict = ctxt->dict;
				6529
				6530	while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
				6531	xmlFreeInputStream(input);
				6532	}
				6533	ctxt->inputNr = 0;
				6534	ctxt->input = NULL;
				6535
				6536	ctxt->spaceNr = 0;
				6537	if (ctxt->spaceTab != NULL) {
				6538	ctxt->spaceTab[0] = -1;
				6539	ctxt->space = &ctxt->spaceTab[0];
				6540	} else {
				6541	ctxt->space = NULL;
				6542	}
				6543
				6544
				6545	ctxt->nodeNr = 0;
				6546	ctxt->node = NULL;
				6547
				6548	ctxt->nameNr = 0;
				6549	ctxt->name = NULL;
				6550
				6551	DICT_FREE(ctxt->version);
				6552	ctxt->version = NULL;
				6553	DICT_FREE(ctxt->encoding);
				6554	ctxt->encoding = NULL;
				6555	DICT_FREE(ctxt->directory);
				6556	ctxt->directory = NULL;
				6557	DICT_FREE(ctxt->extSubURI);
				6558	ctxt->extSubURI = NULL;
				6559	DICT_FREE(ctxt->extSubSystem);
				6560	ctxt->extSubSystem = NULL;
				6561	if (ctxt->myDoc != NULL)
				6562	xmlFreeDoc(ctxt->myDoc);
				6563	ctxt->myDoc = NULL;
				6564
				6565	ctxt->standalone = -1;
				6566	ctxt->hasExternalSubset = 0;
				6567	ctxt->hasPErefs = 0;
				6568	ctxt->html = 1;
				6569	ctxt->external = 0;
				6570	ctxt->instate = XML_PARSER_START;
				6571	ctxt->token = 0;
				6572
				6573	ctxt->wellFormed = 1;
				6574	ctxt->nsWellFormed = 1;
				6575	ctxt->disableSAX = 0;
				6576	ctxt->valid = 1;
				6577	ctxt->vctxt.userData = ctxt;
				6578	ctxt->vctxt.error = xmlParserValidityError;
				6579	ctxt->vctxt.warning = xmlParserValidityWarning;
				6580	ctxt->record_info = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6581	ctxt->checkIndex = 0;
				6582	ctxt->inSubset = 0;
				6583	ctxt->errNo = XML_ERR_OK;
				6584	ctxt->depth = 0;
				6585	ctxt->charset = XML_CHAR_ENCODING_NONE;
				6586	ctxt->catalogs = NULL;
				6587	xmlInitNodeInfoSeq(&ctxt->node_seq);
				6588
				6589	if (ctxt->attsDefault != NULL) {
				6590	xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
				6591	ctxt->attsDefault = NULL;
				6592	}
				6593	if (ctxt->attsSpecial != NULL) {
				6594	xmlHashFree(ctxt->attsSpecial, NULL);
				6595	ctxt->attsSpecial = NULL;
				6596	}
				6597	}
				6598
				6599	/**
				6600	* htmlCtxtUseOptions:
				6601	* @ctxt: an HTML parser context
				6602	* @options: a combination of htmlParserOption(s)
				6603	*
				6604	* Applies the options to the parser context
				6605	*
				6606	* Returns 0 in case of success, the set of unknown or unimplemented options
				6607	* in case of error.
				6608	*/
				6609	int
				6610	htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
				6611	{
				6612	if (ctxt == NULL)
				6613	return(-1);
				6614
				6615	if (options & HTML_PARSE_NOWARNING) {
				6616	ctxt->sax->warning = NULL;
				6617	ctxt->vctxt.warning = NULL;
				6618	options -= XML_PARSE_NOWARNING;
				6619	ctxt->options \|= XML_PARSE_NOWARNING;
				6620	}
				6621	if (options & HTML_PARSE_NOERROR) {
				6622	ctxt->sax->error = NULL;
				6623	ctxt->vctxt.error = NULL;
				6624	ctxt->sax->fatalError = NULL;
				6625	options -= XML_PARSE_NOERROR;
				6626	ctxt->options \|= XML_PARSE_NOERROR;
				6627	}
				6628	if (options & HTML_PARSE_PEDANTIC) {
				6629	ctxt->pedantic = 1;
				6630	options -= XML_PARSE_PEDANTIC;
				6631	ctxt->options \|= XML_PARSE_PEDANTIC;
				6632	} else
				6633	ctxt->pedantic = 0;
				6634	if (options & XML_PARSE_NOBLANKS) {
				6635	ctxt->keepBlanks = 0;
				6636	ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
				6637	options -= XML_PARSE_NOBLANKS;
				6638	ctxt->options \|= XML_PARSE_NOBLANKS;
				6639	} else
				6640	ctxt->keepBlanks = 1;
				6641	if (options & HTML_PARSE_RECOVER) {
				6642	ctxt->recovery = 1;
				6643	options -= HTML_PARSE_RECOVER;
				6644	} else
				6645	ctxt->recovery = 0;
				6646	if (options & HTML_PARSE_COMPACT) {
				6647	ctxt->options \|= HTML_PARSE_COMPACT;
				6648	options -= HTML_PARSE_COMPACT;
				6649	}
				6650	if (options & XML_PARSE_HUGE) {
				6651	ctxt->options \|= XML_PARSE_HUGE;
				6652	options -= XML_PARSE_HUGE;
				6653	}
				6654	if (options & HTML_PARSE_NODEFDTD) {
				6655	ctxt->options \|= HTML_PARSE_NODEFDTD;
				6656	options -= HTML_PARSE_NODEFDTD;
				6657	}
				6658	if (options & HTML_PARSE_IGNORE_ENC) {
				6659	ctxt->options \|= HTML_PARSE_IGNORE_ENC;
				6660	options -= HTML_PARSE_IGNORE_ENC;
				6661	}
				6662	if (options & HTML_PARSE_NOIMPLIED) {
				6663	ctxt->options \|= HTML_PARSE_NOIMPLIED;
				6664	options -= HTML_PARSE_NOIMPLIED;
				6665	}
				6666	ctxt->dictNames = 0;
				6667	return (options);
				6668	}
				6669
				6670	/**
				6671	* htmlDoRead:
				6672	* @ctxt: an HTML parser context
				6673	* @URL: the base URL to use for the document
				6674	* @encoding: the document encoding, or NULL
				6675	* @options: a combination of htmlParserOption(s)
				6676	* @reuse: keep the context for reuse
				6677	*
				6678	* Common front-end for the htmlRead functions
				6679	*
				6680	* Returns the resulting document tree or NULL
				6681	*/
				6682	static htmlDocPtr
				6683	htmlDoRead(htmlParserCtxtPtr ctxt, const char URL, const char encoding,
				6684	int options, int reuse)
				6685	{
				6686	htmlDocPtr ret;
				6687
				6688	htmlCtxtUseOptions(ctxt, options);
				6689	ctxt->html = 1;
				6690	if (encoding != NULL) {
				6691	xmlCharEncodingHandlerPtr hdlr;
				6692
				6693	hdlr = xmlFindCharEncodingHandler(encoding);
				6694	if (hdlr != NULL) {
				6695	xmlSwitchToEncoding(ctxt, hdlr);
				6696	if (ctxt->input->encoding != NULL)
				6697	xmlFree((xmlChar *) ctxt->input->encoding);
				6698	ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
				6699	}
				6700	}
				6701	if ((URL != NULL) && (ctxt->input != NULL) &&
				6702	(ctxt->input->filename == NULL))
				6703	ctxt->input->filename = (char ) xmlStrdup((const xmlChar ) URL);
				6704	htmlParseDocument(ctxt);
				6705	ret = ctxt->myDoc;
				6706	ctxt->myDoc = NULL;
				6707	if (!reuse) {
				6708	if ((ctxt->dictNames) &&
				6709	(ret != NULL) &&
				6710	(ret->dict == ctxt->dict))
				6711	ctxt->dict = NULL;
				6712	xmlFreeParserCtxt(ctxt);
				6713	}
				6714	return (ret);
				6715	}
				6716
				6717	/**
				6718	* htmlReadDoc:
				6719	* @cur: a pointer to a zero terminated string
				6720	* @URL: the base URL to use for the document
				6721	* @encoding: the document encoding, or NULL
				6722	* @options: a combination of htmlParserOption(s)
				6723	*
				6724	* parse an XML in-memory document and build a tree.
				6725	*
				6726	* Returns the resulting document tree
				6727	*/
				6728	htmlDocPtr
				6729	htmlReadDoc(const xmlChar * cur, const char URL, const char encoding, int options)
				6730	{
				6731	htmlParserCtxtPtr ctxt;
				6732
				6733	if (cur == NULL)
				6734	return (NULL);
				6735
				6736	xmlInitParser();
				6737	ctxt = htmlCreateDocParserCtxt(cur, NULL);
				6738	if (ctxt == NULL)
				6739	return (NULL);
				6740	return (htmlDoRead(ctxt, URL, encoding, options, 0));
				6741	}
				6742
				6743	/**
				6744	* htmlReadFile:
				6745	* @filename: a file or URL
				6746	* @encoding: the document encoding, or NULL
				6747	* @options: a combination of htmlParserOption(s)
				6748	*
				6749	* parse an XML file from the filesystem or the network.
				6750	*
				6751	* Returns the resulting document tree
				6752	*/
				6753	htmlDocPtr
				6754	htmlReadFile(const char filename, const char encoding, int options)
				6755	{
				6756	htmlParserCtxtPtr ctxt;
				6757
				6758	xmlInitParser();
				6759	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				6760	if (ctxt == NULL)
				6761	return (NULL);
				6762	return (htmlDoRead(ctxt, NULL, NULL, options, 0));
				6763	}
				6764
				6765	/**
				6766	* htmlReadMemory:
				6767	* @buffer: a pointer to a char array
				6768	* @size: the size of the array
				6769	* @URL: the base URL to use for the document
				6770	* @encoding: the document encoding, or NULL
				6771	* @options: a combination of htmlParserOption(s)
				6772	*
				6773	* parse an XML in-memory document and build a tree.
				6774	*
				6775	* Returns the resulting document tree
				6776	*/
				6777	htmlDocPtr
				6778	htmlReadMemory(const char buffer, int size, const char URL, const char *encoding, int options)
				6779	{
				6780	htmlParserCtxtPtr ctxt;
				6781
				6782	xmlInitParser();
				6783	ctxt = xmlCreateMemoryParserCtxt(buffer, size);
				6784	if (ctxt == NULL)
				6785	return (NULL);
				6786	htmlDefaultSAXHandlerInit();
				6787	if (ctxt->sax != NULL)
				6788	memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
				6789	return (htmlDoRead(ctxt, URL, encoding, options, 0));
				6790	}
				6791
				6792	/**
				6793	* htmlReadFd:
				6794	* @fd: an open file descriptor
				6795	* @URL: the base URL to use for the document
				6796	* @encoding: the document encoding, or NULL
				6797	* @options: a combination of htmlParserOption(s)
				6798	*
				6799	* parse an XML from a file descriptor and build a tree.
				6800	*
				6801	* Returns the resulting document tree
				6802	*/
				6803	htmlDocPtr
				6804	htmlReadFd(int fd, const char URL, const char encoding, int options)
				6805	{
				6806	htmlParserCtxtPtr ctxt;
				6807	xmlParserInputBufferPtr input;
				6808	xmlParserInputPtr stream;
				6809
				6810	if (fd < 0)
				6811	return (NULL);
				6812	xmlInitParser();
				6813
				6814	xmlInitParser();
				6815	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
				6816	if (input == NULL)
				6817	return (NULL);
				6818	ctxt = xmlNewParserCtxt();
				6819	if (ctxt == NULL) {
				6820	xmlFreeParserInputBuffer(input);
				6821	return (NULL);
				6822	}
				6823	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				6824	if (stream == NULL) {
				6825	xmlFreeParserInputBuffer(input);
				6826	xmlFreeParserCtxt(ctxt);
				6827	return (NULL);
				6828	}
				6829	inputPush(ctxt, stream);
				6830	return (htmlDoRead(ctxt, URL, encoding, options, 0));
				6831	}
				6832
				6833	/**
				6834	* htmlReadIO:
				6835	* @ioread: an I/O read function
				6836	* @ioclose: an I/O close function
				6837	* @ioctx: an I/O handler
				6838	* @URL: the base URL to use for the document
				6839	* @encoding: the document encoding, or NULL
				6840	* @options: a combination of htmlParserOption(s)
				6841	*
				6842	* parse an HTML document from I/O functions and source and build a tree.
				6843	*
				6844	* Returns the resulting document tree
				6845	*/
				6846	htmlDocPtr
				6847	htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
				6848	void ioctx, const char URL, const char *encoding, int options)
				6849	{
				6850	htmlParserCtxtPtr ctxt;
				6851	xmlParserInputBufferPtr input;
				6852	xmlParserInputPtr stream;
				6853
				6854	if (ioread == NULL)
				6855	return (NULL);
				6856	xmlInitParser();
				6857
				6858	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
				6859	XML_CHAR_ENCODING_NONE);
				6860	if (input == NULL) {
				6861	if (ioclose != NULL)
				6862	ioclose(ioctx);
				6863	return (NULL);
				6864	}
				6865	ctxt = htmlNewParserCtxt();
				6866	if (ctxt == NULL) {
				6867	xmlFreeParserInputBuffer(input);
				6868	return (NULL);
				6869	}
				6870	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				6871	if (stream == NULL) {
				6872	xmlFreeParserInputBuffer(input);
				6873	xmlFreeParserCtxt(ctxt);
				6874	return (NULL);
				6875	}
				6876	inputPush(ctxt, stream);
				6877	return (htmlDoRead(ctxt, URL, encoding, options, 0));
				6878	}
				6879
				6880	/**
				6881	* htmlCtxtReadDoc:
				6882	* @ctxt: an HTML parser context
				6883	* @cur: a pointer to a zero terminated string
				6884	* @URL: the base URL to use for the document
				6885	* @encoding: the document encoding, or NULL
				6886	* @options: a combination of htmlParserOption(s)
				6887	*
				6888	* parse an XML in-memory document and build a tree.
				6889	* This reuses the existing @ctxt parser context
				6890	*
				6891	* Returns the resulting document tree
				6892	*/
				6893	htmlDocPtr
				6894	htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
				6895	const char URL, const char encoding, int options)
				6896	{
				6897	xmlParserInputPtr stream;
				6898
				6899	if (cur == NULL)
				6900	return (NULL);
				6901	if (ctxt == NULL)
				6902	return (NULL);
				6903	xmlInitParser();
				6904
				6905	htmlCtxtReset(ctxt);
				6906
				6907	stream = xmlNewStringInputStream(ctxt, cur);
				6908	if (stream == NULL) {
				6909	return (NULL);
				6910	}
				6911	inputPush(ctxt, stream);
				6912	return (htmlDoRead(ctxt, URL, encoding, options, 1));
				6913	}
				6914
				6915	/**
				6916	* htmlCtxtReadFile:
				6917	* @ctxt: an HTML parser context
				6918	* @filename: a file or URL
				6919	* @encoding: the document encoding, or NULL
				6920	* @options: a combination of htmlParserOption(s)
				6921	*
				6922	* parse an XML file from the filesystem or the network.
				6923	* This reuses the existing @ctxt parser context
				6924	*
				6925	* Returns the resulting document tree
				6926	*/
				6927	htmlDocPtr
				6928	htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
				6929	const char *encoding, int options)
				6930	{
				6931	xmlParserInputPtr stream;
				6932
				6933	if (filename == NULL)
				6934	return (NULL);
				6935	if (ctxt == NULL)
				6936	return (NULL);
				6937	xmlInitParser();
				6938
				6939	htmlCtxtReset(ctxt);
				6940
				6941	stream = xmlLoadExternalEntity(filename, NULL, ctxt);
				6942	if (stream == NULL) {
				6943	return (NULL);
				6944	}
				6945	inputPush(ctxt, stream);
				6946	return (htmlDoRead(ctxt, NULL, encoding, options, 1));
				6947	}
				6948
				6949	/**
				6950	* htmlCtxtReadMemory:
				6951	* @ctxt: an HTML parser context
				6952	* @buffer: a pointer to a char array
				6953	* @size: the size of the array
				6954	* @URL: the base URL to use for the document
				6955	* @encoding: the document encoding, or NULL
				6956	* @options: a combination of htmlParserOption(s)
				6957	*
				6958	* parse an XML in-memory document and build a tree.
				6959	* This reuses the existing @ctxt parser context
				6960	*
				6961	* Returns the resulting document tree
				6962	*/
				6963	htmlDocPtr
				6964	htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
				6965	const char URL, const char encoding, int options)
				6966	{
				6967	xmlParserInputBufferPtr input;
				6968	xmlParserInputPtr stream;
				6969
				6970	if (ctxt == NULL)
				6971	return (NULL);
				6972	if (buffer == NULL)
				6973	return (NULL);
				6974	xmlInitParser();
				6975
				6976	htmlCtxtReset(ctxt);
				6977
				6978	input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
				6979	if (input == NULL) {
				6980	return(NULL);
				6981	}
				6982
				6983	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				6984	if (stream == NULL) {
				6985	xmlFreeParserInputBuffer(input);
				6986	return(NULL);
				6987	}
				6988
				6989	inputPush(ctxt, stream);
				6990	return (htmlDoRead(ctxt, URL, encoding, options, 1));
				6991	}
				6992
				6993	/**
				6994	* htmlCtxtReadFd:
				6995	* @ctxt: an HTML parser context
				6996	* @fd: an open file descriptor
				6997	* @URL: the base URL to use for the document
				6998	* @encoding: the document encoding, or NULL
				6999	* @options: a combination of htmlParserOption(s)
				7000	*
				7001	* parse an XML from a file descriptor and build a tree.
				7002	* This reuses the existing @ctxt parser context
				7003	*
				7004	* Returns the resulting document tree
				7005	*/
				7006	htmlDocPtr
				7007	htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
				7008	const char URL, const char encoding, int options)
				7009	{
				7010	xmlParserInputBufferPtr input;
				7011	xmlParserInputPtr stream;
				7012
				7013	if (fd < 0)
				7014	return (NULL);
				7015	if (ctxt == NULL)
				7016	return (NULL);
				7017	xmlInitParser();
				7018
				7019	htmlCtxtReset(ctxt);
				7020
				7021
				7022	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
				7023	if (input == NULL)
				7024	return (NULL);
				7025	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				7026	if (stream == NULL) {
				7027	xmlFreeParserInputBuffer(input);
				7028	return (NULL);
				7029	}
				7030	inputPush(ctxt, stream);
				7031	return (htmlDoRead(ctxt, URL, encoding, options, 1));
				7032	}
				7033
				7034	/**
				7035	* htmlCtxtReadIO:
				7036	* @ctxt: an HTML parser context
				7037	* @ioread: an I/O read function
				7038	* @ioclose: an I/O close function
				7039	* @ioctx: an I/O handler
				7040	* @URL: the base URL to use for the document
				7041	* @encoding: the document encoding, or NULL
				7042	* @options: a combination of htmlParserOption(s)
				7043	*
				7044	* parse an HTML document from I/O functions and source and build a tree.
				7045	* This reuses the existing @ctxt parser context
				7046	*
				7047	* Returns the resulting document tree
				7048	*/
				7049	htmlDocPtr
				7050	htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
				7051	xmlInputCloseCallback ioclose, void *ioctx,
				7052	const char *URL,
				7053	const char *encoding, int options)
				7054	{
				7055	xmlParserInputBufferPtr input;
				7056	xmlParserInputPtr stream;
				7057
				7058	if (ioread == NULL)
				7059	return (NULL);
				7060	if (ctxt == NULL)
				7061	return (NULL);
				7062	xmlInitParser();
				7063
				7064	htmlCtxtReset(ctxt);
				7065
				7066	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
				7067	XML_CHAR_ENCODING_NONE);
				7068	if (input == NULL) {
				7069	if (ioclose != NULL)
				7070	ioclose(ioctx);
				7071	return (NULL);
				7072	}
				7073	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				7074	if (stream == NULL) {
				7075	xmlFreeParserInputBuffer(input);
				7076	return (NULL);
				7077	}
				7078	inputPush(ctxt, stream);
				7079	return (htmlDoRead(ctxt, URL, encoding, options, 1));
				7080	}
				7081
				7082	#define bottom_HTMLparser
				7083	#include "elfgcchack.h"
				7084	#endif /* LIBXML_HTML_ENABLED */