Blame - HTMLparser.c - platform/external/libxml2

blob: b981298544a6c671344759e0321d27113c2e1548 [file] [log] [blame]

Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
				6	* daniel@veillard.com
				7	*/
				8
				9	#define IN_LIBXML
				10	#include "libxml.h"
				11	#ifdef LIBXML_HTML_ENABLED
				12
				13	#include <string.h>
				14	#ifdef HAVE_CTYPE_H
				15	#include <ctype.h>
				16	#endif
				17	#ifdef HAVE_STDLIB_H
				18	#include <stdlib.h>
				19	#endif
				20	#ifdef HAVE_SYS_STAT_H
				21	#include <sys/stat.h>
				22	#endif
				23	#ifdef HAVE_FCNTL_H
				24	#include <fcntl.h>
				25	#endif
				26	#ifdef HAVE_UNISTD_H
				27	#include <unistd.h>
				28	#endif
				29	#ifdef LIBXML_ZLIB_ENABLED
				30	#include <zlib.h>
				31	#endif
				32
				33	#include <libxml/xmlmemory.h>
				34	#include <libxml/tree.h>
				35	#include <libxml/parser.h>
				36	#include <libxml/parserInternals.h>
				37	#include <libxml/xmlerror.h>
				38	#include <libxml/HTMLparser.h>
				39	#include <libxml/HTMLtree.h>
				40	#include <libxml/entities.h>
				41	#include <libxml/encoding.h>
				42	#include <libxml/valid.h>
				43	#include <libxml/xmlIO.h>
				44	#include <libxml/globals.h>
				45	#include <libxml/uri.h>
				46
				47	#include "buf.h"
				48	#include "enc.h"
				49
				50	#define HTML_MAX_NAMELEN 1000
				51	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				52	#define HTML_PARSER_BUFFER_SIZE 100
				53
				54	/* #define DEBUG */
				55	/* #define DEBUG_PUSH */
				56
				57	static int htmlOmittedDefaultValue = 1;
				58
				59	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				60	xmlChar end, xmlChar end2, xmlChar end3);
				61	static void htmlParseComment(htmlParserCtxtPtr ctxt);
				62
				63	/************************************************************************
				64	* *
				65	* Some factorized error routines *
				66	* *
				67	************************************************************************/
				68
				69	/**
				70	* htmlErrMemory:
				71	* @ctxt: an HTML parser context
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	72	* @extra: extra information
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	73	*
				74	* Handle a redefinition of attribute error
				75	*/
				76	static void
				77	htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
				78	{
				79	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
				80	(ctxt->instate == XML_PARSER_EOF))
				81	return;
				82	if (ctxt != NULL) {
				83	ctxt->errNo = XML_ERR_NO_MEMORY;
				84	ctxt->instate = XML_PARSER_EOF;
				85	ctxt->disableSAX = 1;
				86	}
				87	if (extra)
				88	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
				89	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
				90	NULL, NULL, 0, 0,
				91	"Memory allocation failed : %s\n", extra);
				92	else
				93	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
				94	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
				95	NULL, NULL, 0, 0, "Memory allocation failed\n");
				96	}
				97
				98	/**
				99	* htmlParseErr:
				100	* @ctxt: an HTML parser context
				101	* @error: the error number
				102	* @msg: the error message
				103	* @str1: string infor
				104	* @str2: string infor
				105	*
				106	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
				107	*/
				108	static void LIBXML_ATTR_FORMAT(3,0)
				109	htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
				110	const char msg, const xmlChar str1, const xmlChar *str2)
				111	{
				112	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
				113	(ctxt->instate == XML_PARSER_EOF))
				114	return;
				115	if (ctxt != NULL)
				116	ctxt->errNo = error;
				117	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
				118	XML_ERR_ERROR, NULL, 0,
				119	(const char ) str1, (const char ) str2,
				120	NULL, 0, 0,
				121	msg, str1, str2);
				122	if (ctxt != NULL)
				123	ctxt->wellFormed = 0;
				124	}
				125
				126	/**
				127	* htmlParseErrInt:
				128	* @ctxt: an HTML parser context
				129	* @error: the error number
				130	* @msg: the error message
				131	* @val: integer info
				132	*
				133	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
				134	*/
				135	static void LIBXML_ATTR_FORMAT(3,0)
				136	htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
				137	const char *msg, int val)
				138	{
				139	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
				140	(ctxt->instate == XML_PARSER_EOF))
				141	return;
				142	if (ctxt != NULL)
				143	ctxt->errNo = error;
				144	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
				145	XML_ERR_ERROR, NULL, 0, NULL, NULL,
				146	NULL, val, 0, msg, val);
				147	if (ctxt != NULL)
				148	ctxt->wellFormed = 0;
				149	}
				150
				151	/************************************************************************
				152	* *
				153	* Parser stacks related functions and macros *
				154	* *
				155	************************************************************************/
				156
				157	/**
				158	* htmlnamePush:
				159	* @ctxt: an HTML parser context
				160	* @value: the element name
				161	*
				162	* Pushes a new element name on top of the name stack
				163	*
				164	* Returns 0 in case of error, the index in the stack otherwise
				165	*/
				166	static int
				167	htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
				168	{
				169	if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
				170	ctxt->html = 3;
				171	if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
				172	ctxt->html = 10;
				173	if (ctxt->nameNr >= ctxt->nameMax) {
				174	ctxt->nameMax *= 2;
				175	ctxt->nameTab = (const xmlChar * *)
				176	xmlRealloc((xmlChar * *)ctxt->nameTab,
				177	ctxt->nameMax *
				178	sizeof(ctxt->nameTab[0]));
				179	if (ctxt->nameTab == NULL) {
				180	htmlErrMemory(ctxt, NULL);
				181	return (0);
				182	}
				183	}
				184	ctxt->nameTab[ctxt->nameNr] = value;
				185	ctxt->name = value;
				186	return (ctxt->nameNr++);
				187	}
				188	/**
				189	* htmlnamePop:
				190	* @ctxt: an HTML parser context
				191	*
				192	* Pops the top element name from the name stack
				193	*
				194	* Returns the name just removed
				195	*/
				196	static const xmlChar *
				197	htmlnamePop(htmlParserCtxtPtr ctxt)
				198	{
				199	const xmlChar *ret;
				200
				201	if (ctxt->nameNr <= 0)
				202	return (NULL);
				203	ctxt->nameNr--;
				204	if (ctxt->nameNr < 0)
				205	return (NULL);
				206	if (ctxt->nameNr > 0)
				207	ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
				208	else
				209	ctxt->name = NULL;
				210	ret = ctxt->nameTab[ctxt->nameNr];
				211	ctxt->nameTab[ctxt->nameNr] = NULL;
				212	return (ret);
				213	}
				214
				215	/**
				216	* htmlNodeInfoPush:
				217	* @ctxt: an HTML parser context
				218	* @value: the node info
				219	*
				220	* Pushes a new element name on top of the node info stack
				221	*
				222	* Returns 0 in case of error, the index in the stack otherwise
				223	*/
				224	static int
				225	htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
				226	{
				227	if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
				228	if (ctxt->nodeInfoMax == 0)
				229	ctxt->nodeInfoMax = 5;
				230	ctxt->nodeInfoMax *= 2;
				231	ctxt->nodeInfoTab = (htmlParserNodeInfo *)
				232	xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
				233	ctxt->nodeInfoMax *
				234	sizeof(ctxt->nodeInfoTab[0]));
				235	if (ctxt->nodeInfoTab == NULL) {
				236	htmlErrMemory(ctxt, NULL);
				237	return (0);
				238	}
				239	}
				240	ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
				241	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
				242	return (ctxt->nodeInfoNr++);
				243	}
				244
				245	/**
				246	* htmlNodeInfoPop:
				247	* @ctxt: an HTML parser context
				248	*
				249	* Pops the top element name from the node info stack
				250	*
				251	* Returns 0 in case of error, the pointer to NodeInfo otherwise
				252	*/
				253	static htmlParserNodeInfo *
				254	htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
				255	{
				256	if (ctxt->nodeInfoNr <= 0)
				257	return (NULL);
				258	ctxt->nodeInfoNr--;
				259	if (ctxt->nodeInfoNr < 0)
				260	return (NULL);
				261	if (ctxt->nodeInfoNr > 0)
				262	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
				263	else
				264	ctxt->nodeInfo = NULL;
				265	return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
				266	}
				267
				/*
				 * Macros for accessing the content. Those should be used only by the parser,
				 * and not exported. All of them expect a variable named `ctxt` in scope.
				 *
				 * Dirty macros, i.e. one needs to make assumptions on the context to use them
				 *
				 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
				 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
				 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				 *           in UNICODE mode. This should be used internally by the parser
				 *           only to compare to ASCII values otherwise it would break when
				 *           running with UTF-8 encoding.
				 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
				 *           only to compare on ASCII based substring.
				 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
				 *           it should be used only to compare on ASCII based substring.
				 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				 *           strings without newlines within the parser.
				 *
				 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
				 *
				 *   CURRENT Returns the current char value, with the full decoding of
				 *           UTF-8 if we are using this mode. It returns an int.
				 *   NEXT    Skip to the next character, this does the proper decoding
				 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
				 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
				 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				 */

				#define UPPER (toupper(*ctxt->input->cur))

				/* Advances the cursor, the column and the character count together. */
				#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), \
				                   ctxt->input->col += (val))

				#define NXT(val) (ctxt->input->cur[(val)])

				#define UPP(val) (toupper(ctxt->input->cur[(val)]))

				#define CUR_PTR ctxt->input->cur
				#define BASE_PTR ctxt->input->base

				/* SHRINK and GROW expand to a bare `if`; use them as `SHRINK;` / `GROW;`. */
				#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
				                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
				        xmlParserInputShrink(ctxt->input)

				#define GROW if ((ctxt->progressive == 0) && \
				                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
				        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

				#define CURRENT ((int) (*ctxt->input->cur))

				#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

				/* Imported from XML */

				/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
				#define CUR ((int) (*ctxt->input->cur))
				#define NEXT xmlNextChar(ctxt)

				#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


				/*
				 * Steps over one character of l bytes: updates line/col from the byte
				 * under the cursor, clears the pending token and counts one character.
				 */
				#define NEXTL(l) do { \
				        if (*(ctxt->input->cur) == '\n') { \
				            ctxt->input->line++; ctxt->input->col = 1; \
				        } else ctxt->input->col++; \
				        ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++; \
				    } while (0)

				/************
				    \
				    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				 ************/

				#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
				#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

				/*
				 * Appends character v (encoded on l bytes) to buffer b at index i and
				 * advances i. Wrapped in do/while so it behaves as a single statement.
				 */
				#define COPY_BUF(l,b,i,v) do { \
				        if ((l) == 1) (b)[(i)++] = (xmlChar) (v); \
				        else (i) += xmlCopyChar((l), &(b)[(i)], (v)); \
				    } while (0)
				348
				349	/**
				350	* htmlFindEncoding:
				351	* @the HTML parser context
				352	*
				353	* Ty to find and encoding in the current data available in the input
				354	* buffer this is needed to try to switch to the proper encoding when
				355	* one face a character error.
				356	* That's an heuristic, since it's operating outside of parsing it could
				357	* try to use a meta which had been commented out, that's the reason it
				358	* should only be used in case of error, not as a default.
				359	*
				360	* Returns an encoding string or NULL if not found, the string need to
				361	* be freed
				362	*/
				363	static xmlChar *
				364	htmlFindEncoding(xmlParserCtxtPtr ctxt) {
				365	const xmlChar start, cur, *end;
				366
				367	if ((ctxt == NULL) \|\| (ctxt->input == NULL) \|\|
				368	(ctxt->input->encoding != NULL) \|\| (ctxt->input->buf == NULL) \|\|
				369	(ctxt->input->buf->encoder != NULL))
				370	return(NULL);
				371	if ((ctxt->input->cur == NULL) \|\| (ctxt->input->end == NULL))
				372	return(NULL);
				373
				374	start = ctxt->input->cur;
				375	end = ctxt->input->end;
				376	/* we also expect the input buffer to be zero terminated */
				377	if (*end != 0)
				378	return(NULL);
				379
				380	cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
				381	if (cur == NULL)
				382	return(NULL);
				383	cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
				384	if (cur == NULL)
				385	return(NULL);
				386	cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
				387	if (cur == NULL)
				388	return(NULL);
				389	cur += 8;
				390	start = cur;
				391	while (((cur >= 'A') && (cur <= 'Z')) \|\|
				392	((cur >= 'a') && (cur <= 'z')) \|\|
				393	((cur >= '0') && (cur <= '9')) \|\|
				394	(cur == '-') \|\| (cur == '_') \|\| (cur == ':') \|\| (cur == '/'))
				395	cur++;
				396	if (cur == start)
				397	return(NULL);
				398	return(xmlStrndup(start, cur - start));
				399	}
				400
				401	/**
				402	* htmlCurrentChar:
				403	* @ctxt: the HTML parser context
				404	* @len: pointer to the length of the char read
				405	*
				406	* The current char value, if using UTF-8 this may actually span multiple
				407	* bytes in the input buffer. Implement the end of line normalization:
				408	* 2.11 End-of-Line Handling
				409	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				410	* char, then the encoding converter is plugged in automatically.
				411	*
				412	* Returns the current char value and its length
				413	*/
				414
				415	static int
				416	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	417	const unsigned char *cur;
				418	unsigned char c;
				419	unsigned int val;
				420
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	421	if (ctxt->instate == XML_PARSER_EOF)
				422	return(0);
				423
				424	if (ctxt->token != 0) {
				425	*len = 0;
				426	return(ctxt->token);
				427	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	428	if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	429	xmlChar * guess;
				430	xmlCharEncodingHandlerPtr handler;
				431
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	432	/*
				433	* Assume it's a fixed length encoding (1) with
				434	* a compatible encoding for the ASCII set, since
				435	* HTML constructs only use < 128 chars
				436	*/
				437	if ((int) *ctxt->input->cur < 0x80) {
				438	*len = 1;
				439	if ((*ctxt->input->cur == 0) &&
				440	(ctxt->input->cur < ctxt->input->end)) {
				441	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				442	"Char 0x%X out of allowed range\n", 0);
				443	return(' ');
				444	}
				445	return((int) *ctxt->input->cur);
				446	}
				447
				448	/*
				449	* Humm this is bad, do an automatic flow conversion
				450	*/
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	451	guess = htmlFindEncoding(ctxt);
				452	if (guess == NULL) {
				453	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				454	} else {
				455	if (ctxt->input->encoding != NULL)
				456	xmlFree((xmlChar *) ctxt->input->encoding);
				457	ctxt->input->encoding = guess;
				458	handler = xmlFindCharEncodingHandler((const char *) guess);
				459	if (handler != NULL) {
				460	xmlSwitchToEncoding(ctxt, handler);
				461	} else {
				462	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				463	"Unsupported encoding %s", guess, NULL);
				464	}
				465	}
				466	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				467	}
				468
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	469	/*
				470	* We are supposed to handle UTF8, check it's valid
				471	* From rfc2044: encoding of the Unicode values on UTF-8:
				472	*
				473	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				474	* 0000 0000-0000 007F 0xxxxxxx
				475	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				476	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				477	*
				478	* Check for the 0x110000 limit too
				479	*/
				480	cur = ctxt->input->cur;
				481	c = *cur;
				482	if (c & 0x80) {
				483	if ((c & 0x40) == 0)
				484	goto encoding_error;
				485	if (cur[1] == 0) {
				486	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				487	cur = ctxt->input->cur;
				488	}
				489	if ((cur[1] & 0xc0) != 0x80)
				490	goto encoding_error;
				491	if ((c & 0xe0) == 0xe0) {
				492
				493	if (cur[2] == 0) {
				494	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				495	cur = ctxt->input->cur;
				496	}
				497	if ((cur[2] & 0xc0) != 0x80)
				498	goto encoding_error;
				499	if ((c & 0xf0) == 0xf0) {
				500	if (cur[3] == 0) {
				501	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				502	cur = ctxt->input->cur;
				503	}
				504	if (((c & 0xf8) != 0xf0) \|\|
				505	((cur[3] & 0xc0) != 0x80))
				506	goto encoding_error;
				507	/* 4-byte code */
				508	*len = 4;
				509	val = (cur[0] & 0x7) << 18;
				510	val \|= (cur[1] & 0x3f) << 12;
				511	val \|= (cur[2] & 0x3f) << 6;
				512	val \|= cur[3] & 0x3f;
				513	if (val < 0x10000)
				514	goto encoding_error;
				515	} else {
				516	/* 3-byte code */
				517	*len = 3;
				518	val = (cur[0] & 0xf) << 12;
				519	val \|= (cur[1] & 0x3f) << 6;
				520	val \|= cur[2] & 0x3f;
				521	if (val < 0x800)
				522	goto encoding_error;
				523	}
				524	} else {
				525	/* 2-byte code */
				526	*len = 2;
				527	val = (cur[0] & 0x1f) << 6;
				528	val \|= cur[1] & 0x3f;
				529	if (val < 0x80)
				530	goto encoding_error;
				531	}
				532	if (!IS_CHAR(val)) {
				533	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				534	"Char 0x%X out of allowed range\n", val);
				535	}
				536	return(val);
				537	} else {
				538	if ((*ctxt->input->cur == 0) &&
				539	(ctxt->input->cur < ctxt->input->end)) {
				540	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				541	"Char 0x%X out of allowed range\n", 0);
				542	*len = 1;
				543	return(' ');
				544	}
				545	/* 1-byte code */
				546	*len = 1;
				547	return((int) *ctxt->input->cur);
				548	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	549
				550	encoding_error:
				551	/*
				552	* If we detect an UTF8 error that probably mean that the
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	553	* input encoding didn't get properly advertised in the
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	554	* declaration header. Report the error and switch the encoding
				555	* to ISO-Latin-1 (if you don't like this policy, just declare the
				556	* encoding !)
				557	*/
				558	{
				559	char buffer[150];
				560
				561	if (ctxt->input->end - ctxt->input->cur >= 4) {
				562	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				563	ctxt->input->cur[0], ctxt->input->cur[1],
				564	ctxt->input->cur[2], ctxt->input->cur[3]);
				565	} else {
				566	snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
				567	}
				568	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				569	"Input is not proper UTF-8, indicate encoding !\n",
				570	BAD_CAST buffer, NULL);
				571	}
				572
				573	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				574	*len = 1;
				575	return((int) *ctxt->input->cur);
				576	}
				577
				578	/**
				579	* htmlSkipBlankChars:
				580	* @ctxt: the HTML parser context
				581	*
				582	* skip all blanks character found at that point in the input streams.
				583	*
				584	* Returns the number of space chars skipped
				585	*/
				586
				587	static int
				588	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				589	int res = 0;
				590
				591	while (IS_BLANK_CH(*(ctxt->input->cur))) {
				592	if ((*ctxt->input->cur == 0) &&
				593	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				594	xmlPopInput(ctxt);
				595	} else {
				596	if (*(ctxt->input->cur) == '\n') {
				597	ctxt->input->line++; ctxt->input->col = 1;
				598	} else ctxt->input->col++;
				599	ctxt->input->cur++;
				600	ctxt->nbChars++;
				601	if (*ctxt->input->cur == 0)
				602	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				603	}
				604	res++;
				605	}
				606	return(res);
				607	}
				608
				609
				610
				611	/************************************************************************
				612	* *
				613	* The list of HTML elements and their properties *
				614	* *
				615	************************************************************************/
				616
				/*
				 *  Start Tag: 1 means the start tag can be omitted
				 *  End Tag:   1 means the end tag can be omitted
				 *             2 means it's forbidden (empty elements)
				 *             3 means the tag is stylistic and should be closed easily
				 *  Depr:      this element is deprecated
				 *  DTD:       1 means that this element is valid only in the Loose DTD
				 *             2 means that this element is valid only in the Frameset DTD
				 *
				 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
				 *      , subElements , impliedsubelt , Attributes, userdata
				 */

				/* Definitions and a couple of vars for HTML Elements */

				/*
				 * Each element group is a comma-separated list of names paired with an
				 * NB_* count of its entries. The composite counts are parenthesized so
				 * they stay correct inside larger expressions.
				 */
				#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
				#define NB_FONTSTYLE 8
				#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
				#define NB_PHRASE 10
				#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
				#define NB_SPECIAL 16
				#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
				#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
				#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
				#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
				#define FORMCTRL "input", "select", "textarea", "label", "button"
				#define NB_FORMCTRL 5
				#define PCDATA
				#define NB_PCDATA 0
				#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
				#define NB_HEADING 6
				#define LIST "ul", "ol", "dir", "menu"
				#define NB_LIST 4
				#define MODIFIER
				#define NB_MODIFIER 0
				#define FLOW BLOCK,INLINE
				#define NB_FLOW (NB_BLOCK + NB_INLINE)
				#define EMPTY NULL


				/* NULL-terminated element name lists. */
				static const char* const html_flow[] = { FLOW, NULL } ;
				static const char* const html_inline[] = { INLINE, NULL } ;

				/* placeholders: elts with content but no subelements */
				static const char* const html_pcdata[] = { NULL } ;
				#define html_cdata html_pcdata
				663
				664
				/* ... and for HTML Attributes */

				/*
				 * Each attribute group is a comma-separated list of names paired with an
				 * NB_* count of its entries.
				 */
				#define COREATTRS "id", "class", "style", "title"
				#define NB_COREATTRS 4
				#define I18N "lang", "dir"
				#define NB_I18N 2
				#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
				#define NB_EVENTS 9
				#define ATTRS COREATTRS,I18N,EVENTS
				/* The count must name NB_COREATTRS; the sum is parenthesized for safe reuse. */
				#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
				#define CELLHALIGN "align", "char", "charoff"
				#define NB_CELLHALIGN 3
				#define CELLVALIGN "valign"
				#define NB_CELLVALIGN 1

				/* NULL-terminated attribute name lists. */
				static const char* const html_attrs[] = { ATTRS, NULL } ;
				static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
				static const char* const core_attrs[] = { COREATTRS, NULL } ;
				static const char* const i18n_attrs[] = { I18N, NULL } ;
				685
				686	/* Other declarations that should go inline ... */
				687	static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
				688	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
				689	"tabindex", "onfocus", "onblur", NULL } ;
				690	static const char* const target_attr[] = { "target", NULL } ;
				691	static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
				692	static const char* const alt_attr[] = { "alt", NULL } ;
				693	static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
				694	static const char* const href_attrs[] = { "href", NULL } ;
				695	static const char* const clear_attrs[] = { "clear", NULL } ;
				696	static const char* const inline_p[] = { INLINE, "p", NULL } ;
				697
				698	static const char* const flow_param[] = { FLOW, "param", NULL } ;
				699	static const char* const applet_attrs[] = { COREATTRS , "codebase",
				700	"archive", "alt", "name", "height", "width", "align",
				701	"hspace", "vspace", NULL } ;
				702	static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
				703	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
				704	static const char* const basefont_attrs[] =
				705	{ "id", "size", "color", "face", NULL } ;
				706	static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
				707	static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
				708	static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
				709	static const char* const body_depr[] = { "background", "bgcolor", "text",
				710	"link", "vlink", "alink", NULL } ;
				711	static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
				712	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
				713
				714
				715	static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
				716	static const char* const col_elt[] = { "col", NULL } ;
				717	static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
				718	static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
				719	static const char* const dl_contents[] = { "dt", "dd", NULL } ;
				720	static const char* const compact_attr[] = { "compact", NULL } ;
				721	static const char* const label_attr[] = { "label", NULL } ;
				722	static const char* const fieldset_contents[] = { FLOW, "legend" } ;
				723	static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
				724	static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
				725	static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
				726	static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
				727	static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
				728	static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
				729	static const char* const head_attrs[] = { I18N, "profile", NULL } ;
				730	static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
				731	static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
				732	static const char* const version_attr[] = { "version", NULL } ;
				733	static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
				734	static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
				735	static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
				736	static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
				737	static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
				738	static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
				739	static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
				740	static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
				741	static const char* const align_attr[] = { "align", NULL } ;
				742	static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
				743	static const char* const map_contents[] = { BLOCK, "area", NULL } ;
				/*
				 * NULL-terminated name lists referenced (through the DECL cast) by
				 * html40ElementTable below. Upper-case identifiers such as ATTRS,
				 * I18N, BLOCK, FLOW, PHRASE, MODIFIER, CELLHALIGN and CELLVALIGN are
				 * macros defined earlier in the file (not visible in this section);
				 * presumably they expand to comma-separated name lists - confirm
				 * against their definitions.
				 */
				744	static const char* const name_attr[] = { "name", NULL } ;
				745	static const char* const action_attr[] = { "action", NULL } ;
				746	static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
				747	static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
				748	static const char* const content_attr[] = { "content", NULL } ;
				749	static const char* const type_attr[] = { "type", NULL } ;
				750	static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
				751	static const char* const object_contents[] = { FLOW, "param", NULL } ;
				752	static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
				753	static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
				754	static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
				755	static const char* const option_elt[] = { "option", NULL } ;
				756	static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
				757	static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
				758	static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
				759	static const char* const width_attr[] = { "width", NULL } ;
				760	static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
				761	static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
				762	static const char* const language_attr[] = { "language", NULL } ;
				763	static const char* const select_content[] = { "optgroup", "option", NULL } ;
				764	static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
				765	static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
				766	static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
				767	static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
				768	static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
				769	static const char* const tr_elt[] = { "tr", NULL } ;
				770	static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
				771	static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
				772	static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
				773	static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
				774	static const char* const tr_contents[] = { "th", "td", NULL } ;
				775	static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
				776	static const char* const li_elt[] = { "li", NULL } ;
				777	static const char* const ul_depr[] = { "type", "compact", NULL} ;
				778	static const char* const dir_attr[] = { "dir", NULL} ;
				779
				/*
				 * DECL casts the "const char* const[]" name lists to "const char**",
				 * dropping the inner const qualifier; presumably this matches the
				 * pointer field types of htmlElemDesc (declared elsewhere) - confirm.
				 */
				780	#define DECL (const char**)
				781
				/*
				 * Descriptor table for the known HTML 4.0 elements, listed in
				 * alphabetical order of tag name. htmlTagLookup() scans it linearly,
				 * comparing names with xmlStrcasecmp().
				 *
				 * NOTE(review): htmlElemDesc is declared outside this section. Per the
				 * libxml2 header the positional fields are presumably: name, startTag,
				 * endTag, saveEndTag, empty, depr, dtd, isinline, desc, and then (on
				 * the second line of each entry) subelts, defaultsubelt, attrs_opt,
				 * attrs_depr, attrs_req - confirm against libxml/HTMLparser.h.
				 * The endTag field is read by htmlAutoCloseOnClose(), which reports a
				 * tag mismatch when it auto-closes an element whose endTag is 3.
				 */
				782	static const htmlElemDesc
				783	html40ElementTable[] = {
				784	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
				785	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
				786	},
				787	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
				788	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				789	},
				790	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
				791	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				792	},
				793	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
				794	DECL inline_p , NULL , DECL html_attrs, NULL, NULL
				795	},
				796	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
				797	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
				798	},
				799	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
				800	EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
				801	},
				802	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
				803	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				804	},
				805	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
				806	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
				807	},
				808	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
				809	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
				810	},
				811	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
				812	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
				813	},
				814	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
				815	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				816	},
				817	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
				818	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
				819	},
				820	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
				821	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
				822	},
				823	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
				824	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
				825	},
				826	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
				827	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
				828	},
				829	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
				830	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				831	},
				832	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
				833	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
				834	},
				835	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
				836	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				837	},
				838	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
				839	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				840	},
				841	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
				842	EMPTY , NULL , DECL col_attrs , NULL, NULL
				843	},
				844	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
				845	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
				846	},
				847	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
				848	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
				849	},
				850	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
				851	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
				852	},
				853	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
				854	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
				855	},
				856	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
				857	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
				858	},
				859	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
				860	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
				861	},
				862	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
				863	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
				864	},
				865	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
				866	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				867	},
				868	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
				869	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				870	},
				871	{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
				872	EMPTY, NULL, DECL embed_attrs, NULL, NULL
				873	},
				874	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
				875	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
				876	},
				877	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
				878	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
				879	},
				880	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
				881	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
				882	},
				883	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
				884	EMPTY, NULL, NULL, DECL frame_attrs, NULL
				885	},
				886	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
				887	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
				888	},
				889	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
				890	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				891	},
				892	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
				893	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				894	},
				895	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
				896	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				897	},
				898	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
				899	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				900	},
				901	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
				902	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				903	},
				904	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
				905	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				906	},
				907	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
				908	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
				909	},
				910	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
				911	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
				912	},
				913	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
				914	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
				915	},
				916	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
				917	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				918	},
				919	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
				920	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
				921	},
				922	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
				923	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
				924	},
				925	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
				926	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
				927	},
				928	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
				929	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
				930	},
				931	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
				932	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
				933	},
				934	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
				935	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				936	},
				937	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
				938	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
				939	},
				940	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
				941	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
				942	},
				943	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
				944	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
				945	},
				946	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
				947	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
				948	},
				949	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
				950	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
				951	},
				952	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
				953	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
				954	},
				955	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
				956	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
				957	},
				958	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
				959	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
				960	},
				961	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
				962	DECL html_flow, "div", DECL html_attrs, NULL, NULL
				963	},
				964	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
				965	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
				966	},
				967	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
				968	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
				969	},
				970	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
				971	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
				972	},
				973	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
				974	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
				975	},
				976	{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
				977	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
				978	},
				979	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
				980	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
				981	},
				982	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
				983	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
				984	},
				985	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
				986	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
				987	},
				988	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
				989	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
				990	},
				991	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
				992	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				993	},
				994	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
				995	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
				996	},
				997	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
				998	DECL select_content, NULL, DECL select_attrs, NULL, NULL
				999	},
				1000	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
				1001	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1002	},
				1003	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
				1004	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1005	},
				1006	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
				1007	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
				1008	},
				1009	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
				1010	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1011	},
				1012	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
				1013	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
				1014	},
				1015	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
				1016	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1017	},
				1018	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
				1019	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1020	},
				1021	{ "table", 0, 0, 0, 0, 0, 0, 0, "",
				1022	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
				1023	},
				1024	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
				1025	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
				1026	},
				1027	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
				1028	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
				1029	},
				1030	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
				1031	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
				1032	},
				1033	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
				1034	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
				1035	},
				1036	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
				1037	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
				1038	},
				1039	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
				1040	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
				1041	},
				1042	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
				1043	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
				1044	},
				1045	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
				1046	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
				1047	},
				1048	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
				1049	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1050	},
				1051	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
				1052	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
				1053	},
				1054	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
				1055	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
				1056	},
				1057	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
				1058	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
				1059	}
				1060	};
				1061
				1062	/*
				1063	* start tags that imply the end of current element
				*
				* Layout: a flat sequence of NULL-terminated groups, ended by one
				* extra NULL. The first name of each group is a start tag; the
				* remaining names of the group are the currently open elements that
				* this start tag closes (see htmlCheckAutoClose()).
				*
				* htmlInitAutoClose() indexes at most 99 groups (the index has 100
				* slots and the last one is kept NULL); groups beyond that limit would
				* be ignored without any diagnostic.
				*
				* FONTSTYLE is a macro defined earlier in the file (not visible in
				* this section); presumably it expands to a list of names - confirm.
				1064	*/
				1065	static const char * const htmlStartClose[] = {
				1066	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				1067	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				1068	"listing", "xmp", "head", NULL,
				1069	"head", "p", NULL,
				1070	"title", "p", NULL,
				1071	"body", "head", "style", "link", "title", "p", NULL,
				1072	"frameset", "head", "style", "link", "title", "p", NULL,
				1073	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				1074	"pre", "listing", "xmp", "head", "li", NULL,
				1075	"hr", "p", "head", NULL,
				1076	"h1", "p", "head", NULL,
				1077	"h2", "p", "head", NULL,
				1078	"h3", "p", "head", NULL,
				1079	"h4", "p", "head", NULL,
				1080	"h5", "p", "head", NULL,
				1081	"h6", "p", "head", NULL,
				1082	"dir", "p", "head", NULL,
				1083	"address", "p", "head", "ul", NULL,
				1084	"pre", "p", "head", "ul", NULL,
				1085	"listing", "p", "head", NULL,
				1086	"xmp", "p", "head", NULL,
				1087	"blockquote", "p", "head", NULL,
				1088	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				1089	"xmp", "head", NULL,
				1090	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				1091	"head", "dd", NULL,
				1092	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				1093	"head", "dt", NULL,
				1094	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				1095	"listing", "xmp", NULL,
				1096	"ol", "p", "head", "ul", NULL,
				1097	"menu", "p", "head", "ul", NULL,
				1098	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
				1099	"div", "p", "head", NULL,
				1100	"noscript", "script", NULL,
				1101	"center", "font", "b", "i", "p", "head", NULL,
				1102	"a", "a", "head", NULL,
				1103	"caption", "p", NULL,
				1104	"colgroup", "caption", "colgroup", "col", "p", NULL,
				1105	"col", "caption", "col", "p", NULL,
				1106	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				1107	"listing", "xmp", "a", NULL,
				1108	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				1109	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				1110	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				1111	"thead", "caption", "col", "colgroup", NULL,
				1112	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				1113	"tbody", "p", NULL,
				1114	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				1115	"tfoot", "tbody", "p", NULL,
				1116	"optgroup", "option", NULL,
				1117	"option", "option", NULL,
				1118	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				1119	"pre", "listing", "xmp", "a", NULL,
				1120	/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
				1121	"tt", "head", NULL,
				1122	"i", "head", NULL,
				1123	"b", "head", NULL,
				1124	"u", "head", NULL,
				1125	"s", "head", NULL,
				1126	"strike", "head", NULL,
				1127	"big", "head", NULL,
				1128	"small", "head", NULL,
				1129
				1130	"em", "head", NULL,
				1131	"strong", "head", NULL,
				1132	"dfn", "head", NULL,
				1133	"code", "head", NULL,
				1134	"samp", "head", NULL,
				1135	"kbd", "head", NULL,
				1136	"var", "head", NULL,
				1137	"cite", "head", NULL,
				1138	"abbr", "head", NULL,
				1139	"acronym", "head", NULL,
				1140
				1141	/* "a" */
				1142	"img", "head", NULL,
				1143	/* "applet" */
				1144	/* "embed" */
				1145	/* "object" */
				1146	"font", "head", NULL,
				1147	/* "basefont" */
				1148	"br", "head", NULL,
				1149	/* "script" */
				1150	"map", "head", NULL,
				1151	"q", "head", NULL,
				1152	"sub", "head", NULL,
				1153	"sup", "head", NULL,
				1154	"span", "head", NULL,
				1155	"bdo", "head", NULL,
				1156	"iframe", "head", NULL,
				1157	NULL
				1158	};
				1159
				1160	/*
				1161	* The list of HTML elements which are not supposed to have
				1162	* CDATA content and where a p element will be implied
				1163	*
				1164	* TODO: extend that list by reading the HTML SGML DTD on
				1165	* implied paragraph
				*
				* The list is NULL-terminated; htmlCheckParagraph() scans it and
				* compares each name against the currently open element.
				1166	*/
				1167	static const char *const htmlNoContentElements[] = {
				1168	"html",
				1169	"head",
				1170	NULL
				1171	};
				1172
				1173	/*
				1174	* The list of HTML attributes which are of content %Script;
				1175	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				1176	* it assumes the name starts with 'on'
				*
				* Unlike the other name lists in this file, this array has no NULL
				* terminator: htmlIsScriptAttribute() derives the element count from
				* sizeof.
				1177	*/
				1178	static const char *const htmlScriptAttributes[] = {
				1179	"onclick",
				1180	"ondblclick",
				1181	"onmousedown",
				1182	"onmouseup",
				1183	"onmouseover",
				1184	"onmousemove",
				1185	"onmouseout",
				1186	"onkeypress",
				1187	"onkeydown",
				1188	"onkeyup",
				1189	"onload",
				1190	"onunload",
				1191	"onfocus",
				1192	"onblur",
				1193	"onsubmit",
				1194	"onreset",
				1195	"onchange",
				1196	"onselect"
				1197	};
				1198
				1199	/*
				1200	* This table is used by the htmlparser to know what to do with
				1201	* broken html pages. By assigning different priorities to different
				1202	* elements the parser can decide how to handle extra endtags.
				1203	* Endtags are only allowed to close elements with lower or equal
				1204	* priority.
				*
				* The table ends with an entry whose name is NULL. That entry is both
				* the end marker and the default: htmlGetEndPriority() returns its
				* priority for every element name that is not listed.
				1205	*/
				1206
				1207	typedef struct {
				1208	const char *name;	/* element name, NULL in the final entry */
				1209	int priority;	/* end tag priority of that element */
				1210	} elementPriority;
				1211
				1212	static const elementPriority htmlEndPriority[] = {
				1213	{"div", 150},
				1214	{"td", 160},
				1215	{"th", 160},
				1216	{"tr", 170},
				1217	{"thead", 180},
				1218	{"tbody", 180},
				1219	{"tfoot", 180},
				1220	{"table", 190},
				1221	{"head", 200},
				1222	{"body", 200},
				1223	{"html", 220},
				1224	{NULL, 100} /* Default priority */
				1225	};
				1226
				/*
				 * Index over htmlStartClose built by htmlInitAutoClose(): each used
				 * slot points at the first name of one group, unused slots are NULL.
				 * The flag is set once the index has been built.
				 */
				1227	static const char** htmlStartCloseIndex[100];
				1228	static int htmlStartCloseIndexinitialized = 0;
				1229
				1230	/************************************************************************
				1231	* *
				1232	* functions to handle HTML specific data *
				1233	* *
				1234	************************************************************************/
				1235
				1236	/**
				1237	* htmlInitAutoClose:
				1238	*
				1239	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				1240	* This is not reentrant. Call xmlInitParser() once before processing in
				1241	* case of use in multithreaded programs.
				1242	*/
				1243	void
				1244	htmlInitAutoClose(void) {
				1245	int indx, i = 0;
				1246
				1247	if (htmlStartCloseIndexinitialized) return;
				1248
				1249	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				1250	indx = 0;
				1251	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				1252	htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
				1253	while (htmlStartClose[i] != NULL) i++;
				1254	i++;
				1255	}
				1256	htmlStartCloseIndexinitialized = 1;
				1257	}
				1258
				1259	/**
				1260	* htmlTagLookup:
				1261	* @tag: The tag name in lowercase
				1262	*
				1263	* Lookup the HTML tag in the ElementTable
				1264	*
				1265	* Returns the related htmlElemDescPtr or NULL if not found.
				1266	*/
				1267	const htmlElemDesc *
				1268	htmlTagLookup(const xmlChar *tag) {
				1269	unsigned int i;
				1270
				1271	for (i = 0; i < (sizeof(html40ElementTable) /
				1272	sizeof(html40ElementTable[0]));i++) {
				1273	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
				1274	return((htmlElemDescPtr) &html40ElementTable[i]);
				1275	}
				1276	return(NULL);
				1277	}
				1278
				1279	/**
				1280	* htmlGetEndPriority:
				1281	* @name: The name of the element to look up the priority for.
				1282	*
				1283	* Return value: The "endtag" priority.
				1284	**/
				1285	static int
				1286	htmlGetEndPriority (const xmlChar *name) {
				1287	int i = 0;
				1288
				1289	while ((htmlEndPriority[i].name != NULL) &&
				1290	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				1291	i++;
				1292
				1293	return(htmlEndPriority[i].priority);
				1294	}
				1295
				1296
				1297	/**
				1298	* htmlCheckAutoClose:
				1299	* @newtag: The new tag name
				1300	* @oldtag: The old tag name
				1301	*
				1302	* Checks whether the new tag is one of the registered valid tags for
				1303	* closing old.
				1304	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				1305	*
				1306	* Returns 0 if no, 1 if yes.
				1307	*/
				1308	static int
				1309	htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
				1310	{
				1311	int i, indx;
				1312	const char **closed = NULL;
				1313
				1314	if (htmlStartCloseIndexinitialized == 0)
				1315	htmlInitAutoClose();
				1316
				1317	/* inefficient, but not a big deal */
				1318	for (indx = 0; indx < 100; indx++) {
				1319	closed = htmlStartCloseIndex[indx];
				1320	if (closed == NULL)
				1321	return (0);
				1322	if (xmlStrEqual(BAD_CAST * closed, newtag))
				1323	break;
				1324	}
				1325
				1326	i = closed - htmlStartClose;
				1327	i++;
				1328	while (htmlStartClose[i] != NULL) {
				1329	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				1330	return (1);
				1331	}
				1332	i++;
				1333	}
				1334	return (0);
				1335	}
				1336
				1337	/**
				1338	* htmlAutoCloseOnClose:
				1339	* @ctxt: an HTML parser context
				1340	* @newtag: The new tag name
				1341	* @force: force the tag closure
				1342	*
				1343	* The HTML DTD allows an ending tag to implicitly close other tags.
				1344	*/
				1345	static void
				1346	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
				1347	{
				1348	const htmlElemDesc *info;
				1349	int i, priority;
				1350
				1351	priority = htmlGetEndPriority(newtag);
				1352
				1353	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
				1354
				1355	if (xmlStrEqual(newtag, ctxt->nameTab[i]))
				1356	break;
				1357	/*
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	1358	* A misplaced endtag can only close elements with lower
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	1359	* or equal priority, so if we find an element with higher
				1360	* priority before we find an element with
				1361	* matching name, we just ignore this endtag
				1362	*/
				1363	if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
				1364	return;
				1365	}
				1366	if (i < 0)
				1367	return;
				1368
				1369	while (!xmlStrEqual(newtag, ctxt->name)) {
				1370	info = htmlTagLookup(ctxt->name);
				1371	if ((info != NULL) && (info->endTag == 3)) {
				1372	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
				1373	"Opening and ending tag mismatch: %s and %s\n",
				1374	newtag, ctxt->name);
				1375	}
				1376	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				1377	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				1378	htmlnamePop(ctxt);
				1379	}
				1380	}
				1381
				1382	/**
				1383	* htmlAutoCloseOnEnd:
				1384	* @ctxt: an HTML parser context
				1385	*
				1386	* Close all remaining tags at the end of the stream
				1387	*/
				1388	static void
				1389	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
				1390	{
				1391	int i;
				1392
				1393	if (ctxt->nameNr == 0)
				1394	return;
				1395	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
				1396	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				1397	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				1398	htmlnamePop(ctxt);
				1399	}
				1400	}
				1401
				1402	/**
				1403	* htmlAutoClose:
				1404	* @ctxt: an HTML parser context
				1405	* @newtag: The new tag name or NULL
				1406	*
				1407	* The HTML DTD allows a tag to implicitly close other tags.
				1408	* The list is kept in htmlStartClose array. This function is
				1409	* called when a new tag has been detected and generates the
				1410	* appropriates closes if possible/needed.
				1411	* If newtag is NULL this mean we are at the end of the resource
				1412	* and we should check
				1413	*/
				1414	static void
				1415	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
				1416	{
				1417	while ((newtag != NULL) && (ctxt->name != NULL) &&
				1418	(htmlCheckAutoClose(newtag, ctxt->name))) {
				1419	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				1420	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				1421	htmlnamePop(ctxt);
				1422	}
				1423	if (newtag == NULL) {
				1424	htmlAutoCloseOnEnd(ctxt);
				1425	return;
				1426	}
				1427	while ((newtag == NULL) && (ctxt->name != NULL) &&
				1428	((xmlStrEqual(ctxt->name, BAD_CAST "head")) \|\|
				1429	(xmlStrEqual(ctxt->name, BAD_CAST "body")) \|\|
				1430	(xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
				1431	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				1432	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				1433	htmlnamePop(ctxt);
				1434	}
				1435	}
				1436
				1437	/**
				1438	* htmlAutoCloseTag:
				1439	* @doc: the HTML document
				1440	* @name: The tag name
				1441	* @elem: the HTML element
				1442	*
				1443	* The HTML DTD allows a tag to implicitly close other tags.
				1444	* The list is kept in htmlStartClose array. This function checks
				1445	* if the element or one of it's children would autoclose the
				1446	* given tag.
				1447	*
				1448	* Returns 1 if autoclose, 0 otherwise
				1449	*/
				1450	int
				1451	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				1452	htmlNodePtr child;
				1453
				1454	if (elem == NULL) return(1);
				1455	if (xmlStrEqual(name, elem->name)) return(0);
				1456	if (htmlCheckAutoClose(elem->name, name)) return(1);
				1457	child = elem->children;
				1458	while (child != NULL) {
				1459	if (htmlAutoCloseTag(doc, name, child)) return(1);
				1460	child = child->next;
				1461	}
				1462	return(0);
				1463	}
				1464
				1465	/**
				1466	* htmlIsAutoClosed:
				1467	* @doc: the HTML document
				1468	* @elem: the HTML element
				1469	*
				1470	* The HTML DTD allows a tag to implicitly close other tags.
				1471	* The list is kept in htmlStartClose array. This function checks
				1472	* if a tag is autoclosed by one of it's child
				1473	*
				1474	* Returns 1 if autoclosed, 0 otherwise
				1475	*/
				1476	int
				1477	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				1478	htmlNodePtr child;
				1479
				1480	if (elem == NULL) return(1);
				1481	child = elem->children;
				1482	while (child != NULL) {
				1483	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				1484	child = child->next;
				1485	}
				1486	return(0);
				1487	}
				1488
				1489	/**
				1490	* htmlCheckImplied:
				1491	* @ctxt: an HTML parser context
				1492	* @newtag: The new tag name
				1493	*
				1494	* The HTML DTD allows a tag to exists only implicitly
				1495	* called when a new tag has been detected and generates the
				1496	* appropriates implicit tags if missing
				1497	*/
				1498	static void
				1499	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				1500	int i;
				1501
				1502	if (ctxt->options & HTML_PARSE_NOIMPLIED)
				1503	return;
				1504	if (!htmlOmittedDefaultValue)
				1505	return;
				1506	if (xmlStrEqual(newtag, BAD_CAST"html"))
				1507	return;
				1508	if (ctxt->nameNr <= 0) {
				1509	htmlnamePush(ctxt, BAD_CAST"html");
				1510	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1511	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				1512	}
				1513	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				1514	return;
				1515	if ((ctxt->nameNr <= 1) &&
				1516	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				1517	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				1518	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				1519	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				1520	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				1521	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				1522	if (ctxt->html >= 3) {
				1523	/* we already saw or generated an <head> before */
				1524	return;
				1525	}
				1526	/*
				1527	* dropped OBJECT ... i you put it first BODY will be
				1528	* assumed !
				1529	*/
				1530	htmlnamePush(ctxt, BAD_CAST"head");
				1531	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1532	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				1533	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				1534	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				1535	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				1536	if (ctxt->html >= 10) {
				1537	/* we already saw or generated a <body> before */
				1538	return;
				1539	}
				1540	for (i = 0;i < ctxt->nameNr;i++) {
				1541	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				1542	return;
				1543	}
				1544	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				1545	return;
				1546	}
				1547	}
				1548
				1549	htmlnamePush(ctxt, BAD_CAST"body");
				1550	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1551	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				1552	}
				1553	}
				1554
				1555	/**
				1556	* htmlCheckParagraph
				1557	* @ctxt: an HTML parser context
				1558	*
				1559	* Check whether a p element need to be implied before inserting
				1560	* characters in the current element.
				1561	*
				1562	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				1563	* in case of error.
				1564	*/
				1565
				1566	static int
				1567	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				1568	const xmlChar *tag;
				1569	int i;
				1570
				1571	if (ctxt == NULL)
				1572	return(-1);
				1573	tag = ctxt->name;
				1574	if (tag == NULL) {
				1575	htmlAutoClose(ctxt, BAD_CAST"p");
				1576	htmlCheckImplied(ctxt, BAD_CAST"p");
				1577	htmlnamePush(ctxt, BAD_CAST"p");
				1578	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1579	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				1580	return(1);
				1581	}
				1582	if (!htmlOmittedDefaultValue)
				1583	return(0);
				1584	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				1585	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				1586	htmlAutoClose(ctxt, BAD_CAST"p");
				1587	htmlCheckImplied(ctxt, BAD_CAST"p");
				1588	htmlnamePush(ctxt, BAD_CAST"p");
				1589	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1590	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				1591	return(1);
				1592	}
				1593	}
				1594	return(0);
				1595	}
				1596
				1597	/**
				1598	* htmlIsScriptAttribute:
				1599	* @name: an attribute name
				1600	*
				1601	* Check if an attribute is of content type Script
				1602	*
				1603	* Returns 1 is the attribute is a script 0 otherwise
				1604	*/
				1605	int
				1606	htmlIsScriptAttribute(const xmlChar *name) {
				1607	unsigned int i;
				1608
				1609	if (name == NULL)
				1610	return(0);
				1611	/*
				1612	* all script attributes start with 'on'
				1613	*/
				1614	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1615	return(0);
				1616	for (i = 0;
				1617	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1618	i++) {
				1619	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1620	return(1);
				1621	}
				1622	return(0);
				1623	}
				1624
				1625	/************************************************************************
				1626	* *
				1627	* The list of HTML predefined entities *
				1628	* *
				1629	************************************************************************/
				1630
				1631
				1632	static const htmlEntityDesc html40EntitiesTable[] = {
				1633	/*
					* Each entry is { Unicode code point, entity name, description }.
					* Entries are listed in ascending code point order:
					* htmlEntityValueLookup() stops scanning at the first larger
					* value, so new entries must keep that order.
					*
				1634	* the 4 absolute ones, plus apostrophe.
				1635	*/
				1636	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1637	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1638	{ 39, "apos", "single quote" },
				1639	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1640	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1641
				1642	/*
				1643	* A bunch still in the 128-255 range
				1644	* Replacing them really depends on the charset used.
				1645	*/
				1646	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1647	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1648	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1649	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1650	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1651	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1652	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1653	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1654	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1655	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1656	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1657	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1658	{ 172, "not", "not sign, U+00AC ISOnum" },
				1659	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1660	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1661	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1662	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1663	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1664	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1665	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1666	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1667	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1668	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1669	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1670	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1671	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1672	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1673	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1674	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1675	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1676	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1677	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1678	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1679	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1680	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1681	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1682	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1683	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1684	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1685	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1686	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1687	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1688	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1689	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1690	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1691	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1692	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1693	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1694	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1695	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1696	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1697	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1698	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1699	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1700	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1701	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1702	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1703	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1704	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1705	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1706	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1707	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1708	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1709	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1710	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1711	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1712	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1713	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1714	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1715	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1716	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1717	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1718	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1719	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1720	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1721	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1722	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1723	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1724	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1725	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1726	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1727	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1728	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1729	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1730	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1731	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1732	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1733	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1734	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1735	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1736	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1737	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1738	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1739	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1740	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1741	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1742
				1743	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1744	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1745	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1746	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1747	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1748
				1749	/*
				1750	* Anything below should really be kept as entity references
				1751	*/
				1752	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1753
				1754	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1755	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1756
				1757	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1758	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1759	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1760	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1761	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1762	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1763	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1764	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1765	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1766	{ 922, "Kappa","greek capital letter kappa, U+039A" },
				1767	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
				1768	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1769	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1770	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1771	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1772	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1773	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1774	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1775	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1776	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1777	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1778	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1779	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1780	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1781
				1782	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1783	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1784	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1785	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1786	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1787	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1788	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1789	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1790	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1791	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1792	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1793	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1794	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1795	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1796	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1797	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1798	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1799	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1800	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1801	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1802	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1803	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1804	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1805	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1806	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1807	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1808	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1809	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1810
				1811	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1812	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1813	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1814	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1815	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1816	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1817	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1818	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1819	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1820	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1821	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1822	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1823	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1824	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1825	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1826	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1827	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1828
				1829	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1830	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1831
				1832	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1833
				1834	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1835	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1836
				1837	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1838	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1839
				1840	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1841	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1842
				1843	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1844
				1845	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1846	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1847	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1848	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1849	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1850	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1851	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1852	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1853	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1854	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1855	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1856	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1857	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1858	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1859	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1860	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1861
				1862	{ 8704, "forall","for all, U+2200 ISOtech" },
				1863	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1864	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1865	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1866	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1867	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1868	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1869	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1870	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1871	{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
				1872	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1873	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1874	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1875	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1876	{ 8734, "infin","infinity, U+221E ISOtech" },
				1877	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1878	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1879	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1880	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1881	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1882	{ 8747, "int", "integral, U+222B ISOtech" },
				1883	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1884	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1885	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1886	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1887	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1888	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1889	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1890	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1891	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1892	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1893	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1894	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1895	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1896	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1897	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1898	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1899	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1900	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1901	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1902	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1903	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1904	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1905	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1906	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1907
				1908	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1909	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1910	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1911	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1912
				1913	};
				1914
				1915	/************************************************************************
				1916	* *
				1917	* Commodity functions to handle entities *
				1918	* *
				1919	************************************************************************/
				1920
				1921	/*
				1922	* Macro used to grow the current buffer.
				1923	*/
				1924	#define growBuffer(buffer) { \
				1925	xmlChar *tmp; \
				1926	buffer##_size *= 2; \
				1927	tmp = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1928	if (tmp == NULL) { \
				1929	htmlErrMemory(ctxt, "growing buffer\n"); \
				1930	xmlFree(buffer); \
				1931	return(NULL); \
				1932	} \
				1933	buffer = tmp; \
				1934	}
				1935
				1936	/**
				1937	* htmlEntityLookup:
				1938	* @name: the entity name
				1939	*
				1940	* Lookup the given entity in EntitiesTable
				1941	*
				1942	* TODO: the linear scan is really ugly, an hash table is really needed.
				1943	*
				1944	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1945	*/
				1946	const htmlEntityDesc *
				1947	htmlEntityLookup(const xmlChar *name) {
				1948	unsigned int i;
				1949
				1950	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1951	sizeof(html40EntitiesTable[0]));i++) {
				1952	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1953	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
				1954	}
				1955	}
				1956	return(NULL);
				1957	}
				1958
				1959	/**
				1960	* htmlEntityValueLookup:
				1961	* @value: the entity's unicode value
				1962	*
				1963	* Lookup the given entity in EntitiesTable
				1964	*
				1965	* TODO: the linear scan is really ugly, an hash table is really needed.
				1966	*
				1967	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1968	*/
				1969	const htmlEntityDesc *
				1970	htmlEntityValueLookup(unsigned int value) {
				1971	unsigned int i;
				1972
				1973	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1974	sizeof(html40EntitiesTable[0]));i++) {
				1975	if (html40EntitiesTable[i].value >= value) {
				1976	if (html40EntitiesTable[i].value > value)
				1977	break;
				1978	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
				1979	}
				1980	}
				1981	return(NULL);
				1982	}
				1983
				1984	/**
				1985	* UTF8ToHtml:
				1986	* @out: a pointer to an array of bytes to store the result
				1987	* @outlen: the length of @out
				1988	* @in: a pointer to an array of UTF-8 chars
				1989	* @inlen: the length of @in
				1990	*
				1991	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1992	* plus HTML entities block of chars out.
				1993	*
				1994	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1995	* The value of @inlen after return is the number of octets consumed
				1996	* as the return value is positive, else unpredictable.
				1997	* The value of @outlen after return is the number of octets consumed.
				1998	*/
				1999	int
				2000	UTF8ToHtml(unsigned char* out, int *outlen,
				2001	const unsigned char* in, int *inlen) {
				2002	const unsigned char* processed = in;
				2003	const unsigned char* outend;
				2004	const unsigned char* outstart = out;
				2005	const unsigned char* instart = in;
				2006	const unsigned char* inend;
				2007	unsigned int c, d;
				2008	int trailing;
				2009
				2010	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL)) return(-1);
				2011	if (in == NULL) {
				2012	/*
				2013	* initialization nothing to do
				2014	*/
				2015	*outlen = 0;
				2016	*inlen = 0;
				2017	return(0);
				2018	}
				2019	inend = in + (*inlen);
				2020	outend = out + (*outlen);
				2021	while (in < inend) {
				2022	d = *in++;
				2023	if (d < 0x80) { c= d; trailing= 0; }
				2024	else if (d < 0xC0) {
				2025	/* trailing byte in leading position */
				2026	*outlen = out - outstart;
				2027	*inlen = processed - instart;
				2028	return(-2);
				2029	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				2030	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				2031	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				2032	else {
				2033	/* no chance for this in Ascii */
				2034	*outlen = out - outstart;
				2035	*inlen = processed - instart;
				2036	return(-2);
				2037	}
				2038
				2039	if (inend - in < trailing) {
				2040	break;
				2041	}
				2042
				2043	for ( ; trailing; trailing--) {
				2044	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				2045	break;
				2046	c <<= 6;
				2047	c \|= d & 0x3F;
				2048	}
				2049
				2050	/* assertion: c is a single UTF-4 value */
				2051	if (c < 0x80) {
				2052	if (out + 1 >= outend)
				2053	break;
				2054	*out++ = c;
				2055	} else {
				2056	int len;
				2057	const htmlEntityDesc * ent;
				2058	const char *cp;
				2059	char nbuf[16];
				2060
				2061	/*
				2062	* Try to lookup a predefined HTML entity for it
				2063	*/
				2064
				2065	ent = htmlEntityValueLookup(c);
				2066	if (ent == NULL) {
				2067	snprintf(nbuf, sizeof(nbuf), "#%u", c);
				2068	cp = nbuf;
				2069	}
				2070	else
				2071	cp = ent->name;
				2072	len = strlen(cp);
				2073	if (out + 2 + len >= outend)
				2074	break;
				2075	*out++ = '&';
				2076	memcpy(out, cp, len);
				2077	out += len;
				2078	*out++ = ';';
				2079	}
				2080	processed = in;
				2081	}
				2082	*outlen = out - outstart;
				2083	*inlen = processed - instart;
				2084	return(0);
				2085	}
				2086
				2087	/**
				2088	* htmlEncodeEntities:
				2089	* @out: a pointer to an array of bytes to store the result
				2090	* @outlen: the length of @out
				2091	* @in: a pointer to an array of UTF-8 chars
				2092	* @inlen: the length of @in
				2093	* @quoteChar: the quote character to escape (' or ") or zero.
				2094	*
				2095	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				2096	* plus HTML entities block of chars out.
				2097	*
				2098	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				2099	* The value of @inlen after return is the number of octets consumed
				2100	* as the return value is positive, else unpredictable.
				2101	* The value of @outlen after return is the number of octets consumed.
				2102	*/
				2103	int
				2104	htmlEncodeEntities(unsigned char* out, int *outlen,
				2105	const unsigned char* in, int *inlen, int quoteChar) {
				2106	const unsigned char* processed = in;
				2107	const unsigned char* outend;
				2108	const unsigned char* outstart = out;
				2109	const unsigned char* instart = in;
				2110	const unsigned char* inend;
				2111	unsigned int c, d;
				2112	int trailing;
				2113
				2114	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL) \|\| (in == NULL))
				2115	return(-1);
				2116	outend = out + (*outlen);
				2117	inend = in + (*inlen);
				2118	while (in < inend) {
				2119	d = *in++;
				2120	if (d < 0x80) { c= d; trailing= 0; }
				2121	else if (d < 0xC0) {
				2122	/* trailing byte in leading position */
				2123	*outlen = out - outstart;
				2124	*inlen = processed - instart;
				2125	return(-2);
				2126	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				2127	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				2128	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				2129	else {
				2130	/* no chance for this in Ascii */
				2131	*outlen = out - outstart;
				2132	*inlen = processed - instart;
				2133	return(-2);
				2134	}
				2135
				2136	if (inend - in < trailing)
				2137	break;
				2138
				2139	while (trailing--) {
				2140	if (((d= *in++) & 0xC0) != 0x80) {
				2141	*outlen = out - outstart;
				2142	*inlen = processed - instart;
				2143	return(-2);
				2144	}
				2145	c <<= 6;
				2146	c \|= d & 0x3F;
				2147	}
				2148
				2149	/* assertion: c is a single UTF-4 value */
				2150	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				2151	(c != '&') && (c != '<') && (c != '>')) {
				2152	if (out >= outend)
				2153	break;
				2154	*out++ = c;
				2155	} else {
				2156	const htmlEntityDesc * ent;
				2157	const char *cp;
				2158	char nbuf[16];
				2159	int len;
				2160
				2161	/*
				2162	* Try to lookup a predefined HTML entity for it
				2163	*/
				2164	ent = htmlEntityValueLookup(c);
				2165	if (ent == NULL) {
				2166	snprintf(nbuf, sizeof(nbuf), "#%u", c);
				2167	cp = nbuf;
				2168	}
				2169	else
				2170	cp = ent->name;
				2171	len = strlen(cp);
				2172	if (out + 2 + len > outend)
				2173	break;
				2174	*out++ = '&';
				2175	memcpy(out, cp, len);
				2176	out += len;
				2177	*out++ = ';';
				2178	}
				2179	processed = in;
				2180	}
				2181	*outlen = out - outstart;
				2182	*inlen = processed - instart;
				2183	return(0);
				2184	}
				2185
				2186	/************************************************************************
				2187	* *
				2188	* Commodity functions to handle streams *
				2189	* *
				2190	************************************************************************/
				2191
#ifdef LIBXML_PUSH_ENABLED
/**
 * htmlNewInputStream:
 * @ctxt:  an HTML parser context
 *
 * Allocate a new input stream structure and reset it: all fields are
 * cleared and the position is set to line 1, column 1. An allocation
 * failure is reported through htmlErrMemory().
 *
 * Returns the new input stream or NULL
 */
static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
    htmlParserInputPtr stream;

    stream = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
    if (stream == NULL) {
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
        return(NULL);
    }
    memset(stream, 0, sizeof(htmlParserInput));

    /* pointer fields */
    stream->filename = NULL;
    stream->directory = NULL;
    stream->base = NULL;
    stream->cur = NULL;
    stream->buf = NULL;
    stream->free = NULL;
    stream->version = NULL;

    /* counters; line and column numbering starts at 1 */
    stream->line = 1;
    stream->col = 1;
    stream->consumed = 0;
    stream->length = 0;

    return(stream);
}
#endif /* LIBXML_PUSH_ENABLED */
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2225
				2226
				2227	/************************************************************************
				2228	* *
				2229	* Commodity functions, cleanup needed ? *
				2230	* *
				2231	************************************************************************/
				2232	/*
				2233	* all tags allowing pc data from the html 4.01 loose dtd
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2234	* NOTE: it might be more appropriate to integrate this information
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2235	* into the html40ElementTable array but I don't want to risk any
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2236	* binary incompatibility
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2237	*/
				2238	static const char *allowPCData[] = {
				2239	"a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
				2240	"blockquote", "body", "button", "caption", "center", "cite", "code",
				2241	"dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
				2242	"h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
				2243	"li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
				2244	"small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
				2245	};
				2246
				2247	/**
				2248	* areBlanks:
				2249	* @ctxt: an HTML parser context
				2250	* @str: a xmlChar *
				2251	* @len: the size of @str
				2252	*
				2253	* Is this a sequence of blank chars that one can ignore ?
				2254	*
				2255	* Returns 1 if ignorable 0 otherwise.
				2256	*/
				2257
				2258	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				2259	unsigned int i;
				2260	int j;
				2261	xmlNodePtr lastChild;
				2262	xmlDtdPtr dtd;
				2263
				2264	for (j = 0;j < len;j++)
				2265	if (!(IS_BLANK_CH(str[j]))) return(0);
				2266
				2267	if (CUR == 0) return(1);
				2268	if (CUR != '<') return(0);
				2269	if (ctxt->name == NULL)
				2270	return(1);
				2271	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				2272	return(1);
				2273	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				2274	return(1);
				2275
				2276	/* Only strip CDATA children of the body tag for strict HTML DTDs */
				2277	if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
				2278	dtd = xmlGetIntSubset(ctxt->myDoc);
				2279	if (dtd != NULL && dtd->ExternalID != NULL) {
				2280	if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") \|\|
				2281	!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
				2282	return(1);
				2283	}
				2284	}
				2285
				2286	if (ctxt->node == NULL) return(0);
				2287	lastChild = xmlGetLastChild(ctxt->node);
				2288	while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
				2289	lastChild = lastChild->prev;
				2290	if (lastChild == NULL) {
				2291	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
				2292	(ctxt->node->content != NULL)) return(0);
				2293	/* keep ws in constructs like ...<b> </b>...
				2294	for all tags "b" allowing PCDATA */
				2295	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
				2296	if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
				2297	return(0);
				2298	}
				2299	}
				2300	} else if (xmlNodeIsText(lastChild)) {
				2301	return(0);
				2302	} else {
				2303	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
				2304	for all tags "p" allowing PCDATA */
				2305	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
				2306	if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
				2307	return(0);
				2308	}
				2309	}
				2310	}
				2311	return(1);
				2312	}
				2313
				2314	/**
				2315	* htmlNewDocNoDtD:
				2316	* @URI: URI for the dtd, or NULL
				2317	* @ExternalID: the external ID of the DTD, or NULL
				2318	*
				2319	* Creates a new HTML document without a DTD node if @URI and @ExternalID
				2320	* are NULL
				2321	*
				2322	* Returns a new document, do not initialize the DTD if not provided
				2323	*/
				2324	htmlDocPtr
				2325	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				2326	xmlDocPtr cur;
				2327
				2328	/*
				2329	* Allocate a new document and fill the fields.
				2330	*/
				2331	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				2332	if (cur == NULL) {
				2333	htmlErrMemory(NULL, "HTML document creation failed\n");
				2334	return(NULL);
				2335	}
				2336	memset(cur, 0, sizeof(xmlDoc));
				2337
				2338	cur->type = XML_HTML_DOCUMENT_NODE;
				2339	cur->version = NULL;
				2340	cur->intSubset = NULL;
				2341	cur->doc = cur;
				2342	cur->name = NULL;
				2343	cur->children = NULL;
				2344	cur->extSubset = NULL;
				2345	cur->oldNs = NULL;
				2346	cur->encoding = NULL;
				2347	cur->standalone = 1;
				2348	cur->compression = 0;
				2349	cur->ids = NULL;
				2350	cur->refs = NULL;
				2351	cur->_private = NULL;
				2352	cur->charset = XML_CHAR_ENCODING_UTF8;
				2353	cur->properties = XML_DOC_HTML \| XML_DOC_USERBUILT;
				2354	if ((ExternalID != NULL) \|\|
				2355	(URI != NULL))
				2356	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
				2357	return(cur);
				2358	}
				2359
				2360	/**
				2361	* htmlNewDoc:
				2362	* @URI: URI for the dtd, or NULL
				2363	* @ExternalID: the external ID of the DTD, or NULL
				2364	*
				2365	* Creates a new HTML document
				2366	*
				2367	* Returns a new document
				2368	*/
				2369	htmlDocPtr
				2370	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				2371	if ((URI == NULL) && (ExternalID == NULL))
				2372	return(htmlNewDocNoDtD(
				2373	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				2374	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
				2375
				2376	return(htmlNewDocNoDtD(URI, ExternalID));
				2377	}
				2378
				2379
				2380	/************************************************************************
				2381	* *
				2382	* The parser itself *
				2383	* Relates to http://www.w3.org/TR/html40 *
				2384	* *
				2385	************************************************************************/
				2386
				2387	/************************************************************************
				2388	* *
				2389	* The parser itself *
				2390	* *
				2391	************************************************************************/
				2392
				2393	static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
				2394
				2395	/**
				2396	* htmlParseHTMLName:
				2397	* @ctxt: an HTML parser context
				2398	*
				2399	* parse an HTML tag or attribute name, note that we convert it to lowercase
				2400	* since HTML names are not case-sensitive.
				2401	*
				2402	* Returns the Tag Name parsed or NULL
				2403	*/
				2404
				2405	static const xmlChar *
				2406	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				2407	int i = 0;
				2408	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				2409
				2410	if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
				2411	(CUR != ':') && (CUR != '.')) return(NULL);
				2412
				2413	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				2414	((IS_ASCII_LETTER(CUR)) \|\| (IS_ASCII_DIGIT(CUR)) \|\|
				2415	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_') \|\|
				2416	(CUR == '.'))) {
				2417	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				2418	else loc[i] = CUR;
				2419	i++;
				2420
				2421	NEXT;
				2422	}
				2423
				2424	return(xmlDictLookup(ctxt->dict, loc, i));
				2425	}
				2426
				2427
				2428	/**
				2429	* htmlParseHTMLName_nonInvasive:
				2430	* @ctxt: an HTML parser context
				2431	*
				2432	* parse an HTML tag or attribute name, note that we convert it to lowercase
				2433	* since HTML names are not case-sensitive, this doesn't consume the data
				2434	* from the stream, it's a look-ahead
				2435	*
				2436	* Returns the Tag Name parsed or NULL
				2437	*/
				2438
				2439	static const xmlChar *
				2440	htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
				2441	int i = 0;
				2442	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				2443
				2444	if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
				2445	(NXT(1) != ':')) return(NULL);
				2446
				2447	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				2448	((IS_ASCII_LETTER(NXT(1+i))) \|\| (IS_ASCII_DIGIT(NXT(1+i))) \|\|
				2449	(NXT(1+i) == ':') \|\| (NXT(1+i) == '-') \|\| (NXT(1+i) == '_'))) {
				2450	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
				2451	else loc[i] = NXT(1+i);
				2452	i++;
				2453	}
				2454
				2455	return(xmlDictLookup(ctxt->dict, loc, i));
				2456	}
				2457
				2458
				2459	/**
				2460	* htmlParseName:
				2461	* @ctxt: an HTML parser context
				2462	*
				2463	* parse an HTML name, this routine is case sensitive.
				2464	*
				2465	* Returns the Name parsed or NULL
				2466	*/
				2467
				2468	static const xmlChar *
				2469	htmlParseName(htmlParserCtxtPtr ctxt) {
				2470	const xmlChar *in;
				2471	const xmlChar *ret;
				2472	int count = 0;
				2473
				2474	GROW;
				2475
				2476	/*
				2477	* Accelerator for simple ASCII names
				2478	*/
				2479	in = ctxt->input->cur;
				2480	if (((in >= 0x61) && (in <= 0x7A)) \|\|
				2481	((in >= 0x41) && (in <= 0x5A)) \|\|
				2482	(in == '_') \|\| (in == ':')) {
				2483	in++;
				2484	while (((in >= 0x61) && (in <= 0x7A)) \|\|
				2485	((in >= 0x41) && (in <= 0x5A)) \|\|
				2486	((in >= 0x30) && (in <= 0x39)) \|\|
				2487	(in == '_') \|\| (in == '-') \|\|
				2488	(in == ':') \|\| (in == '.'))
				2489	in++;
				2490
				2491	if (in == ctxt->input->end)
				2492	return(NULL);
				2493
				2494	if ((in > 0) && (in < 0x80)) {
				2495	count = in - ctxt->input->cur;
				2496	ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
				2497	ctxt->input->cur = in;
				2498	ctxt->nbChars += count;
				2499	ctxt->input->col += count;
				2500	return(ret);
				2501	}
				2502	}
				2503	return(htmlParseNameComplex(ctxt));
				2504	}
				2505
				2506	static const xmlChar *
				2507	htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
				2508	int len = 0, l;
				2509	int c;
				2510	int count = 0;
				2511	const xmlChar *base = ctxt->input->base;
				2512
				2513	/*
				2514	* Handler for more complex cases
				2515	*/
				2516	GROW;
				2517	c = CUR_CHAR(l);
				2518	if ((c == ' ') \|\| (c == '>') \|\| (c == '/') \|\| /* accelerators */
				2519	(!IS_LETTER(c) && (c != '_') &&
				2520	(c != ':'))) {
				2521	return(NULL);
				2522	}
				2523
				2524	while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
				2525	((IS_LETTER(c)) \|\| (IS_DIGIT(c)) \|\|
				2526	(c == '.') \|\| (c == '-') \|\|
				2527	(c == '_') \|\| (c == ':') \|\|
				2528	(IS_COMBINING(c)) \|\|
				2529	(IS_EXTENDER(c)))) {
				2530	if (count++ > 100) {
				2531	count = 0;
				2532	GROW;
				2533	}
				2534	len += l;
				2535	NEXTL(l);
				2536	c = CUR_CHAR(l);
				2537	if (ctxt->input->base != base) {
				2538	/*
				2539	* We changed encoding from an unknown encoding
				2540	* Input buffer changed location, so we better start again
				2541	*/
				2542	return(htmlParseNameComplex(ctxt));
				2543	}
				2544	}
				2545
				2546	if (ctxt->input->cur - ctxt->input->base < len) {
				2547	/* Sanity check */
				2548	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				2549	"unexpected change of input buffer", NULL, NULL);
				2550	return (NULL);
				2551	}
				2552
				2553	return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
				2554	}
				2555
				2556
				2557	/**
				2558	* htmlParseHTMLAttribute:
				2559	* @ctxt: an HTML parser context
				2560	* @stop: a char stop value
				2561	*
				2562	* parse an HTML attribute value till the stop (quote), if
				2563	* stop is 0 then it stops at the first space
				2564	*
				2565	* Returns the attribute parsed or NULL
				2566	*/
				2567
				2568	static xmlChar *
				2569	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				2570	xmlChar *buffer = NULL;
				2571	int buffer_size = 0;
				2572	xmlChar *out = NULL;
				2573	const xmlChar *name = NULL;
				2574	const xmlChar *cur = NULL;
				2575	const htmlEntityDesc * ent;
				2576
				2577	/*
				2578	* allocate a translation buffer.
				2579	*/
				2580	buffer_size = HTML_PARSER_BUFFER_SIZE;
				2581	buffer = (xmlChar ) xmlMallocAtomic(buffer_size sizeof(xmlChar));
				2582	if (buffer == NULL) {
				2583	htmlErrMemory(ctxt, "buffer allocation failed\n");
				2584	return(NULL);
				2585	}
				2586	out = buffer;
				2587
				2588	/*
				2589	* Ok loop until we reach one of the ending chars
				2590	*/
				2591	while ((CUR != 0) && (CUR != stop)) {
				2592	if ((stop == 0) && (CUR == '>')) break;
				2593	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
				2594	if (CUR == '&') {
				2595	if (NXT(1) == '#') {
				2596	unsigned int c;
				2597	int bits;
				2598
				2599	c = htmlParseCharRef(ctxt);
				2600	if (c < 0x80)
				2601	{ *out++ = c; bits= -6; }
				2602	else if (c < 0x800)
				2603	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2604	else if (c < 0x10000)
				2605	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2606	else
				2607	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2608
				2609	for ( ; bits >= 0; bits-= 6) {
				2610	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2611	}
				2612
				2613	if (out - buffer > buffer_size - 100) {
				2614	int indx = out - buffer;
				2615
				2616	growBuffer(buffer);
				2617	out = &buffer[indx];
				2618	}
				2619	} else {
				2620	ent = htmlParseEntityRef(ctxt, &name);
				2621	if (name == NULL) {
				2622	*out++ = '&';
				2623	if (out - buffer > buffer_size - 100) {
				2624	int indx = out - buffer;
				2625
				2626	growBuffer(buffer);
				2627	out = &buffer[indx];
				2628	}
				2629	} else if (ent == NULL) {
				2630	*out++ = '&';
				2631	cur = name;
				2632	while (*cur != 0) {
				2633	if (out - buffer > buffer_size - 100) {
				2634	int indx = out - buffer;
				2635
				2636	growBuffer(buffer);
				2637	out = &buffer[indx];
				2638	}
				2639	out++ = cur++;
				2640	}
				2641	} else {
				2642	unsigned int c;
				2643	int bits;
				2644
				2645	if (out - buffer > buffer_size - 100) {
				2646	int indx = out - buffer;
				2647
				2648	growBuffer(buffer);
				2649	out = &buffer[indx];
				2650	}
				2651	c = ent->value;
				2652	if (c < 0x80)
				2653	{ *out++ = c; bits= -6; }
				2654	else if (c < 0x800)
				2655	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2656	else if (c < 0x10000)
				2657	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2658	else
				2659	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2660
				2661	for ( ; bits >= 0; bits-= 6) {
				2662	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2663	}
				2664	}
				2665	}
				2666	} else {
				2667	unsigned int c;
				2668	int bits, l;
				2669
				2670	if (out - buffer > buffer_size - 100) {
				2671	int indx = out - buffer;
				2672
				2673	growBuffer(buffer);
				2674	out = &buffer[indx];
				2675	}
				2676	c = CUR_CHAR(l);
				2677	if (c < 0x80)
				2678	{ *out++ = c; bits= -6; }
				2679	else if (c < 0x800)
				2680	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2681	else if (c < 0x10000)
				2682	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2683	else
				2684	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2685
				2686	for ( ; bits >= 0; bits-= 6) {
				2687	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2688	}
				2689	NEXT;
				2690	}
				2691	}
				2692	*out = 0;
				2693	return(buffer);
				2694	}
				2695
				2696	/**
				2697	* htmlParseEntityRef:
				2698	* @ctxt: an HTML parser context
				2699	* @str: location to store the entity name
				2700	*
				2701	* parse an HTML ENTITY references
				2702	*
				2703	* [68] EntityRef ::= '&' Name ';'
				2704	*
				2705	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2706	* if non-NULL *str will have to be freed by the caller.
				2707	*/
				2708	const htmlEntityDesc *
				2709	htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
				2710	const xmlChar *name;
				2711	const htmlEntityDesc * ent = NULL;
				2712
				2713	if (str != NULL) *str = NULL;
				2714	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) return(NULL);
				2715
				2716	if (CUR == '&') {
				2717	NEXT;
				2718	name = htmlParseName(ctxt);
				2719	if (name == NULL) {
				2720	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				2721	"htmlParseEntityRef: no name\n", NULL, NULL);
				2722	} else {
				2723	GROW;
				2724	if (CUR == ';') {
				2725	if (str != NULL)
				2726	*str = name;
				2727
				2728	/*
				2729	* Lookup the entity in the table.
				2730	*/
				2731	ent = htmlEntityLookup(name);
				2732	if (ent != NULL) /* OK that's ugly !!! */
				2733	NEXT;
				2734	} else {
				2735	htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
				2736	"htmlParseEntityRef: expecting ';'\n",
				2737	NULL, NULL);
				2738	if (str != NULL)
				2739	*str = name;
				2740	}
				2741	}
				2742	}
				2743	return(ent);
				2744	}
				2745
				2746	/**
				2747	* htmlParseAttValue:
				2748	* @ctxt: an HTML parser context
				2749	*
				2750	* parse a value for an attribute
				2751	* Note: the parser won't do substitution of entities here, this
				2752	* will be handled later in xmlStringGetNodeList, unless it was
				2753	* asked for ctxt->replaceEntities != 0
				2754	*
				2755	* Returns the AttValue parsed or NULL.
				2756	*/
				2757
				2758	static xmlChar *
				2759	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2760	xmlChar *ret = NULL;
				2761
				2762	if (CUR == '"') {
				2763	NEXT;
				2764	ret = htmlParseHTMLAttribute(ctxt, '"');
				2765	if (CUR != '"') {
				2766	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
				2767	"AttValue: \" expected\n", NULL, NULL);
				2768	} else
				2769	NEXT;
				2770	} else if (CUR == '\'') {
				2771	NEXT;
				2772	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2773	if (CUR != '\'') {
				2774	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
				2775	"AttValue: ' expected\n", NULL, NULL);
				2776	} else
				2777	NEXT;
				2778	} else {
				2779	/*
				2780	* That's an HTMLism, the attribute value may not be quoted
				2781	*/
				2782	ret = htmlParseHTMLAttribute(ctxt, 0);
				2783	if (ret == NULL) {
				2784	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
				2785	"AttValue: no value found\n", NULL, NULL);
				2786	}
				2787	}
				2788	return(ret);
				2789	}
				2790
				2791	/**
				2792	* htmlParseSystemLiteral:
				2793	* @ctxt: an HTML parser context
				2794	*
				2795	* parse an HTML Literal
				2796	*
				2797	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2798	*
				2799	* Returns the SystemLiteral parsed or NULL
				2800	*/
				2801
				2802	static xmlChar *
				2803	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2804	size_t len = 0, startPosition = 0;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2805	int err = 0;
				2806	int quote;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2807	xmlChar *ret = NULL;
				2808
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2809	if ((CUR != '"') && (CUR != '\'')) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2810	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2811	"SystemLiteral \" or ' expected\n", NULL, NULL);
				2812	return(NULL);
				2813	}
				2814	quote = CUR;
				2815	NEXT;
				2816
				2817	if (CUR_PTR < BASE_PTR)
				2818	return(ret);
				2819	startPosition = CUR_PTR - BASE_PTR;
				2820
				2821	while ((CUR != 0) && (CUR != quote)) {
				2822	/* TODO: Handle UTF-8 */
				2823	if (!IS_CHAR_CH(CUR)) {
				2824	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				2825	"Invalid char in SystemLiteral 0x%X\n", CUR);
				2826	err = 1;
				2827	}
				2828	NEXT;
				2829	len++;
				2830	}
				2831	if (CUR != quote) {
				2832	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
				2833	"Unfinished SystemLiteral\n", NULL, NULL);
				2834	} else {
				2835	NEXT;
				2836	if (err == 0)
				2837	ret = xmlStrndup((BASE_PTR+startPosition), len);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2838	}
				2839
				2840	return(ret);
				2841	}
				2842
				2843	/**
				2844	* htmlParsePubidLiteral:
				2845	* @ctxt: an HTML parser context
				2846	*
				2847	* parse an HTML public literal
				2848	*
				2849	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2850	*
				2851	* Returns the PubidLiteral parsed or NULL.
				2852	*/
				2853
				2854	static xmlChar *
				2855	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2856	size_t len = 0, startPosition = 0;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2857	int err = 0;
				2858	int quote;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2859	xmlChar *ret = NULL;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2860
				2861	if ((CUR != '"') && (CUR != '\'')) {
				2862	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
				2863	"PubidLiteral \" or ' expected\n", NULL, NULL);
				2864	return(NULL);
				2865	}
				2866	quote = CUR;
				2867	NEXT;
				2868
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2869	/*
				2870	* Name ::= (Letter \| '_') (NameChar)*
				2871	*/
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2872	if (CUR_PTR < BASE_PTR)
				2873	return(ret);
				2874	startPosition = CUR_PTR - BASE_PTR;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2875
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2876	while ((CUR != 0) && (CUR != quote)) {
				2877	if (!IS_PUBIDCHAR_CH(CUR)) {
				2878	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				2879	"Invalid char in PubidLiteral 0x%X\n", CUR);
				2880	err = 1;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2881	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2882	len++;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2883	NEXT;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2884	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2885
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2886	if (CUR != '"') {
				2887	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
				2888	"Unfinished PubidLiteral\n", NULL, NULL);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2889	} else {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2890	NEXT;
				2891	if (err == 0)
				2892	ret = xmlStrndup((BASE_PTR + startPosition), len);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2893	}
				2894
				2895	return(ret);
				2896	}
				2897
				2898	/**
				2899	* htmlParseScript:
				2900	* @ctxt: an HTML parser context
				2901	*
				2902	* parse the content of an HTML SCRIPT or STYLE element
				2903	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2904	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2905	* http://www.w3.org/TR/html4/types.html#type-script
				2906	* http://www.w3.org/TR/html4/types.html#h-6.15
				2907	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2908	*
				2909	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2910	* element and the value of intrinsic event attributes. User agents must
				2911	* not evaluate script data as HTML markup but instead must pass it on as
				2912	* data to a script engine.
				2913	* NOTES:
				2914	* - The content is passed like CDATA
				2915	* - the attributes for style and scripting "onXXX" are also described
				2916	* as CDATA but SGML allows entities references in attributes so their
				2917	* processing is identical as other attributes
				2918	*/
				2919	static void
				2920	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2921	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2922	int nbchar = 0;
				2923	int cur,l;
				2924
				2925	SHRINK;
				2926	cur = CUR_CHAR(l);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2927	while (cur != 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2928	if ((cur == '<') && (NXT(1) == '/')) {
				2929	/*
				2930	* One should break here, the specification is clear:
				2931	* Authors should therefore escape "</" within the content.
				2932	* Escape mechanisms are specific to each scripting or
				2933	* style sheet language.
				2934	*
				2935	* In recovery mode, only break if end tag match the
				2936	* current tag, effectively ignoring all tags inside the
				2937	* script/style block and treating the entire block as
				2938	* CDATA.
				2939	*/
				2940	if (ctxt->recovery) {
				2941	if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
				2942	xmlStrlen(ctxt->name)) == 0)
				2943	{
				2944	break; /* while */
				2945	} else {
				2946	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
				2947	"Element %s embeds close tag\n",
				2948	ctxt->name, NULL);
				2949	}
				2950	} else {
				2951	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2952	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2953	{
				2954	break; /* while */
				2955	}
				2956	}
				2957	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2958	if (IS_CHAR(cur)) {
				2959	COPY_BUF(l,buf,nbchar,cur);
				2960	} else {
				2961	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				2962	"Invalid char in CDATA 0x%X\n", cur);
				2963	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2964	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2965	buf[nbchar] = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2966	if (ctxt->sax->cdataBlock!= NULL) {
				2967	/*
				2968	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2969	*/
				2970	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2971	} else if (ctxt->sax->characters != NULL) {
				2972	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2973	}
				2974	nbchar = 0;
				2975	}
				2976	GROW;
				2977	NEXTL(l);
				2978	cur = CUR_CHAR(l);
				2979	}
				2980
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2981	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	2982	buf[nbchar] = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	2983	if (ctxt->sax->cdataBlock!= NULL) {
				2984	/*
				2985	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2986	*/
				2987	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2988	} else if (ctxt->sax->characters != NULL) {
				2989	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2990	}
				2991	}
				2992	}
				2993
				2994
				2995	/**
				2996	* htmlParseCharDataInternal:
				2997	* @ctxt: an HTML parser context
				2998	* @readahead: optional read ahead character in ascii range
				2999	*
				3000	* parse a CharData section.
				3001	* if we are within a CDATA section ']]>' marks an end of section.
				3002	*
				3003	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				3004	*/
				3005
				3006	static void
				3007	htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
				3008	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
				3009	int nbchar = 0;
				3010	int cur, l;
				3011	int chunk = 0;
				3012
				3013	if (readahead)
				3014	buf[nbchar++] = readahead;
				3015
				3016	SHRINK;
				3017	cur = CUR_CHAR(l);
				3018	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				3019	((cur != '&') \|\| (ctxt->token == '&')) &&
				3020	(cur != 0)) {
				3021	if (!(IS_CHAR(cur))) {
				3022	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				3023	"Invalid char in CDATA 0x%X\n", cur);
				3024	} else {
				3025	COPY_BUF(l,buf,nbchar,cur);
				3026	}
				3027	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3028	buf[nbchar] = 0;
				3029
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3030	/*
				3031	* Ok the segment is to be consumed as chars.
				3032	*/
				3033	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				3034	if (areBlanks(ctxt, buf, nbchar)) {
				3035	if (ctxt->keepBlanks) {
				3036	if (ctxt->sax->characters != NULL)
				3037	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3038	} else {
				3039	if (ctxt->sax->ignorableWhitespace != NULL)
				3040	ctxt->sax->ignorableWhitespace(ctxt->userData,
				3041	buf, nbchar);
				3042	}
				3043	} else {
				3044	htmlCheckParagraph(ctxt);
				3045	if (ctxt->sax->characters != NULL)
				3046	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3047	}
				3048	}
				3049	nbchar = 0;
				3050	}
				3051	NEXTL(l);
				3052	chunk++;
				3053	if (chunk > HTML_PARSER_BUFFER_SIZE) {
				3054	chunk = 0;
				3055	SHRINK;
				3056	GROW;
				3057	}
				3058	cur = CUR_CHAR(l);
				3059	if (cur == 0) {
				3060	SHRINK;
				3061	GROW;
				3062	cur = CUR_CHAR(l);
				3063	}
				3064	}
				3065	if (nbchar != 0) {
				3066	buf[nbchar] = 0;
				3067
				3068	/*
				3069	* Ok the segment is to be consumed as chars.
				3070	*/
				3071	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				3072	if (areBlanks(ctxt, buf, nbchar)) {
				3073	if (ctxt->keepBlanks) {
				3074	if (ctxt->sax->characters != NULL)
				3075	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3076	} else {
				3077	if (ctxt->sax->ignorableWhitespace != NULL)
				3078	ctxt->sax->ignorableWhitespace(ctxt->userData,
				3079	buf, nbchar);
				3080	}
				3081	} else {
				3082	htmlCheckParagraph(ctxt);
				3083	if (ctxt->sax->characters != NULL)
				3084	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				3085	}
				3086	}
				3087	} else {
				3088	/*
				3089	* Loop detection
				3090	*/
				3091	if (cur == 0)
				3092	ctxt->instate = XML_PARSER_EOF;
				3093	}
				3094	}
				3095
/**
 * htmlParseCharData:
 * @ctxt:  an HTML parser context
 *
 * parse a CharData section.
 * if we are within a CDATA section ']]>' marks an end of section.
 *
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
 *
 * This is a convenience wrapper which calls htmlParseCharDataInternal()
 * with no read-ahead character.
 */

static void
htmlParseCharData(htmlParserCtxtPtr ctxt) {
    /* 0 means that no character was read ahead by the caller */
    htmlParseCharDataInternal(ctxt, 0);
}
				3110
				3111	/**
				3112	* htmlParseExternalID:
				3113	* @ctxt: an HTML parser context
				3114	* @publicID: a xmlChar** receiving PubidLiteral
				3115	*
				3116	* Parse an External ID or a Public ID
				3117	*
				3118	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				3119	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				3120	*
				3121	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				3122	*
				3123	* Returns the function returns SystemLiteral and in the second
				3124	* case publicID receives PubidLiteral, is strict is off
				3125	* it is possible to return NULL and have publicID set.
				3126	*/
				3127
				3128	static xmlChar *
				3129	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
				3130	xmlChar *URI = NULL;
				3131
				3132	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				3133	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				3134	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				3135	SKIP(6);
				3136	if (!IS_BLANK_CH(CUR)) {
				3137	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
				3138	"Space required after 'SYSTEM'\n", NULL, NULL);
				3139	}
				3140	SKIP_BLANKS;
				3141	URI = htmlParseSystemLiteral(ctxt);
				3142	if (URI == NULL) {
				3143	htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
				3144	"htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
				3145	}
				3146	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				3147	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				3148	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				3149	SKIP(6);
				3150	if (!IS_BLANK_CH(CUR)) {
				3151	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
				3152	"Space required after 'PUBLIC'\n", NULL, NULL);
				3153	}
				3154	SKIP_BLANKS;
				3155	*publicID = htmlParsePubidLiteral(ctxt);
				3156	if (*publicID == NULL) {
				3157	htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
				3158	"htmlParseExternalID: PUBLIC, no Public Identifier\n",
				3159	NULL, NULL);
				3160	}
				3161	SKIP_BLANKS;
				3162	if ((CUR == '"') \|\| (CUR == '\'')) {
				3163	URI = htmlParseSystemLiteral(ctxt);
				3164	}
				3165	}
				3166	return(URI);
				3167	}
				3168
				3169	/**
				3170	* xmlParsePI:
				3171	* @ctxt: an XML parser context
				3172	*
				3173	* parse an XML Processing Instruction.
				3174	*
				3175	* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
				3176	*/
				3177	static void
				3178	htmlParsePI(htmlParserCtxtPtr ctxt) {
				3179	xmlChar *buf = NULL;
				3180	int len = 0;
				3181	int size = HTML_PARSER_BUFFER_SIZE;
				3182	int cur, l;
				3183	const xmlChar *target;
				3184	xmlParserInputState state;
				3185	int count = 0;
				3186
				3187	if ((RAW == '<') && (NXT(1) == '?')) {
				3188	state = ctxt->instate;
				3189	ctxt->instate = XML_PARSER_PI;
				3190	/*
				3191	* this is a Processing Instruction.
				3192	*/
				3193	SKIP(2);
				3194	SHRINK;
				3195
				3196	/*
				3197	* Parse the target name and check for special support like
				3198	* namespace.
				3199	*/
				3200	target = htmlParseName(ctxt);
				3201	if (target != NULL) {
				3202	if (RAW == '>') {
				3203	SKIP(1);
				3204
				3205	/*
				3206	* SAX: PI detected.
				3207	*/
				3208	if ((ctxt->sax) && (!ctxt->disableSAX) &&
				3209	(ctxt->sax->processingInstruction != NULL))
				3210	ctxt->sax->processingInstruction(ctxt->userData,
				3211	target, NULL);
				3212	ctxt->instate = state;
				3213	return;
				3214	}
				3215	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
				3216	if (buf == NULL) {
				3217	htmlErrMemory(ctxt, NULL);
				3218	ctxt->instate = state;
				3219	return;
				3220	}
				3221	cur = CUR;
				3222	if (!IS_BLANK(cur)) {
				3223	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
				3224	"ParsePI: PI %s space expected\n", target, NULL);
				3225	}
				3226	SKIP_BLANKS;
				3227	cur = CUR_CHAR(l);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3228	while ((cur != 0) && (cur != '>')) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3229	if (len + 5 >= size) {
				3230	xmlChar *tmp;
				3231
				3232	size *= 2;
				3233	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				3234	if (tmp == NULL) {
				3235	htmlErrMemory(ctxt, NULL);
				3236	xmlFree(buf);
				3237	ctxt->instate = state;
				3238	return;
				3239	}
				3240	buf = tmp;
				3241	}
				3242	count++;
				3243	if (count > 50) {
				3244	GROW;
				3245	count = 0;
				3246	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3247	if (IS_CHAR(cur)) {
				3248	COPY_BUF(l,buf,len,cur);
				3249	} else {
				3250	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				3251	"Invalid char in processing instruction "
				3252	"0x%X\n", cur);
				3253	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3254	NEXTL(l);
				3255	cur = CUR_CHAR(l);
				3256	if (cur == 0) {
				3257	SHRINK;
				3258	GROW;
				3259	cur = CUR_CHAR(l);
				3260	}
				3261	}
				3262	buf[len] = 0;
				3263	if (cur != '>') {
				3264	htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
				3265	"ParsePI: PI %s never end ...\n", target, NULL);
				3266	} else {
				3267	SKIP(1);
				3268
				3269	/*
				3270	* SAX: PI detected.
				3271	*/
				3272	if ((ctxt->sax) && (!ctxt->disableSAX) &&
				3273	(ctxt->sax->processingInstruction != NULL))
				3274	ctxt->sax->processingInstruction(ctxt->userData,
				3275	target, buf);
				3276	}
				3277	xmlFree(buf);
				3278	} else {
				3279	htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
				3280	"PI is not started correctly", NULL, NULL);
				3281	}
				3282	ctxt->instate = state;
				3283	}
				3284	}
				3285
				3286	/**
				3287	* htmlParseComment:
				3288	* @ctxt: an HTML parser context
				3289	*
				3290	* Parse an XML (SGML) comment <!-- .... -->
				3291	*
				3292	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				3293	*/
				3294	static void
				3295	htmlParseComment(htmlParserCtxtPtr ctxt) {
				3296	xmlChar *buf = NULL;
				3297	int len;
				3298	int size = HTML_PARSER_BUFFER_SIZE;
				3299	int q, ql;
				3300	int r, rl;
				3301	int cur, l;
				3302	xmlParserInputState state;
				3303
				3304	/*
				3305	* Check that there is a comment right here.
				3306	*/
				3307	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				3308	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				3309
				3310	state = ctxt->instate;
				3311	ctxt->instate = XML_PARSER_COMMENT;
				3312	SHRINK;
				3313	SKIP(4);
				3314	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
				3315	if (buf == NULL) {
				3316	htmlErrMemory(ctxt, "buffer allocation failed\n");
				3317	ctxt->instate = state;
				3318	return;
				3319	}
				3320	len = 0;
				3321	buf[len] = 0;
				3322	q = CUR_CHAR(ql);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3323	if (q == 0)
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3324	goto unfinished;
				3325	NEXTL(ql);
				3326	r = CUR_CHAR(rl);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3327	if (r == 0)
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3328	goto unfinished;
				3329	NEXTL(rl);
				3330	cur = CUR_CHAR(l);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3331	while ((cur != 0) &&
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3332	((cur != '>') \|\|
				3333	(r != '-') \|\| (q != '-'))) {
				3334	if (len + 5 >= size) {
				3335	xmlChar *tmp;
				3336
				3337	size *= 2;
				3338	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				3339	if (tmp == NULL) {
				3340	xmlFree(buf);
				3341	htmlErrMemory(ctxt, "growing buffer failed\n");
				3342	ctxt->instate = state;
				3343	return;
				3344	}
				3345	buf = tmp;
				3346	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3347	if (IS_CHAR(q)) {
				3348	COPY_BUF(ql,buf,len,q);
				3349	} else {
				3350	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				3351	"Invalid char in comment 0x%X\n", q);
				3352	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3353	q = r;
				3354	ql = rl;
				3355	r = cur;
				3356	rl = l;
				3357	NEXTL(l);
				3358	cur = CUR_CHAR(l);
				3359	if (cur == 0) {
				3360	SHRINK;
				3361	GROW;
				3362	cur = CUR_CHAR(l);
				3363	}
				3364	}
				3365	buf[len] = 0;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3366	if (cur == '>') {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3367	NEXT;
				3368	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				3369	(!ctxt->disableSAX))
				3370	ctxt->sax->comment(ctxt->userData, buf);
				3371	xmlFree(buf);
				3372	ctxt->instate = state;
				3373	return;
				3374	}
				3375
				3376	unfinished:
				3377	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
				3378	"Comment not terminated \n<!--%.50s\n", buf, NULL);
				3379	xmlFree(buf);
				3380	}
				3381
				3382	/**
				3383	* htmlParseCharRef:
				3384	* @ctxt: an HTML parser context
				3385	*
				3386	* parse Reference declarations
				3387	*
				3388	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				3389	* '&#x' [0-9a-fA-F]+ ';'
				3390	*
				3391	* Returns the value parsed (as an int)
				3392	*/
				3393	int
				3394	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				3395	int val = 0;
				3396
				3397	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				3398	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				3399	"htmlParseCharRef: context error\n",
				3400	NULL, NULL);
				3401	return(0);
				3402	}
				3403	if ((CUR == '&') && (NXT(1) == '#') &&
				3404	((NXT(2) == 'x') \|\| NXT(2) == 'X')) {
				3405	SKIP(3);
				3406	while (CUR != ';') {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3407	if ((CUR >= '0') && (CUR <= '9')) {
				3408	if (val < 0x110000)
				3409	val = val * 16 + (CUR - '0');
				3410	} else if ((CUR >= 'a') && (CUR <= 'f')) {
				3411	if (val < 0x110000)
				3412	val = val * 16 + (CUR - 'a') + 10;
				3413	} else if ((CUR >= 'A') && (CUR <= 'F')) {
				3414	if (val < 0x110000)
				3415	val = val * 16 + (CUR - 'A') + 10;
				3416	} else {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3417	htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
				3418	"htmlParseCharRef: missing semicolon\n",
				3419	NULL, NULL);
				3420	break;
				3421	}
				3422	NEXT;
				3423	}
				3424	if (CUR == ';')
				3425	NEXT;
				3426	} else if ((CUR == '&') && (NXT(1) == '#')) {
				3427	SKIP(2);
				3428	while (CUR != ';') {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3429	if ((CUR >= '0') && (CUR <= '9')) {
				3430	if (val < 0x110000)
				3431	val = val * 10 + (CUR - '0');
				3432	} else {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3433	htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
				3434	"htmlParseCharRef: missing semicolon\n",
				3435	NULL, NULL);
				3436	break;
				3437	}
				3438	NEXT;
				3439	}
				3440	if (CUR == ';')
				3441	NEXT;
				3442	} else {
				3443	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
				3444	"htmlParseCharRef: invalid value\n", NULL, NULL);
				3445	}
				3446	/*
				3447	* Check the value IS_CHAR ...
				3448	*/
				3449	if (IS_CHAR(val)) {
				3450	return(val);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3451	} else if (val >= 0x110000) {
				3452	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
				3453	"htmlParseCharRef: value too large\n", NULL, NULL);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3454	} else {
				3455	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
				3456	"htmlParseCharRef: invalid xmlChar value %d\n",
				3457	val);
				3458	}
				3459	return(0);
				3460	}
				3461
				3462
				3463	/**
				3464	* htmlParseDocTypeDecl:
				3465	* @ctxt: an HTML parser context
				3466	*
				3467	* parse a DOCTYPE declaration
				3468	*
				3469	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				3470	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				3471	*/
				3472
				3473	static void
				3474	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				3475	const xmlChar *name;
				3476	xmlChar *ExternalID = NULL;
				3477	xmlChar *URI = NULL;
				3478
				3479	/*
				3480	* We know that '<!DOCTYPE' has been detected.
				3481	*/
				3482	SKIP(9);
				3483
				3484	SKIP_BLANKS;
				3485
				3486	/*
				3487	* Parse the DOCTYPE name.
				3488	*/
				3489	name = htmlParseName(ctxt);
				3490	if (name == NULL) {
				3491	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				3492	"htmlParseDocTypeDecl : no DOCTYPE name !\n",
				3493	NULL, NULL);
				3494	}
				3495	/*
				3496	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				3497	*/
				3498
				3499	SKIP_BLANKS;
				3500
				3501	/*
				3502	* Check for SystemID and ExternalID
				3503	*/
				3504	URI = htmlParseExternalID(ctxt, &ExternalID);
				3505	SKIP_BLANKS;
				3506
				3507	/*
				3508	* We should be at the end of the DOCTYPE declaration.
				3509	*/
				3510	if (CUR != '>') {
				3511	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
				3512	"DOCTYPE improperly terminated\n", NULL, NULL);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3513	/* Ignore bogus content */
				3514	while ((CUR != 0) && (CUR != '>'))
				3515	NEXT;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3516	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3517	if (CUR == '>')
				3518	NEXT;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3519
				3520	/*
				3521	* Create or update the document accordingly to the DOCTYPE
				3522	*/
				3523	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				3524	(!ctxt->disableSAX))
				3525	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				3526
				3527	/*
				3528	* Cleanup, since we don't use all those identifiers
				3529	*/
				3530	if (URI != NULL) xmlFree(URI);
				3531	if (ExternalID != NULL) xmlFree(ExternalID);
				3532	}
				3533
				3534	/**
				3535	* htmlParseAttribute:
				3536	* @ctxt: an HTML parser context
				3537	* @value: a xmlChar ** used to store the value of the attribute
				3538	*
				3539	* parse an attribute
				3540	*
				3541	* [41] Attribute ::= Name Eq AttValue
				3542	*
				3543	* [25] Eq ::= S? '=' S?
				3544	*
				3545	* With namespace:
				3546	*
				3547	* [NS 11] Attribute ::= QName Eq AttValue
				3548	*
				3549	* Also the case QName == xmlns:??? is handled independently as a namespace
				3550	* definition.
				3551	*
				3552	* Returns the attribute name, and the value in *value.
				3553	*/
				3554
				3555	static const xmlChar *
				3556	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				3557	const xmlChar *name;
				3558	xmlChar *val = NULL;
				3559
				3560	*value = NULL;
				3561	name = htmlParseHTMLName(ctxt);
				3562	if (name == NULL) {
				3563	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				3564	"error parsing attribute name\n", NULL, NULL);
				3565	return(NULL);
				3566	}
				3567
				3568	/*
				3569	* read the value
				3570	*/
				3571	SKIP_BLANKS;
				3572	if (CUR == '=') {
				3573	NEXT;
				3574	SKIP_BLANKS;
				3575	val = htmlParseAttValue(ctxt);
				3576	}
				3577
				3578	*value = val;
				3579	return(name);
				3580	}
				3581
				3582	/**
				3583	* htmlCheckEncodingDirect:
				3584	* @ctxt: an HTML parser context
				3585	* @attvalue: the attribute value
				3586	*
				3587	* Checks an attribute value to detect
				3588	* the encoding
				3589	* If a new encoding is detected the parser is switched to decode
				3590	* it and pass UTF8
				3591	*/
				3592	static void
				3593	htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
				3594
				3595	if ((ctxt == NULL) \|\| (encoding == NULL) \|\|
				3596	(ctxt->options & HTML_PARSE_IGNORE_ENC))
				3597	return;
				3598
				3599	/* do not change encoding */
				3600	if (ctxt->input->encoding != NULL)
				3601	return;
				3602
				3603	if (encoding != NULL) {
				3604	xmlCharEncoding enc;
				3605	xmlCharEncodingHandlerPtr handler;
				3606
				3607	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				3608
				3609	if (ctxt->input->encoding != NULL)
				3610	xmlFree((xmlChar *) ctxt->input->encoding);
				3611	ctxt->input->encoding = xmlStrdup(encoding);
				3612
				3613	enc = xmlParseCharEncoding((const char *) encoding);
				3614	/*
				3615	* registered set of known encodings
				3616	*/
				3617	if (enc != XML_CHAR_ENCODING_ERROR) {
				3618	if (((enc == XML_CHAR_ENCODING_UTF16LE) \|\|
				3619	(enc == XML_CHAR_ENCODING_UTF16BE) \|\|
				3620	(enc == XML_CHAR_ENCODING_UCS4LE) \|\|
				3621	(enc == XML_CHAR_ENCODING_UCS4BE)) &&
				3622	(ctxt->input->buf != NULL) &&
				3623	(ctxt->input->buf->encoder == NULL)) {
				3624	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				3625	"htmlCheckEncoding: wrong encoding meta\n",
				3626	NULL, NULL);
				3627	} else {
				3628	xmlSwitchEncoding(ctxt, enc);
				3629	}
				3630	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				3631	} else {
				3632	/*
				3633	* fallback for unknown encodings
				3634	*/
				3635	handler = xmlFindCharEncodingHandler((const char *) encoding);
				3636	if (handler != NULL) {
				3637	xmlSwitchToEncoding(ctxt, handler);
				3638	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				3639	} else {
				3640	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
				3641	"htmlCheckEncoding: unknown encoding %s\n",
				3642	encoding, NULL);
				3643	}
				3644	}
				3645
				3646	if ((ctxt->input->buf != NULL) &&
				3647	(ctxt->input->buf->encoder != NULL) &&
				3648	(ctxt->input->buf->raw != NULL) &&
				3649	(ctxt->input->buf->buffer != NULL)) {
				3650	int nbchars;
				3651	int processed;
				3652
				3653	/*
				3654	* convert as much as possible to the parser reading buffer.
				3655	*/
				3656	processed = ctxt->input->cur - ctxt->input->base;
				3657	xmlBufShrink(ctxt->input->buf->buffer, processed);
				3658	nbchars = xmlCharEncInput(ctxt->input->buf, 1);
				3659	xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
				3660	if (nbchars < 0) {
				3661	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				3662	"htmlCheckEncoding: encoder error\n",
				3663	NULL, NULL);
				3664	}
				3665	}
				3666	}
				3667	}
				3668
				3669	/**
				3670	* htmlCheckEncoding:
				3671	* @ctxt: an HTML parser context
				3672	* @attvalue: the attribute value
				3673	*
				3674	* Checks an http-equiv attribute from a Meta tag to detect
				3675	* the encoding
				3676	* If a new encoding is detected the parser is switched to decode
				3677	* it and pass UTF8
				3678	*/
				3679	static void
				3680	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				3681	const xmlChar *encoding;
				3682
				3683	if (!attvalue)
				3684	return;
				3685
				3686	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
				3687	if (encoding != NULL) {
				3688	encoding += 7;
				3689	}
				3690	/*
				3691	* skip blank
				3692	*/
				3693	if (encoding && IS_BLANK_CH(*encoding))
				3694	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
				3695	if (encoding && *encoding == '=') {
				3696	encoding ++;
				3697	htmlCheckEncodingDirect(ctxt, encoding);
				3698	}
				3699	}
				3700
				3701	/**
				3702	* htmlCheckMeta:
				3703	* @ctxt: an HTML parser context
				3704	* @atts: the attributes values
				3705	*
				3706	* Checks an attributes from a Meta tag
				3707	*/
				3708	static void
				3709	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				3710	int i;
				3711	const xmlChar att, value;
				3712	int http = 0;
				3713	const xmlChar *content = NULL;
				3714
				3715	if ((ctxt == NULL) \|\| (atts == NULL))
				3716	return;
				3717
				3718	i = 0;
				3719	att = atts[i++];
				3720	while (att != NULL) {
				3721	value = atts[i++];
				3722	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				3723	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				3724	http = 1;
				3725	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
				3726	htmlCheckEncodingDirect(ctxt, value);
				3727	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				3728	content = value;
				3729	att = atts[i++];
				3730	}
				3731	if ((http) && (content != NULL))
				3732	htmlCheckEncoding(ctxt, content);
				3733
				3734	}
				3735
				3736	/**
				3737	* htmlParseStartTag:
				3738	* @ctxt: an HTML parser context
				3739	*
				3740	* parse a start of tag either for rule element or
				3741	* EmptyElement. In both case we don't parse the tag closing chars.
				3742	*
				3743	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				3744	*
				3745	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				3746	*
				3747	* With namespace:
				3748	*
				3749	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				3750	*
				3751	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				3752	*
				3753	* Returns 0 in case of success, -1 in case of error and 1 if discarded
				3754	*/
				3755
				3756	static int
				3757	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				3758	const xmlChar *name;
				3759	const xmlChar *attname;
				3760	xmlChar *attvalue;
				3761	const xmlChar **atts;
				3762	int nbatts = 0;
				3763	int maxatts;
				3764	int meta = 0;
				3765	int i;
				3766	int discardtag = 0;
				3767
				3768	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				3769	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				3770	"htmlParseStartTag: context error\n", NULL, NULL);
				3771	return -1;
				3772	}
				3773	if (ctxt->instate == XML_PARSER_EOF)
				3774	return(-1);
				3775	if (CUR != '<') return -1;
				3776	NEXT;
				3777
				3778	atts = ctxt->atts;
				3779	maxatts = ctxt->maxatts;
				3780
				3781	GROW;
				3782	name = htmlParseHTMLName(ctxt);
				3783	if (name == NULL) {
				3784	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				3785	"htmlParseStartTag: invalid element name\n",
				3786	NULL, NULL);
				3787	/* if recover preserve text on classic misconstructs */
				3788	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) \|\| (CUR == '<') \|\|
				3789	(CUR == '=') \|\| (CUR == '>') \|\| (((CUR >= '0') && (CUR <= '9'))))) {
				3790	htmlParseCharDataInternal(ctxt, '<');
				3791	return(-1);
				3792	}
				3793
				3794
				3795	/* Dump the bogus tag like browsers do */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3796	while ((CUR != 0) && (CUR != '>') &&
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3797	(ctxt->instate != XML_PARSER_EOF))
				3798	NEXT;
				3799	return -1;
				3800	}
				3801	if (xmlStrEqual(name, BAD_CAST"meta"))
				3802	meta = 1;
				3803
				3804	/*
				3805	* Check for auto-closure of HTML elements.
				3806	*/
				3807	htmlAutoClose(ctxt, name);
				3808
				3809	/*
				3810	* Check for implied HTML elements.
				3811	*/
				3812	htmlCheckImplied(ctxt, name);
				3813
				3814	/*
				3815	* Avoid html at any level > 0, head at any level != 1
				3816	* or any attempt to recurse body
				3817	*/
				3818	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				3819	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				3820	"htmlParseStartTag: misplaced <html> tag\n",
				3821	name, NULL);
				3822	discardtag = 1;
				3823	ctxt->depth++;
				3824	}
				3825	if ((ctxt->nameNr != 1) &&
				3826	(xmlStrEqual(name, BAD_CAST"head"))) {
				3827	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				3828	"htmlParseStartTag: misplaced <head> tag\n",
				3829	name, NULL);
				3830	discardtag = 1;
				3831	ctxt->depth++;
				3832	}
				3833	if (xmlStrEqual(name, BAD_CAST"body")) {
				3834	int indx;
				3835	for (indx = 0;indx < ctxt->nameNr;indx++) {
				3836	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
				3837	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				3838	"htmlParseStartTag: misplaced <body> tag\n",
				3839	name, NULL);
				3840	discardtag = 1;
				3841	ctxt->depth++;
				3842	}
				3843	}
				3844	}
				3845
				3846	/*
				3847	* Now parse the attributes, it ends up with the ending
				3848	*
				3849	* (S Attribute)* S?
				3850	*/
				3851	SKIP_BLANKS;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3852	while ((CUR != 0) &&
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3853	(CUR != '>') &&
				3854	((CUR != '/') \|\| (NXT(1) != '>'))) {
				3855	long cons = ctxt->nbChars;
				3856
				3857	GROW;
				3858	attname = htmlParseAttribute(ctxt, &attvalue);
				3859	if (attname != NULL) {
				3860
				3861	/*
				3862	* Well formedness requires at most one declaration of an attribute
				3863	*/
				3864	for (i = 0; i < nbatts;i += 2) {
				3865	if (xmlStrEqual(atts[i], attname)) {
				3866	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
				3867	"Attribute %s redefined\n", attname, NULL);
				3868	if (attvalue != NULL)
				3869	xmlFree(attvalue);
				3870	goto failed;
				3871	}
				3872	}
				3873
				3874	/*
				3875	* Add the pair to atts
				3876	*/
				3877	if (atts == NULL) {
				3878	maxatts = 22; /* allow for 10 attrs by default */
				3879	atts = (const xmlChar **)
				3880	xmlMalloc(maxatts * sizeof(xmlChar *));
				3881	if (atts == NULL) {
				3882	htmlErrMemory(ctxt, NULL);
				3883	if (attvalue != NULL)
				3884	xmlFree(attvalue);
				3885	goto failed;
				3886	}
				3887	ctxt->atts = atts;
				3888	ctxt->maxatts = maxatts;
				3889	} else if (nbatts + 4 > maxatts) {
				3890	const xmlChar **n;
				3891
				3892	maxatts *= 2;
				3893	n = (const xmlChar *) xmlRealloc((void ) atts,
				3894	maxatts * sizeof(const xmlChar *));
				3895	if (n == NULL) {
				3896	htmlErrMemory(ctxt, NULL);
				3897	if (attvalue != NULL)
				3898	xmlFree(attvalue);
				3899	goto failed;
				3900	}
				3901	atts = n;
				3902	ctxt->atts = atts;
				3903	ctxt->maxatts = maxatts;
				3904	}
				3905	atts[nbatts++] = attname;
				3906	atts[nbatts++] = attvalue;
				3907	atts[nbatts] = NULL;
				3908	atts[nbatts + 1] = NULL;
				3909	}
				3910	else {
				3911	if (attvalue != NULL)
				3912	xmlFree(attvalue);
				3913	/* Dump the bogus attribute string up to the next blank or
				3914	* the end of the tag. */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3915	while ((CUR != 0) &&
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3916	!(IS_BLANK_CH(CUR)) && (CUR != '>') &&
				3917	((CUR != '/') \|\| (NXT(1) != '>')))
				3918	NEXT;
				3919	}
				3920
				3921	failed:
				3922	SKIP_BLANKS;
				3923	if (cons == ctxt->nbChars) {
				3924	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				3925	"htmlParseStartTag: problem parsing attributes\n",
				3926	NULL, NULL);
				3927	break;
				3928	}
				3929	}
				3930
				3931	/*
				3932	* Handle specific association to the META tag
				3933	*/
				3934	if (meta && (nbatts != 0))
				3935	htmlCheckMeta(ctxt, atts);
				3936
				3937	/*
				3938	* SAX: Start of Element !
				3939	*/
				3940	if (!discardtag) {
				3941	htmlnamePush(ctxt, name);
				3942	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
				3943	if (nbatts != 0)
				3944	ctxt->sax->startElement(ctxt->userData, name, atts);
				3945	else
				3946	ctxt->sax->startElement(ctxt->userData, name, NULL);
				3947	}
				3948	}
				3949
				3950	if (atts != NULL) {
				3951	for (i = 1;i < nbatts;i += 2) {
				3952	if (atts[i] != NULL)
				3953	xmlFree((xmlChar *) atts[i]);
				3954	}
				3955	}
				3956
				3957	return(discardtag);
				3958	}
				3959
				3960	/**
				3961	* htmlParseEndTag:
				3962	* @ctxt: an HTML parser context
				3963	*
				3964	* parse an end of tag
				3965	*
				3966	* [42] ETag ::= '</' Name S? '>'
				3967	*
				3968	* With namespace
				3969	*
				3970	* [NS 9] ETag ::= '</' QName S? '>'
				3971	*
				3972	* Returns 1 if the current level should be closed.
				3973	*/
				3974
				3975	static int
				3976	htmlParseEndTag(htmlParserCtxtPtr ctxt)
				3977	{
				3978	const xmlChar *name;
				3979	const xmlChar *oldname;
				3980	int i, ret;
				3981
				3982	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3983	htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
				3984	"htmlParseEndTag: '</' not found\n", NULL, NULL);
				3985	return (0);
				3986	}
				3987	SKIP(2);
				3988
				3989	name = htmlParseHTMLName(ctxt);
				3990	if (name == NULL)
				3991	return (0);
				3992	/*
				3993	* We should definitely be at the ending "S? '>'" part
				3994	*/
				3995	SKIP_BLANKS;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3996	if (CUR != '>') {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	3997	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
				3998	"End tag : expected '>'\n", NULL, NULL);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	3999	/* Skip to next '>' */
				4000	while ((CUR != 0) && (CUR != '>'))
				4001	NEXT;
				4002	}
				4003	if (CUR == '>')
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4004	NEXT;
				4005
				4006	/*
				4007	* if we ignored misplaced tags in htmlParseStartTag don't pop them
				4008	* out now.
				4009	*/
				4010	if ((ctxt->depth > 0) &&
				4011	(xmlStrEqual(name, BAD_CAST "html") \|\|
				4012	xmlStrEqual(name, BAD_CAST "body") \|\|
				4013	xmlStrEqual(name, BAD_CAST "head"))) {
				4014	ctxt->depth--;
				4015	return (0);
				4016	}
				4017
				4018	/*
				4019	* If the name read is not one of the element in the parsing stack
				4020	* then return, it's just an error.
				4021	*/
				4022	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
				4023	if (xmlStrEqual(name, ctxt->nameTab[i]))
				4024	break;
				4025	}
				4026	if (i < 0) {
				4027	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
				4028	"Unexpected end tag : %s\n", name, NULL);
				4029	return (0);
				4030	}
				4031
				4032
				4033	/*
				4034	* Check for auto-closure of HTML elements.
				4035	*/
				4036
				4037	htmlAutoCloseOnClose(ctxt, name);
				4038
				4039	/*
				4040	* Well formedness constraints, opening and closing must match.
				4041	* With the exception that the autoclose may have popped stuff out
				4042	* of the stack.
				4043	*/
				4044	if (!xmlStrEqual(name, ctxt->name)) {
				4045	if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
				4046	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
				4047	"Opening and ending tag mismatch: %s and %s\n",
				4048	name, ctxt->name);
				4049	}
				4050	}
				4051
				4052	/*
				4053	* SAX: End of Tag
				4054	*/
				4055	oldname = ctxt->name;
				4056	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				4057	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4058	ctxt->sax->endElement(ctxt->userData, name);
				4059	htmlNodeInfoPop(ctxt);
				4060	htmlnamePop(ctxt);
				4061	ret = 1;
				4062	} else {
				4063	ret = 0;
				4064	}
				4065
				4066	return (ret);
				4067	}
				4068
				4069
				4070	/**
				4071	* htmlParseReference:
				4072	* @ctxt: an HTML parser context
				4073	*
				4074	* parse and handle entity references in content,
				4075	* this will end-up in a call to character() since this is either a
				4076	* CharRef, or a predefined entity.
				4077	*/
				4078	static void
				4079	htmlParseReference(htmlParserCtxtPtr ctxt) {
				4080	const htmlEntityDesc * ent;
				4081	xmlChar out[6];
				4082	const xmlChar *name;
				4083	if (CUR != '&') return;
				4084
				4085	if (NXT(1) == '#') {
				4086	unsigned int c;
				4087	int bits, i = 0;
				4088
				4089	c = htmlParseCharRef(ctxt);
				4090	if (c == 0)
				4091	return;
				4092
				4093	if (c < 0x80) { out[i++]= c; bits= -6; }
				4094	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				4095	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				4096	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				4097
				4098	for ( ; bits >= 0; bits-= 6) {
				4099	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				4100	}
				4101	out[i] = 0;
				4102
				4103	htmlCheckParagraph(ctxt);
				4104	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4105	ctxt->sax->characters(ctxt->userData, out, i);
				4106	} else {
				4107	ent = htmlParseEntityRef(ctxt, &name);
				4108	if (name == NULL) {
				4109	htmlCheckParagraph(ctxt);
				4110	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4111	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				4112	return;
				4113	}
				4114	if ((ent == NULL) \|\| !(ent->value > 0)) {
				4115	htmlCheckParagraph(ctxt);
				4116	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				4117	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				4118	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				4119	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				4120	}
				4121	} else {
				4122	unsigned int c;
				4123	int bits, i = 0;
				4124
				4125	c = ent->value;
				4126	if (c < 0x80)
				4127	{ out[i++]= c; bits= -6; }
				4128	else if (c < 0x800)
				4129	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				4130	else if (c < 0x10000)
				4131	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				4132	else
				4133	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				4134
				4135	for ( ; bits >= 0; bits-= 6) {
				4136	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				4137	}
				4138	out[i] = 0;
				4139
				4140	htmlCheckParagraph(ctxt);
				4141	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4142	ctxt->sax->characters(ctxt->userData, out, i);
				4143	}
				4144	}
				4145	}
				4146
				4147	/**
				4148	* htmlParseContent:
				4149	* @ctxt: an HTML parser context
				4150	*
				4151	* Parse a content: comment, sub-element, reference or text.
				4152	* Kept for compatibility with old code
				4153	*/
				4154
				4155	static void
				4156	htmlParseContent(htmlParserCtxtPtr ctxt) {
				4157	xmlChar *currentNode;
				4158	int depth;
				4159	const xmlChar *name;
				4160
				4161	currentNode = xmlStrdup(ctxt->name);
				4162	depth = ctxt->nameNr;
				4163	while (1) {
				4164	long cons = ctxt->nbChars;
				4165
				4166	GROW;
				4167
				4168	if (ctxt->instate == XML_PARSER_EOF)
				4169	break;
				4170
				4171	/*
				4172	* Our tag or one of it's parent or children is ending.
				4173	*/
				4174	if ((CUR == '<') && (NXT(1) == '/')) {
				4175	if (htmlParseEndTag(ctxt) &&
				4176	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				4177	if (currentNode != NULL)
				4178	xmlFree(currentNode);
				4179	return;
				4180	}
				4181	continue; /* while */
				4182	}
				4183
				4184	else if ((CUR == '<') &&
				4185	((IS_ASCII_LETTER(NXT(1))) \|\|
				4186	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
				4187	name = htmlParseHTMLName_nonInvasive(ctxt);
				4188	if (name == NULL) {
				4189	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				4190	"htmlParseStartTag: invalid element name\n",
				4191	NULL, NULL);
				4192	/* Dump the bogus tag like browsers do */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	4193	while ((CUR != 0) && (CUR != '>'))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4194	NEXT;
				4195
				4196	if (currentNode != NULL)
				4197	xmlFree(currentNode);
				4198	return;
				4199	}
				4200
				4201	if (ctxt->name != NULL) {
				4202	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
				4203	htmlAutoClose(ctxt, name);
				4204	continue;
				4205	}
				4206	}
				4207	}
				4208
				4209	/*
				4210	* Has this node been popped out during parsing of
				4211	* the next element
				4212	*/
				4213	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				4214	(!xmlStrEqual(currentNode, ctxt->name)))
				4215	{
				4216	if (currentNode != NULL) xmlFree(currentNode);
				4217	return;
				4218	}
				4219
				4220	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				4221	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
				4222	/*
				4223	* Handle SCRIPT/STYLE separately
				4224	*/
				4225	htmlParseScript(ctxt);
				4226	} else {
				4227	/*
				4228	* Sometimes DOCTYPE arrives in the middle of the document
				4229	*/
				4230	if ((CUR == '<') && (NXT(1) == '!') &&
				4231	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4232	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4233	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4234	(UPP(8) == 'E')) {
				4235	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				4236	"Misplaced DOCTYPE declaration\n",
				4237	BAD_CAST "DOCTYPE" , NULL);
				4238	htmlParseDocTypeDecl(ctxt);
				4239	}
				4240
				4241	/*
				4242	* First case : a comment
				4243	*/
				4244	if ((CUR == '<') && (NXT(1) == '!') &&
				4245	(NXT(2) == '-') && (NXT(3) == '-')) {
				4246	htmlParseComment(ctxt);
				4247	}
				4248
				4249	/*
				4250	* Second case : a Processing Instruction.
				4251	*/
				4252	else if ((CUR == '<') && (NXT(1) == '?')) {
				4253	htmlParsePI(ctxt);
				4254	}
				4255
				4256	/*
				4257	* Third case : a sub-element.
				4258	*/
				4259	else if (CUR == '<') {
				4260	htmlParseElement(ctxt);
				4261	}
				4262
				4263	/*
				4264	* Fourth case : a reference. If if has not been resolved,
				4265	* parsing returns it's Name, create the node
				4266	*/
				4267	else if (CUR == '&') {
				4268	htmlParseReference(ctxt);
				4269	}
				4270
				4271	/*
				4272	* Fifth case : end of the resource
				4273	*/
				4274	else if (CUR == 0) {
				4275	htmlAutoCloseOnEnd(ctxt);
				4276	break;
				4277	}
				4278
				4279	/*
				4280	* Last case, text. Note that References are handled directly.
				4281	*/
				4282	else {
				4283	htmlParseCharData(ctxt);
				4284	}
				4285
				4286	if (cons == ctxt->nbChars) {
				4287	if (ctxt->node != NULL) {
				4288	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				4289	"detected an error in element content\n",
				4290	NULL, NULL);
				4291	}
				4292	break;
				4293	}
				4294	}
				4295	GROW;
				4296	}
				4297	if (currentNode != NULL) xmlFree(currentNode);
				4298	}
				4299
				4300	/**
				4301	* htmlParseElement:
				4302	* @ctxt: an HTML parser context
				4303	*
				4304	* parse an HTML element, this is highly recursive
				4305	* this is kept for compatibility with previous code versions
				4306	*
				4307	* [39] element ::= EmptyElemTag \| STag content ETag
				4308	*
				4309	* [41] Attribute ::= Name Eq AttValue
				4310	*/
				4311
				4312	void
				4313	htmlParseElement(htmlParserCtxtPtr ctxt) {
				4314	const xmlChar *name;
				4315	xmlChar *currentNode = NULL;
				4316	const htmlElemDesc * info;
				4317	htmlParserNodeInfo node_info;
				4318	int failed;
				4319	int depth;
				4320	const xmlChar *oldptr;
				4321
				4322	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				4323	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				4324	"htmlParseElement: context error\n", NULL, NULL);
				4325	return;
				4326	}
				4327
				4328	if (ctxt->instate == XML_PARSER_EOF)
				4329	return;
				4330
				4331	/* Capture start position */
				4332	if (ctxt->record_info) {
				4333	node_info.begin_pos = ctxt->input->consumed +
				4334	(CUR_PTR - ctxt->input->base);
				4335	node_info.begin_line = ctxt->input->line;
				4336	}
				4337
				4338	failed = htmlParseStartTag(ctxt);
				4339	name = ctxt->name;
				4340	if ((failed == -1) \|\| (name == NULL)) {
				4341	if (CUR == '>')
				4342	NEXT;
				4343	return;
				4344	}
				4345
				4346	/*
				4347	* Lookup the info for that element.
				4348	*/
				4349	info = htmlTagLookup(name);
				4350	if (info == NULL) {
				4351	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
				4352	"Tag %s invalid\n", name, NULL);
				4353	}
				4354
				4355	/*
				4356	* Check for an Empty Element labeled the XML/SGML way
				4357	*/
				4358	if ((CUR == '/') && (NXT(1) == '>')) {
				4359	SKIP(2);
				4360	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4361	ctxt->sax->endElement(ctxt->userData, name);
				4362	htmlnamePop(ctxt);
				4363	return;
				4364	}
				4365
				4366	if (CUR == '>') {
				4367	NEXT;
				4368	} else {
				4369	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
				4370	"Couldn't find end of Start Tag %s\n", name, NULL);
				4371
				4372	/*
				4373	* end of parsing of this node.
				4374	*/
				4375	if (xmlStrEqual(name, ctxt->name)) {
				4376	nodePop(ctxt);
				4377	htmlnamePop(ctxt);
				4378	}
				4379
				4380	/*
				4381	* Capture end position and add node
				4382	*/
				4383	if (ctxt->record_info) {
				4384	node_info.end_pos = ctxt->input->consumed +
				4385	(CUR_PTR - ctxt->input->base);
				4386	node_info.end_line = ctxt->input->line;
				4387	node_info.node = ctxt->node;
				4388	xmlParserAddNodeInfo(ctxt, &node_info);
				4389	}
				4390	return;
				4391	}
				4392
				4393	/*
				4394	* Check for an Empty Element from DTD definition
				4395	*/
				4396	if ((info != NULL) && (info->empty)) {
				4397	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4398	ctxt->sax->endElement(ctxt->userData, name);
				4399	htmlnamePop(ctxt);
				4400	return;
				4401	}
				4402
				4403	/*
				4404	* Parse the content of the element:
				4405	*/
				4406	currentNode = xmlStrdup(ctxt->name);
				4407	depth = ctxt->nameNr;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	4408	while (CUR != 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4409	oldptr = ctxt->input->cur;
				4410	htmlParseContent(ctxt);
				4411	if (oldptr==ctxt->input->cur) break;
				4412	if (ctxt->nameNr < depth) break;
				4413	}
				4414
				4415	/*
				4416	* Capture end position and add node
				4417	*/
				4418	if ( currentNode != NULL && ctxt->record_info ) {
				4419	node_info.end_pos = ctxt->input->consumed +
				4420	(CUR_PTR - ctxt->input->base);
				4421	node_info.end_line = ctxt->input->line;
				4422	node_info.node = ctxt->node;
				4423	xmlParserAddNodeInfo(ctxt, &node_info);
				4424	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	4425	if (CUR == 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4426	htmlAutoCloseOnEnd(ctxt);
				4427	}
				4428
				4429	if (currentNode != NULL)
				4430	xmlFree(currentNode);
				4431	}
				4432
				4433	static void
				4434	htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
				4435	/*
				4436	* Capture end position and add node
				4437	*/
				4438	if ( ctxt->node != NULL && ctxt->record_info ) {
				4439	ctxt->nodeInfo->end_pos = ctxt->input->consumed +
				4440	(CUR_PTR - ctxt->input->base);
				4441	ctxt->nodeInfo->end_line = ctxt->input->line;
				4442	ctxt->nodeInfo->node = ctxt->node;
				4443	xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
				4444	htmlNodeInfoPop(ctxt);
				4445	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	4446	if (CUR == 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4447	htmlAutoCloseOnEnd(ctxt);
				4448	}
				4449	}
				4450
				4451	/**
				4452	* htmlParseElementInternal:
				4453	* @ctxt: an HTML parser context
				4454	*
				4455	* parse an HTML element, new version, non recursive
				4456	*
				4457	* [39] element ::= EmptyElemTag \| STag content ETag
				4458	*
				4459	* [41] Attribute ::= Name Eq AttValue
				4460	*/
				4461
				4462	static void
				4463	htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
				4464	const xmlChar *name;
				4465	const htmlElemDesc * info;
				4466	htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
				4467	int failed;
				4468
				4469	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				4470	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				4471	"htmlParseElementInternal: context error\n", NULL, NULL);
				4472	return;
				4473	}
				4474
				4475	if (ctxt->instate == XML_PARSER_EOF)
				4476	return;
				4477
				4478	/* Capture start position */
				4479	if (ctxt->record_info) {
				4480	node_info.begin_pos = ctxt->input->consumed +
				4481	(CUR_PTR - ctxt->input->base);
				4482	node_info.begin_line = ctxt->input->line;
				4483	}
				4484
				4485	failed = htmlParseStartTag(ctxt);
				4486	name = ctxt->name;
				4487	if ((failed == -1) \|\| (name == NULL)) {
				4488	if (CUR == '>')
				4489	NEXT;
				4490	return;
				4491	}
				4492
				4493	/*
				4494	* Lookup the info for that element.
				4495	*/
				4496	info = htmlTagLookup(name);
				4497	if (info == NULL) {
				4498	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
				4499	"Tag %s invalid\n", name, NULL);
				4500	}
				4501
				4502	/*
				4503	* Check for an Empty Element labeled the XML/SGML way
				4504	*/
				4505	if ((CUR == '/') && (NXT(1) == '>')) {
				4506	SKIP(2);
				4507	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4508	ctxt->sax->endElement(ctxt->userData, name);
				4509	htmlnamePop(ctxt);
				4510	return;
				4511	}
				4512
				4513	if (CUR == '>') {
				4514	NEXT;
				4515	} else {
				4516	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
				4517	"Couldn't find end of Start Tag %s\n", name, NULL);
				4518
				4519	/*
				4520	* end of parsing of this node.
				4521	*/
				4522	if (xmlStrEqual(name, ctxt->name)) {
				4523	nodePop(ctxt);
				4524	htmlnamePop(ctxt);
				4525	}
				4526
				4527	if (ctxt->record_info)
				4528	htmlNodeInfoPush(ctxt, &node_info);
				4529	htmlParserFinishElementParsing(ctxt);
				4530	return;
				4531	}
				4532
				4533	/*
				4534	* Check for an Empty Element from DTD definition
				4535	*/
				4536	if ((info != NULL) && (info->empty)) {
				4537	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4538	ctxt->sax->endElement(ctxt->userData, name);
				4539	htmlnamePop(ctxt);
				4540	return;
				4541	}
				4542
				4543	if (ctxt->record_info)
				4544	htmlNodeInfoPush(ctxt, &node_info);
				4545	}
				4546
				4547	/**
				4548	* htmlParseContentInternal:
				4549	* @ctxt: an HTML parser context
				4550	*
				4551	* Parse a content: comment, sub-element, reference or text.
				4552	* New version for non recursive htmlParseElementInternal
				4553	*/
				4554
				4555	static void
				4556	htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
				4557	xmlChar *currentNode;
				4558	int depth;
				4559	const xmlChar *name;
				4560
				4561	currentNode = xmlStrdup(ctxt->name);
				4562	depth = ctxt->nameNr;
				4563	while (1) {
				4564	long cons = ctxt->nbChars;
				4565
				4566	GROW;
				4567
				4568	if (ctxt->instate == XML_PARSER_EOF)
				4569	break;
				4570
				4571	/*
				4572	* Our tag or one of it's parent or children is ending.
				4573	*/
				4574	if ((CUR == '<') && (NXT(1) == '/')) {
				4575	if (htmlParseEndTag(ctxt) &&
				4576	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				4577	if (currentNode != NULL)
				4578	xmlFree(currentNode);
				4579
				4580	currentNode = xmlStrdup(ctxt->name);
				4581	depth = ctxt->nameNr;
				4582	}
				4583	continue; /* while */
				4584	}
				4585
				4586	else if ((CUR == '<') &&
				4587	((IS_ASCII_LETTER(NXT(1))) \|\|
				4588	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
				4589	name = htmlParseHTMLName_nonInvasive(ctxt);
				4590	if (name == NULL) {
				4591	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				4592	"htmlParseStartTag: invalid element name\n",
				4593	NULL, NULL);
				4594	/* Dump the bogus tag like browsers do */
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	4595	while ((CUR == 0) && (CUR != '>'))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	4596	NEXT;
				4597
				4598	htmlParserFinishElementParsing(ctxt);
				4599	if (currentNode != NULL)
				4600	xmlFree(currentNode);
				4601
				4602	currentNode = xmlStrdup(ctxt->name);
				4603	depth = ctxt->nameNr;
				4604	continue;
				4605	}
				4606
				4607	if (ctxt->name != NULL) {
				4608	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
				4609	htmlAutoClose(ctxt, name);
				4610	continue;
				4611	}
				4612	}
				4613	}
				4614
				4615	/*
				4616	* Has this node been popped out during parsing of
				4617	* the next element
				4618	*/
				4619	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				4620	(!xmlStrEqual(currentNode, ctxt->name)))
				4621	{
				4622	htmlParserFinishElementParsing(ctxt);
				4623	if (currentNode != NULL) xmlFree(currentNode);
				4624
				4625	currentNode = xmlStrdup(ctxt->name);
				4626	depth = ctxt->nameNr;
				4627	continue;
				4628	}
				4629
				4630	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				4631	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
				4632	/*
				4633	* Handle SCRIPT/STYLE separately
				4634	*/
				4635	htmlParseScript(ctxt);
				4636	} else {
				4637	/*
				4638	* Sometimes DOCTYPE arrives in the middle of the document
				4639	*/
				4640	if ((CUR == '<') && (NXT(1) == '!') &&
				4641	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4642	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4643	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4644	(UPP(8) == 'E')) {
				4645	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				4646	"Misplaced DOCTYPE declaration\n",
				4647	BAD_CAST "DOCTYPE" , NULL);
				4648	htmlParseDocTypeDecl(ctxt);
				4649	}
				4650
				4651	/*
				4652	* First case : a comment
				4653	*/
				4654	if ((CUR == '<') && (NXT(1) == '!') &&
				4655	(NXT(2) == '-') && (NXT(3) == '-')) {
				4656	htmlParseComment(ctxt);
				4657	}
				4658
				4659	/*
				4660	* Second case : a Processing Instruction.
				4661	*/
				4662	else if ((CUR == '<') && (NXT(1) == '?')) {
				4663	htmlParsePI(ctxt);
				4664	}
				4665
				4666	/*
				4667	* Third case : a sub-element.
				4668	*/
				4669	else if (CUR == '<') {
				4670	htmlParseElementInternal(ctxt);
				4671	if (currentNode != NULL) xmlFree(currentNode);
				4672
				4673	currentNode = xmlStrdup(ctxt->name);
				4674	depth = ctxt->nameNr;
				4675	}
				4676
				4677	/*
				4678	* Fourth case : a reference. If if has not been resolved,
				4679	* parsing returns it's Name, create the node
				4680	*/
				4681	else if (CUR == '&') {
				4682	htmlParseReference(ctxt);
				4683	}
				4684
				4685	/*
				4686	* Fifth case : end of the resource
				4687	*/
				4688	else if (CUR == 0) {
				4689	htmlAutoCloseOnEnd(ctxt);
				4690	break;
				4691	}
				4692
				4693	/*
				4694	* Last case, text. Note that References are handled directly.
				4695	*/
				4696	else {
				4697	htmlParseCharData(ctxt);
				4698	}
				4699
				4700	if (cons == ctxt->nbChars) {
				4701	if (ctxt->node != NULL) {
				4702	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				4703	"detected an error in element content\n",
				4704	NULL, NULL);
				4705	}
				4706	break;
				4707	}
				4708	}
				4709	GROW;
				4710	}
				4711	if (currentNode != NULL) xmlFree(currentNode);
				4712	}
				4713
				4714	/**
				4715	* htmlParseContent:
				4716	* @ctxt: an HTML parser context
				4717	*
				4718	* Parse a content: comment, sub-element, reference or text.
				4719	* This is the entry point when called from parser.c
				4720	*/
				4721
				4722	void
				4723	__htmlParseContent(void *ctxt) {
				4724	if (ctxt != NULL)
				4725	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
				4726	}
				4727
				4728	/**
				4729	* htmlParseDocument:
				4730	* @ctxt: an HTML parser context
				4731	*
				4732	* parse an HTML document (and build a tree if using the standard SAX
				4733	* interface).
				4734	*
				4735	* Returns 0, -1 in case of error. the parser context is augmented
				4736	* as a result of the parsing.
				4737	*/
				4738
				4739	int
				4740	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				4741	xmlChar start[4];
				4742	xmlCharEncoding enc;
				4743	xmlDtdPtr dtd;
				4744
				4745	xmlInitParser();
				4746
				4747	htmlDefaultSAXHandlerInit();
				4748
				4749	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				4750	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				4751	"htmlParseDocument: context error\n", NULL, NULL);
				4752	return(XML_ERR_INTERNAL_ERROR);
				4753	}
				4754	ctxt->html = 1;
				4755	ctxt->linenumbers = 1;
				4756	GROW;
				4757	/*
				4758	* SAX: beginning of the document processing.
				4759	*/
				4760	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				4761	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				4762
				4763	if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
				4764	((ctxt->input->end - ctxt->input->cur) >= 4)) {
				4765	/*
				4766	* Get the 4 first bytes and decode the charset
				4767	* if enc != XML_CHAR_ENCODING_NONE
				4768	* plug some encoding conversion routines.
				4769	*/
				4770	start[0] = RAW;
				4771	start[1] = NXT(1);
				4772	start[2] = NXT(2);
				4773	start[3] = NXT(3);
				4774	enc = xmlDetectCharEncoding(&start[0], 4);
				4775	if (enc != XML_CHAR_ENCODING_NONE) {
				4776	xmlSwitchEncoding(ctxt, enc);
				4777	}
				4778	}
				4779
				4780	/*
				4781	* Wipe out everything which is before the first '<'
				4782	*/
				4783	SKIP_BLANKS;
				4784	if (CUR == 0) {
				4785	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
				4786	"Document is empty\n", NULL, NULL);
				4787	}
				4788
				4789	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				4790	ctxt->sax->startDocument(ctxt->userData);
				4791
				4792
				4793	/*
				4794	* Parse possible comments and PIs before any content
				4795	*/
				4796	while (((CUR == '<') && (NXT(1) == '!') &&
				4797	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
				4798	((CUR == '<') && (NXT(1) == '?'))) {
				4799	htmlParseComment(ctxt);
				4800	htmlParsePI(ctxt);
				4801	SKIP_BLANKS;
				4802	}
				4803
				4804
				4805	/*
				4806	* Then possibly doc type declaration(s) and more Misc
				4807	* (doctypedecl Misc*)?
				4808	*/
				4809	if ((CUR == '<') && (NXT(1) == '!') &&
				4810	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4811	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4812	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4813	(UPP(8) == 'E')) {
				4814	htmlParseDocTypeDecl(ctxt);
				4815	}
				4816	SKIP_BLANKS;
				4817
				4818	/*
				4819	* Parse possible comments and PIs before any content
				4820	*/
				4821	while (((CUR == '<') && (NXT(1) == '!') &&
				4822	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
				4823	((CUR == '<') && (NXT(1) == '?'))) {
				4824	htmlParseComment(ctxt);
				4825	htmlParsePI(ctxt);
				4826	SKIP_BLANKS;
				4827	}
				4828
				4829	/*
				4830	* Time to start parsing the tree itself
				4831	*/
				4832	htmlParseContentInternal(ctxt);
				4833
				4834	/*
				4835	* autoclose
				4836	*/
				4837	if (CUR == 0)
				4838	htmlAutoCloseOnEnd(ctxt);
				4839
				4840
				4841	/*
				4842	* SAX: end of the document processing.
				4843	*/
				4844	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4845	ctxt->sax->endDocument(ctxt->userData);
				4846
				4847	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
				4848	dtd = xmlGetIntSubset(ctxt->myDoc);
				4849	if (dtd == NULL)
				4850	ctxt->myDoc->intSubset =
				4851	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
				4852	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4853	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4854	}
				4855	if (! ctxt->wellFormed) return(-1);
				4856	return(0);
				4857	}
				4858
				4859
				4860	/************************************************************************
				4861	* *
				4862	* Parser contexts handling *
				4863	* *
				4864	************************************************************************/
				4865
				4866	/**
				4867	* htmlInitParserCtxt:
				4868	* @ctxt: an HTML parser context
				4869	*
				4870	* Initialize a parser context
				4871	*
				4872	* Returns 0 in case of success and -1 in case of error
				4873	*/
				4874
				4875	static int
				4876	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				4877	{
				4878	htmlSAXHandler *sax;
				4879
				4880	if (ctxt == NULL) return(-1);
				4881	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4882
				4883	ctxt->dict = xmlDictCreate();
				4884	if (ctxt->dict == NULL) {
				4885	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4886	return(-1);
				4887	}
				4888	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				4889	if (sax == NULL) {
				4890	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4891	return(-1);
				4892	}
				4893	else
				4894	memset(sax, 0, sizeof(htmlSAXHandler));
				4895
				4896	/* Allocate the Input stack */
				4897	ctxt->inputTab = (htmlParserInputPtr *)
				4898	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				4899	if (ctxt->inputTab == NULL) {
				4900	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4901	ctxt->inputNr = 0;
				4902	ctxt->inputMax = 0;
				4903	ctxt->input = NULL;
				4904	return(-1);
				4905	}
				4906	ctxt->inputNr = 0;
				4907	ctxt->inputMax = 5;
				4908	ctxt->input = NULL;
				4909	ctxt->version = NULL;
				4910	ctxt->encoding = NULL;
				4911	ctxt->standalone = -1;
				4912	ctxt->instate = XML_PARSER_START;
				4913
				4914	/* Allocate the Node stack */
				4915	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				4916	if (ctxt->nodeTab == NULL) {
				4917	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4918	ctxt->nodeNr = 0;
				4919	ctxt->nodeMax = 0;
				4920	ctxt->node = NULL;
				4921	ctxt->inputNr = 0;
				4922	ctxt->inputMax = 0;
				4923	ctxt->input = NULL;
				4924	return(-1);
				4925	}
				4926	ctxt->nodeNr = 0;
				4927	ctxt->nodeMax = 10;
				4928	ctxt->node = NULL;
				4929
				4930	/* Allocate the Name stack */
				4931	ctxt->nameTab = (const xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				4932	if (ctxt->nameTab == NULL) {
				4933	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
				4934	ctxt->nameNr = 0;
				4935	ctxt->nameMax = 0;
				4936	ctxt->name = NULL;
				4937	ctxt->nodeNr = 0;
				4938	ctxt->nodeMax = 0;
				4939	ctxt->node = NULL;
				4940	ctxt->inputNr = 0;
				4941	ctxt->inputMax = 0;
				4942	ctxt->input = NULL;
				4943	return(-1);
				4944	}
				4945	ctxt->nameNr = 0;
				4946	ctxt->nameMax = 10;
				4947	ctxt->name = NULL;
				4948
				4949	ctxt->nodeInfoTab = NULL;
				4950	ctxt->nodeInfoNr = 0;
				4951	ctxt->nodeInfoMax = 0;
				4952
				4953	if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
				4954	else {
				4955	ctxt->sax = sax;
				4956	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
				4957	}
				4958	ctxt->userData = ctxt;
				4959	ctxt->myDoc = NULL;
				4960	ctxt->wellFormed = 1;
				4961	ctxt->replaceEntities = 0;
				4962	ctxt->linenumbers = xmlLineNumbersDefaultValue;
				4963	ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
				4964	ctxt->html = 1;
				4965	ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
				4966	ctxt->vctxt.userData = ctxt;
				4967	ctxt->vctxt.error = xmlParserValidityError;
				4968	ctxt->vctxt.warning = xmlParserValidityWarning;
				4969	ctxt->record_info = 0;
				4970	ctxt->validate = 0;
				4971	ctxt->nbChars = 0;
				4972	ctxt->checkIndex = 0;
				4973	ctxt->catalogs = NULL;
				4974	xmlInitNodeInfoSeq(&ctxt->node_seq);
				4975	return(0);
				4976	}
				4977
				4978	/**
				4979	* htmlFreeParserCtxt:
				4980	* @ctxt: an HTML parser context
				4981	*
				4982	* Free all the memory used by a parser context. However the parsed
				4983	* document in ctxt->myDoc is not freed.
				4984	*/
				4985
				4986	void
				4987	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				4988	{
				4989	xmlFreeParserCtxt(ctxt);
				4990	}
				4991
				4992	/**
				4993	* htmlNewParserCtxt:
				4994	*
				4995	* Allocate and initialize a new parser context.
				4996	*
				4997	* Returns the htmlParserCtxtPtr or NULL in case of allocation error
				4998	*/
				4999
				5000	htmlParserCtxtPtr
				5001	htmlNewParserCtxt(void)
				5002	{
				5003	xmlParserCtxtPtr ctxt;
				5004
				5005	ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
				5006	if (ctxt == NULL) {
				5007	htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
				5008	return(NULL);
				5009	}
				5010	memset(ctxt, 0, sizeof(xmlParserCtxt));
				5011	if (htmlInitParserCtxt(ctxt) < 0) {
				5012	htmlFreeParserCtxt(ctxt);
				5013	return(NULL);
				5014	}
				5015	return(ctxt);
				5016	}
				5017
				5018	/**
				5019	* htmlCreateMemoryParserCtxt:
				5020	* @buffer: a pointer to a char array
				5021	* @size: the size of the array
				5022	*
				5023	* Create a parser context for an HTML in-memory document.
				5024	*
				5025	* Returns the new parser context or NULL
				5026	*/
				5027	htmlParserCtxtPtr
				5028	htmlCreateMemoryParserCtxt(const char *buffer, int size) {
				5029	xmlParserCtxtPtr ctxt;
				5030	xmlParserInputPtr input;
				5031	xmlParserInputBufferPtr buf;
				5032
				5033	if (buffer == NULL)
				5034	return(NULL);
				5035	if (size <= 0)
				5036	return(NULL);
				5037
				5038	ctxt = htmlNewParserCtxt();
				5039	if (ctxt == NULL)
				5040	return(NULL);
				5041
				5042	buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
				5043	if (buf == NULL) return(NULL);
				5044
				5045	input = xmlNewInputStream(ctxt);
				5046	if (input == NULL) {
				5047	xmlFreeParserCtxt(ctxt);
				5048	return(NULL);
				5049	}
				5050
				5051	input->filename = NULL;
				5052	input->buf = buf;
				5053	xmlBufResetInput(buf->buffer, input);
				5054
				5055	inputPush(ctxt, input);
				5056	return(ctxt);
				5057	}
				5058
				5059	/**
				5060	* htmlCreateDocParserCtxt:
				5061	* @cur: a pointer to an array of xmlChar
				5062	* @encoding: a free form C string describing the HTML document encoding, or NULL
				5063	*
				5064	* Create a parser context for an HTML document.
				5065	*
				5066	* TODO: check the need to add encoding handling there
				5067	*
				5068	* Returns the new parser context or NULL
				5069	*/
				5070	static htmlParserCtxtPtr
				5071	htmlCreateDocParserCtxt(const xmlChar cur, const char encoding) {
				5072	int len;
				5073	htmlParserCtxtPtr ctxt;
				5074
				5075	if (cur == NULL)
				5076	return(NULL);
				5077	len = xmlStrlen(cur);
				5078	ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
				5079	if (ctxt == NULL)
				5080	return(NULL);
				5081
				5082	if (encoding != NULL) {
				5083	xmlCharEncoding enc;
				5084	xmlCharEncodingHandlerPtr handler;
				5085
				5086	if (ctxt->input->encoding != NULL)
				5087	xmlFree((xmlChar *) ctxt->input->encoding);
				5088	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
				5089
				5090	enc = xmlParseCharEncoding(encoding);
				5091	/*
				5092	* registered set of known encodings
				5093	*/
				5094	if (enc != XML_CHAR_ENCODING_ERROR) {
				5095	xmlSwitchEncoding(ctxt, enc);
				5096	if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
				5097	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
				5098	"Unsupported encoding %s\n",
				5099	(const xmlChar *) encoding, NULL);
				5100	}
				5101	} else {
				5102	/*
				5103	* fallback for unknown encodings
				5104	*/
				5105	handler = xmlFindCharEncodingHandler((const char *) encoding);
				5106	if (handler != NULL) {
				5107	xmlSwitchToEncoding(ctxt, handler);
				5108	} else {
				5109	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
				5110	"Unsupported encoding %s\n",
				5111	(const xmlChar *) encoding, NULL);
				5112	}
				5113	}
				5114	}
				5115	return(ctxt);
				5116	}
				5117
				5118	#ifdef LIBXML_PUSH_ENABLED
				5119	/************************************************************************
				5120	* *
				5121	* Progressive parsing interfaces *
				5122	* *
				5123	************************************************************************/
				5124
				5125	/**
				5126	* htmlParseLookupSequence:
				5127	* @ctxt: an HTML parser context
				5128	* @first: the first char to lookup
				5129	* @next: the next char to lookup or zero
				5130	* @third: the next char to lookup or zero
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5131	* @ignoreattrval: skip over attribute values
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5132	*
				5133	* Try to find if a sequence (first, next, third) or just (first next) or
				5134	* (first) is available in the input stream.
				5135	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				5136	* to avoid rescanning sequences of bytes, it DOES change the state of the
				5137	* parser, do not use liberally.
				5138	* This is basically similar to xmlParseLookupSequence()
				5139	*
				5140	* Returns the index to the current parsing point if the full sequence
				5141	* is available, -1 otherwise.
				5142	*/
				5143	static int
				5144	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5145	xmlChar next, xmlChar third, int ignoreattrval)
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5146	{
				5147	int base, len;
				5148	htmlParserInputPtr in;
				5149	const xmlChar *buf;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5150	int invalue = 0;
				5151	char valdellim = 0x0;
				5152
				5153	in = ctxt->input;
				5154	if (in == NULL)
				5155	return (-1);
				5156
				5157	base = in->cur - in->base;
				5158	if (base < 0)
				5159	return (-1);
				5160
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5161	if (ctxt->checkIndex > base) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5162	base = ctxt->checkIndex;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5163	/* Abuse hasPErefs member to restore current state. */
				5164	invalue = ctxt->hasPErefs & 1 ? 1 : 0;
				5165	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5166
				5167	if (in->buf == NULL) {
				5168	buf = in->base;
				5169	len = in->length;
				5170	} else {
				5171	buf = xmlBufContent(in->buf->buffer);
				5172	len = xmlBufUse(in->buf->buffer);
				5173	}
				5174
				5175	/* take into account the sequence length */
				5176	if (third)
				5177	len -= 2;
				5178	else if (next)
				5179	len--;
				5180	for (; base < len; base++) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5181	if (ignoreattrval) {
				5182	if (buf[base] == '"' \|\| buf[base] == '\'') {
				5183	if (invalue) {
				5184	if (buf[base] == valdellim) {
				5185	invalue = 0;
				5186	continue;
				5187	}
				5188	} else {
				5189	valdellim = buf[base];
				5190	invalue = 1;
				5191	continue;
				5192	}
				5193	} else if (invalue) {
				5194	continue;
				5195	}
				5196	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5197	if (buf[base] == first) {
				5198	if (third != 0) {
				5199	if ((buf[base + 1] != next) \|\| (buf[base + 2] != third))
				5200	continue;
				5201	} else if (next != 0) {
				5202	if (buf[base + 1] != next)
				5203	continue;
				5204	}
				5205	ctxt->checkIndex = 0;
				5206	#ifdef DEBUG_PUSH
				5207	if (next == 0)
				5208	xmlGenericError(xmlGenericErrorContext,
				5209	"HPP: lookup '%c' found at %d\n",
				5210	first, base);
				5211	else if (third == 0)
				5212	xmlGenericError(xmlGenericErrorContext,
				5213	"HPP: lookup '%c%c' found at %d\n",
				5214	first, next, base);
				5215	else
				5216	xmlGenericError(xmlGenericErrorContext,
				5217	"HPP: lookup '%c%c%c' found at %d\n",
				5218	first, next, third, base);
				5219	#endif
				5220	return (base - (in->cur - in->base));
				5221	}
				5222	}
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5223	ctxt->checkIndex = base;
				5224	/* Abuse hasPErefs member to track current state. */
				5225	if (invalue)
				5226	ctxt->hasPErefs \|= 1;
				5227	else
				5228	ctxt->hasPErefs &= ~1;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5229	#ifdef DEBUG_PUSH
				5230	if (next == 0)
				5231	xmlGenericError(xmlGenericErrorContext,
				5232	"HPP: lookup '%c' failed\n", first);
				5233	else if (third == 0)
				5234	xmlGenericError(xmlGenericErrorContext,
				5235	"HPP: lookup '%c%c' failed\n", first, next);
				5236	else
				5237	xmlGenericError(xmlGenericErrorContext,
				5238	"HPP: lookup '%c%c%c' failed\n", first, next,
				5239	third);
				5240	#endif
				5241	return (-1);
				5242	}
				5243
				5244	/**
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5245	* htmlParseTryOrFinish:
				5246	* @ctxt: an HTML parser context
				5247	* @terminate: last chunk indicator
				5248	*
				5249	* Try to progress on parsing
				5250	*
				5251	* Returns zero if no parsing was possible
				5252	*/
				5253	static int
				5254	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				5255	int ret = 0;
				5256	htmlParserInputPtr in;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5257	ptrdiff_t avail = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5258	xmlChar cur, next;
				5259
				5260	htmlParserNodeInfo node_info;
				5261
				5262	#ifdef DEBUG_PUSH
				5263	switch (ctxt->instate) {
				5264	case XML_PARSER_EOF:
				5265	xmlGenericError(xmlGenericErrorContext,
				5266	"HPP: try EOF\n"); break;
				5267	case XML_PARSER_START:
				5268	xmlGenericError(xmlGenericErrorContext,
				5269	"HPP: try START\n"); break;
				5270	case XML_PARSER_MISC:
				5271	xmlGenericError(xmlGenericErrorContext,
				5272	"HPP: try MISC\n");break;
				5273	case XML_PARSER_COMMENT:
				5274	xmlGenericError(xmlGenericErrorContext,
				5275	"HPP: try COMMENT\n");break;
				5276	case XML_PARSER_PROLOG:
				5277	xmlGenericError(xmlGenericErrorContext,
				5278	"HPP: try PROLOG\n");break;
				5279	case XML_PARSER_START_TAG:
				5280	xmlGenericError(xmlGenericErrorContext,
				5281	"HPP: try START_TAG\n");break;
				5282	case XML_PARSER_CONTENT:
				5283	xmlGenericError(xmlGenericErrorContext,
				5284	"HPP: try CONTENT\n");break;
				5285	case XML_PARSER_CDATA_SECTION:
				5286	xmlGenericError(xmlGenericErrorContext,
				5287	"HPP: try CDATA_SECTION\n");break;
				5288	case XML_PARSER_END_TAG:
				5289	xmlGenericError(xmlGenericErrorContext,
				5290	"HPP: try END_TAG\n");break;
				5291	case XML_PARSER_ENTITY_DECL:
				5292	xmlGenericError(xmlGenericErrorContext,
				5293	"HPP: try ENTITY_DECL\n");break;
				5294	case XML_PARSER_ENTITY_VALUE:
				5295	xmlGenericError(xmlGenericErrorContext,
				5296	"HPP: try ENTITY_VALUE\n");break;
				5297	case XML_PARSER_ATTRIBUTE_VALUE:
				5298	xmlGenericError(xmlGenericErrorContext,
				5299	"HPP: try ATTRIBUTE_VALUE\n");break;
				5300	case XML_PARSER_DTD:
				5301	xmlGenericError(xmlGenericErrorContext,
				5302	"HPP: try DTD\n");break;
				5303	case XML_PARSER_EPILOG:
				5304	xmlGenericError(xmlGenericErrorContext,
				5305	"HPP: try EPILOG\n");break;
				5306	case XML_PARSER_PI:
				5307	xmlGenericError(xmlGenericErrorContext,
				5308	"HPP: try PI\n");break;
				5309	case XML_PARSER_SYSTEM_LITERAL:
				5310	xmlGenericError(xmlGenericErrorContext,
				5311	"HPP: try SYSTEM_LITERAL\n");break;
				5312	}
				5313	#endif
				5314
				5315	while (1) {
				5316
				5317	in = ctxt->input;
				5318	if (in == NULL) break;
				5319	if (in->buf == NULL)
				5320	avail = in->length - (in->cur - in->base);
				5321	else
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5322	avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
				5323	(in->cur - in->base);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5324	if ((avail == 0) && (terminate)) {
				5325	htmlAutoCloseOnEnd(ctxt);
				5326	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				5327	/*
				5328	* SAX: end of the document processing.
				5329	*/
				5330	ctxt->instate = XML_PARSER_EOF;
				5331	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				5332	ctxt->sax->endDocument(ctxt->userData);
				5333	}
				5334	}
				5335	if (avail < 1)
				5336	goto done;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5337	/*
				5338	* This is done to make progress and avoid an infinite loop
				5339	* if a parsing attempt was aborted by hitting a NUL byte. After
				5340	* changing htmlCurrentChar, this probably isn't necessary anymore.
				5341	* We should consider removing this check.
				5342	*/
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5343	cur = in->cur[0];
				5344	if (cur == 0) {
				5345	SKIP(1);
				5346	continue;
				5347	}
				5348
				5349	switch (ctxt->instate) {
				5350	case XML_PARSER_EOF:
				5351	/*
				5352	* Document parsing is done !
				5353	*/
				5354	goto done;
				5355	case XML_PARSER_START:
				5356	/*
				5357	* Very first chars read from the document flow.
				5358	*/
				5359	cur = in->cur[0];
				5360	if (IS_BLANK_CH(cur)) {
				5361	SKIP_BLANKS;
				5362	if (in->buf == NULL)
				5363	avail = in->length - (in->cur - in->base);
				5364	else
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5365	avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
				5366	(in->cur - in->base);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5367	}
				5368	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				5369	ctxt->sax->setDocumentLocator(ctxt->userData,
				5370	&xmlDefaultSAXLocator);
				5371	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				5372	(!ctxt->disableSAX))
				5373	ctxt->sax->startDocument(ctxt->userData);
				5374
				5375	cur = in->cur[0];
				5376	next = in->cur[1];
				5377	if ((cur == '<') && (next == '!') &&
				5378	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				5379	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				5380	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				5381	(UPP(8) == 'E')) {
				5382	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5383	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5384	goto done;
				5385	#ifdef DEBUG_PUSH
				5386	xmlGenericError(xmlGenericErrorContext,
				5387	"HPP: Parsing internal subset\n");
				5388	#endif
				5389	htmlParseDocTypeDecl(ctxt);
				5390	ctxt->instate = XML_PARSER_PROLOG;
				5391	#ifdef DEBUG_PUSH
				5392	xmlGenericError(xmlGenericErrorContext,
				5393	"HPP: entering PROLOG\n");
				5394	#endif
				5395	} else {
				5396	ctxt->instate = XML_PARSER_MISC;
				5397	#ifdef DEBUG_PUSH
				5398	xmlGenericError(xmlGenericErrorContext,
				5399	"HPP: entering MISC\n");
				5400	#endif
				5401	}
				5402	break;
				5403	case XML_PARSER_MISC:
				5404	SKIP_BLANKS;
				5405	if (in->buf == NULL)
				5406	avail = in->length - (in->cur - in->base);
				5407	else
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5408	avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
				5409	(in->cur - in->base);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5410	/*
				5411	* no chars in buffer
				5412	*/
				5413	if (avail < 1)
				5414	goto done;
				5415	/*
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5416	* not enough chars in buffer
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5417	*/
				5418	if (avail < 2) {
				5419	if (!terminate)
				5420	goto done;
				5421	else
				5422	next = ' ';
				5423	} else {
				5424	next = in->cur[1];
				5425	}
				5426	cur = in->cur[0];
				5427	if ((cur == '<') && (next == '!') &&
				5428	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				5429	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5430	(htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5431	goto done;
				5432	#ifdef DEBUG_PUSH
				5433	xmlGenericError(xmlGenericErrorContext,
				5434	"HPP: Parsing Comment\n");
				5435	#endif
				5436	htmlParseComment(ctxt);
				5437	ctxt->instate = XML_PARSER_MISC;
				5438	} else if ((cur == '<') && (next == '?')) {
				5439	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5440	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5441	goto done;
				5442	#ifdef DEBUG_PUSH
				5443	xmlGenericError(xmlGenericErrorContext,
				5444	"HPP: Parsing PI\n");
				5445	#endif
				5446	htmlParsePI(ctxt);
				5447	ctxt->instate = XML_PARSER_MISC;
				5448	} else if ((cur == '<') && (next == '!') &&
				5449	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				5450	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				5451	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				5452	(UPP(8) == 'E')) {
				5453	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5454	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5455	goto done;
				5456	#ifdef DEBUG_PUSH
				5457	xmlGenericError(xmlGenericErrorContext,
				5458	"HPP: Parsing internal subset\n");
				5459	#endif
				5460	htmlParseDocTypeDecl(ctxt);
				5461	ctxt->instate = XML_PARSER_PROLOG;
				5462	#ifdef DEBUG_PUSH
				5463	xmlGenericError(xmlGenericErrorContext,
				5464	"HPP: entering PROLOG\n");
				5465	#endif
				5466	} else if ((cur == '<') && (next == '!') &&
				5467	(avail < 9)) {
				5468	goto done;
				5469	} else {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5470	ctxt->instate = XML_PARSER_CONTENT;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5471	#ifdef DEBUG_PUSH
				5472	xmlGenericError(xmlGenericErrorContext,
				5473	"HPP: entering START_TAG\n");
				5474	#endif
				5475	}
				5476	break;
				5477	case XML_PARSER_PROLOG:
				5478	SKIP_BLANKS;
				5479	if (in->buf == NULL)
				5480	avail = in->length - (in->cur - in->base);
				5481	else
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5482	avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
				5483	(in->cur - in->base);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5484	if (avail < 2)
				5485	goto done;
				5486	cur = in->cur[0];
				5487	next = in->cur[1];
				5488	if ((cur == '<') && (next == '!') &&
				5489	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				5490	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5491	(htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5492	goto done;
				5493	#ifdef DEBUG_PUSH
				5494	xmlGenericError(xmlGenericErrorContext,
				5495	"HPP: Parsing Comment\n");
				5496	#endif
				5497	htmlParseComment(ctxt);
				5498	ctxt->instate = XML_PARSER_PROLOG;
				5499	} else if ((cur == '<') && (next == '?')) {
				5500	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5501	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5502	goto done;
				5503	#ifdef DEBUG_PUSH
				5504	xmlGenericError(xmlGenericErrorContext,
				5505	"HPP: Parsing PI\n");
				5506	#endif
				5507	htmlParsePI(ctxt);
				5508	ctxt->instate = XML_PARSER_PROLOG;
				5509	} else if ((cur == '<') && (next == '!') &&
				5510	(avail < 4)) {
				5511	goto done;
				5512	} else {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5513	ctxt->instate = XML_PARSER_CONTENT;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5514	#ifdef DEBUG_PUSH
				5515	xmlGenericError(xmlGenericErrorContext,
				5516	"HPP: entering START_TAG\n");
				5517	#endif
				5518	}
				5519	break;
				5520	case XML_PARSER_EPILOG:
				5521	if (in->buf == NULL)
				5522	avail = in->length - (in->cur - in->base);
				5523	else
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5524	avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
				5525	(in->cur - in->base);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5526	if (avail < 1)
				5527	goto done;
				5528	cur = in->cur[0];
				5529	if (IS_BLANK_CH(cur)) {
				5530	htmlParseCharData(ctxt);
				5531	goto done;
				5532	}
				5533	if (avail < 2)
				5534	goto done;
				5535	next = in->cur[1];
				5536	if ((cur == '<') && (next == '!') &&
				5537	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				5538	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5539	(htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5540	goto done;
				5541	#ifdef DEBUG_PUSH
				5542	xmlGenericError(xmlGenericErrorContext,
				5543	"HPP: Parsing Comment\n");
				5544	#endif
				5545	htmlParseComment(ctxt);
				5546	ctxt->instate = XML_PARSER_EPILOG;
				5547	} else if ((cur == '<') && (next == '?')) {
				5548	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5549	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5550	goto done;
				5551	#ifdef DEBUG_PUSH
				5552	xmlGenericError(xmlGenericErrorContext,
				5553	"HPP: Parsing PI\n");
				5554	#endif
				5555	htmlParsePI(ctxt);
				5556	ctxt->instate = XML_PARSER_EPILOG;
				5557	} else if ((cur == '<') && (next == '!') &&
				5558	(avail < 4)) {
				5559	goto done;
				5560	} else {
				5561	ctxt->errNo = XML_ERR_DOCUMENT_END;
				5562	ctxt->wellFormed = 0;
				5563	ctxt->instate = XML_PARSER_EOF;
				5564	#ifdef DEBUG_PUSH
				5565	xmlGenericError(xmlGenericErrorContext,
				5566	"HPP: entering EOF\n");
				5567	#endif
				5568	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				5569	ctxt->sax->endDocument(ctxt->userData);
				5570	goto done;
				5571	}
				5572	break;
				5573	case XML_PARSER_START_TAG: {
				5574	const xmlChar *name;
				5575	int failed;
				5576	const htmlElemDesc * info;
				5577
				5578	/*
				5579	* no chars in buffer
				5580	*/
				5581	if (avail < 1)
				5582	goto done;
				5583	/*
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5584	* not enough chars in buffer
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5585	*/
				5586	if (avail < 2) {
				5587	if (!terminate)
				5588	goto done;
				5589	else
				5590	next = ' ';
				5591	} else {
				5592	next = in->cur[1];
				5593	}
				5594	cur = in->cur[0];
				5595	if (cur != '<') {
				5596	ctxt->instate = XML_PARSER_CONTENT;
				5597	#ifdef DEBUG_PUSH
				5598	xmlGenericError(xmlGenericErrorContext,
				5599	"HPP: entering CONTENT\n");
				5600	#endif
				5601	break;
				5602	}
				5603	if (next == '/') {
				5604	ctxt->instate = XML_PARSER_END_TAG;
				5605	ctxt->checkIndex = 0;
				5606	#ifdef DEBUG_PUSH
				5607	xmlGenericError(xmlGenericErrorContext,
				5608	"HPP: entering END_TAG\n");
				5609	#endif
				5610	break;
				5611	}
				5612	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5613	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5614	goto done;
				5615
				5616	/* Capture start position */
				5617	if (ctxt->record_info) {
				5618	node_info.begin_pos = ctxt->input->consumed +
				5619	(CUR_PTR - ctxt->input->base);
				5620	node_info.begin_line = ctxt->input->line;
				5621	}
				5622
				5623
				5624	failed = htmlParseStartTag(ctxt);
				5625	name = ctxt->name;
				5626	if ((failed == -1) \|\|
				5627	(name == NULL)) {
				5628	if (CUR == '>')
				5629	NEXT;
				5630	break;
				5631	}
				5632
				5633	/*
				5634	* Lookup the info for that element.
				5635	*/
				5636	info = htmlTagLookup(name);
				5637	if (info == NULL) {
				5638	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
				5639	"Tag %s invalid\n", name, NULL);
				5640	}
				5641
				5642	/*
				5643	* Check for an Empty Element labeled the XML/SGML way
				5644	*/
				5645	if ((CUR == '/') && (NXT(1) == '>')) {
				5646	SKIP(2);
				5647	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				5648	ctxt->sax->endElement(ctxt->userData, name);
				5649	htmlnamePop(ctxt);
				5650	ctxt->instate = XML_PARSER_CONTENT;
				5651	#ifdef DEBUG_PUSH
				5652	xmlGenericError(xmlGenericErrorContext,
				5653	"HPP: entering CONTENT\n");
				5654	#endif
				5655	break;
				5656	}
				5657
				5658	if (CUR == '>') {
				5659	NEXT;
				5660	} else {
				5661	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
				5662	"Couldn't find end of Start Tag %s\n",
				5663	name, NULL);
				5664
				5665	/*
				5666	* end of parsing of this node.
				5667	*/
				5668	if (xmlStrEqual(name, ctxt->name)) {
				5669	nodePop(ctxt);
				5670	htmlnamePop(ctxt);
				5671	}
				5672
				5673	if (ctxt->record_info)
				5674	htmlNodeInfoPush(ctxt, &node_info);
				5675
				5676	ctxt->instate = XML_PARSER_CONTENT;
				5677	#ifdef DEBUG_PUSH
				5678	xmlGenericError(xmlGenericErrorContext,
				5679	"HPP: entering CONTENT\n");
				5680	#endif
				5681	break;
				5682	}
				5683
				5684	/*
				5685	* Check for an Empty Element from DTD definition
				5686	*/
				5687	if ((info != NULL) && (info->empty)) {
				5688	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				5689	ctxt->sax->endElement(ctxt->userData, name);
				5690	htmlnamePop(ctxt);
				5691	}
				5692
				5693	if (ctxt->record_info)
				5694	htmlNodeInfoPush(ctxt, &node_info);
				5695
				5696	ctxt->instate = XML_PARSER_CONTENT;
				5697	#ifdef DEBUG_PUSH
				5698	xmlGenericError(xmlGenericErrorContext,
				5699	"HPP: entering CONTENT\n");
				5700	#endif
				5701	break;
				5702	}
				5703	case XML_PARSER_CONTENT: {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5704	xmlChar chr[2] = { 0, 0 };
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5705	long cons;
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5706
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5707	/*
				5708	* Handle preparsed entities and charRef
				5709	*/
				5710	if (ctxt->token != 0) {
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5711	chr[0] = (xmlChar) ctxt->token;
				5712	htmlCheckParagraph(ctxt);
				5713	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				5714	ctxt->sax->characters(ctxt->userData, chr, 1);
				5715	ctxt->token = 0;
				5716	ctxt->checkIndex = 0;
				5717	}
				5718	if ((avail == 1) && (terminate)) {
				5719	cur = in->cur[0];
				5720	if ((cur != '<') && (cur != '&')) {
				5721	if (ctxt->sax != NULL) {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5722	chr[0] = cur;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5723	if (IS_BLANK_CH(cur)) {
				5724	if (ctxt->keepBlanks) {
				5725	if (ctxt->sax->characters != NULL)
				5726	ctxt->sax->characters(
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5727	ctxt->userData, chr, 1);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5728	} else {
				5729	if (ctxt->sax->ignorableWhitespace != NULL)
				5730	ctxt->sax->ignorableWhitespace(
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5731	ctxt->userData, chr, 1);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5732	}
				5733	} else {
				5734	htmlCheckParagraph(ctxt);
				5735	if (ctxt->sax->characters != NULL)
				5736	ctxt->sax->characters(
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5737	ctxt->userData, chr, 1);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5738	}
				5739	}
				5740	ctxt->token = 0;
				5741	ctxt->checkIndex = 0;
				5742	in->cur++;
				5743	break;
				5744	}
				5745	}
				5746	if (avail < 2)
				5747	goto done;
				5748	cur = in->cur[0];
				5749	next = in->cur[1];
				5750	cons = ctxt->nbChars;
				5751	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				5752	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				5753	/*
				5754	* Handle SCRIPT/STYLE separately
				5755	*/
				5756	if (!terminate) {
				5757	int idx;
				5758	xmlChar val;
				5759
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5760	idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5761	if (idx < 0)
				5762	goto done;
				5763	val = in->cur[idx + 2];
				5764	if (val == 0) /* bad cut of input */
				5765	goto done;
				5766	}
				5767	htmlParseScript(ctxt);
				5768	if ((cur == '<') && (next == '/')) {
				5769	ctxt->instate = XML_PARSER_END_TAG;
				5770	ctxt->checkIndex = 0;
				5771	#ifdef DEBUG_PUSH
				5772	xmlGenericError(xmlGenericErrorContext,
				5773	"HPP: entering END_TAG\n");
				5774	#endif
				5775	break;
				5776	}
				5777	} else {
				5778	/*
				5779	* Sometimes DOCTYPE arrives in the middle of the document
				5780	*/
				5781	if ((cur == '<') && (next == '!') &&
				5782	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				5783	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				5784	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				5785	(UPP(8) == 'E')) {
				5786	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5787	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5788	goto done;
				5789	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
				5790	"Misplaced DOCTYPE declaration\n",
				5791	BAD_CAST "DOCTYPE" , NULL);
				5792	htmlParseDocTypeDecl(ctxt);
				5793	} else if ((cur == '<') && (next == '!') &&
				5794	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				5795	if ((!terminate) &&
				5796	(htmlParseLookupSequence(
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5797	ctxt, '-', '-', '>', 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5798	goto done;
				5799	#ifdef DEBUG_PUSH
				5800	xmlGenericError(xmlGenericErrorContext,
				5801	"HPP: Parsing Comment\n");
				5802	#endif
				5803	htmlParseComment(ctxt);
				5804	ctxt->instate = XML_PARSER_CONTENT;
				5805	} else if ((cur == '<') && (next == '?')) {
				5806	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5807	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5808	goto done;
				5809	#ifdef DEBUG_PUSH
				5810	xmlGenericError(xmlGenericErrorContext,
				5811	"HPP: Parsing PI\n");
				5812	#endif
				5813	htmlParsePI(ctxt);
				5814	ctxt->instate = XML_PARSER_CONTENT;
				5815	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				5816	goto done;
				5817	} else if ((cur == '<') && (next == '/')) {
				5818	ctxt->instate = XML_PARSER_END_TAG;
				5819	ctxt->checkIndex = 0;
				5820	#ifdef DEBUG_PUSH
				5821	xmlGenericError(xmlGenericErrorContext,
				5822	"HPP: entering END_TAG\n");
				5823	#endif
				5824	break;
				5825	} else if (cur == '<') {
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5826	if ((!terminate) && (next == 0))
				5827	goto done;
				5828	/*
				5829	* Only switch to START_TAG if the next character
				5830	* starts a valid name. Otherwise, htmlParseStartTag
				5831	* might return without consuming all characters
				5832	* up to the final '>'.
				5833	*/
				5834	if ((IS_ASCII_LETTER(next)) \|\|
				5835	(next == '_') \|\| (next == ':') \|\| (next == '.')) {
				5836	ctxt->instate = XML_PARSER_START_TAG;
				5837	ctxt->checkIndex = 0;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5838	#ifdef DEBUG_PUSH
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5839	xmlGenericError(xmlGenericErrorContext,
				5840	"HPP: entering START_TAG\n");
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5841	#endif
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5842	} else {
				5843	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
				5844	"htmlParseTryOrFinish: "
				5845	"invalid element name\n",
				5846	NULL, NULL);
				5847	htmlCheckParagraph(ctxt);
				5848	if ((ctxt->sax != NULL) &&
				5849	(ctxt->sax->characters != NULL))
				5850	ctxt->sax->characters(ctxt->userData,
				5851	in->cur, 1);
				5852	NEXT;
				5853	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5854	break;
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5855	} else {
				5856	/*
				5857	* check that the text sequence is complete
				5858	* before handing out the data to the parser
				5859	* to avoid problems with erroneous end of
				5860	* data detection.
				5861	*/
				5862	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5863	(htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5864	goto done;
				5865	ctxt->checkIndex = 0;
				5866	#ifdef DEBUG_PUSH
				5867	xmlGenericError(xmlGenericErrorContext,
				5868	"HPP: Parsing char data\n");
				5869	#endif
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5870	while ((cur != '<') && (cur != 0)) {
				5871	if (cur == '&') {
				5872	htmlParseReference(ctxt);
				5873	} else {
				5874	htmlParseCharData(ctxt);
				5875	}
				5876	cur = in->cur[0];
				5877	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5878	}
				5879	}
				5880	if (cons == ctxt->nbChars) {
				5881	if (ctxt->node != NULL) {
				5882	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5883	"detected an error in element content\n",
				5884	NULL, NULL);
				5885	}
				5886	NEXT;
				5887	break;
				5888	}
				5889
				5890	break;
				5891	}
				5892	case XML_PARSER_END_TAG:
				5893	if (avail < 2)
				5894	goto done;
				5895	if ((!terminate) &&
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	5896	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	5897	goto done;
				5898	htmlParseEndTag(ctxt);
				5899	if (ctxt->nameNr == 0) {
				5900	ctxt->instate = XML_PARSER_EPILOG;
				5901	} else {
				5902	ctxt->instate = XML_PARSER_CONTENT;
				5903	}
				5904	ctxt->checkIndex = 0;
				5905	#ifdef DEBUG_PUSH
				5906	xmlGenericError(xmlGenericErrorContext,
				5907	"HPP: entering CONTENT\n");
				5908	#endif
				5909	break;
				5910	case XML_PARSER_CDATA_SECTION:
				5911	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5912	"HPP: internal error, state == CDATA\n",
				5913	NULL, NULL);
				5914	ctxt->instate = XML_PARSER_CONTENT;
				5915	ctxt->checkIndex = 0;
				5916	#ifdef DEBUG_PUSH
				5917	xmlGenericError(xmlGenericErrorContext,
				5918	"HPP: entering CONTENT\n");
				5919	#endif
				5920	break;
				5921	case XML_PARSER_DTD:
				5922	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5923	"HPP: internal error, state == DTD\n",
				5924	NULL, NULL);
				5925	ctxt->instate = XML_PARSER_CONTENT;
				5926	ctxt->checkIndex = 0;
				5927	#ifdef DEBUG_PUSH
				5928	xmlGenericError(xmlGenericErrorContext,
				5929	"HPP: entering CONTENT\n");
				5930	#endif
				5931	break;
				5932	case XML_PARSER_COMMENT:
				5933	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5934	"HPP: internal error, state == COMMENT\n",
				5935	NULL, NULL);
				5936	ctxt->instate = XML_PARSER_CONTENT;
				5937	ctxt->checkIndex = 0;
				5938	#ifdef DEBUG_PUSH
				5939	xmlGenericError(xmlGenericErrorContext,
				5940	"HPP: entering CONTENT\n");
				5941	#endif
				5942	break;
				5943	case XML_PARSER_PI:
				5944	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5945	"HPP: internal error, state == PI\n",
				5946	NULL, NULL);
				5947	ctxt->instate = XML_PARSER_CONTENT;
				5948	ctxt->checkIndex = 0;
				5949	#ifdef DEBUG_PUSH
				5950	xmlGenericError(xmlGenericErrorContext,
				5951	"HPP: entering CONTENT\n");
				5952	#endif
				5953	break;
				5954	case XML_PARSER_ENTITY_DECL:
				5955	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5956	"HPP: internal error, state == ENTITY_DECL\n",
				5957	NULL, NULL);
				5958	ctxt->instate = XML_PARSER_CONTENT;
				5959	ctxt->checkIndex = 0;
				5960	#ifdef DEBUG_PUSH
				5961	xmlGenericError(xmlGenericErrorContext,
				5962	"HPP: entering CONTENT\n");
				5963	#endif
				5964	break;
				5965	case XML_PARSER_ENTITY_VALUE:
				5966	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5967	"HPP: internal error, state == ENTITY_VALUE\n",
				5968	NULL, NULL);
				5969	ctxt->instate = XML_PARSER_CONTENT;
				5970	ctxt->checkIndex = 0;
				5971	#ifdef DEBUG_PUSH
				5972	xmlGenericError(xmlGenericErrorContext,
				5973	"HPP: entering DTD\n");
				5974	#endif
				5975	break;
				5976	case XML_PARSER_ATTRIBUTE_VALUE:
				5977	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5978	"HPP: internal error, state == ATTRIBUTE_VALUE\n",
				5979	NULL, NULL);
				5980	ctxt->instate = XML_PARSER_START_TAG;
				5981	ctxt->checkIndex = 0;
				5982	#ifdef DEBUG_PUSH
				5983	xmlGenericError(xmlGenericErrorContext,
				5984	"HPP: entering START_TAG\n");
				5985	#endif
				5986	break;
				5987	case XML_PARSER_SYSTEM_LITERAL:
				5988	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				5989	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
				5990	NULL, NULL);
				5991	ctxt->instate = XML_PARSER_CONTENT;
				5992	ctxt->checkIndex = 0;
				5993	#ifdef DEBUG_PUSH
				5994	xmlGenericError(xmlGenericErrorContext,
				5995	"HPP: entering CONTENT\n");
				5996	#endif
				5997	break;
				5998	case XML_PARSER_IGNORE:
				5999	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				6000	"HPP: internal error, state == XML_PARSER_IGNORE\n",
				6001	NULL, NULL);
				6002	ctxt->instate = XML_PARSER_CONTENT;
				6003	ctxt->checkIndex = 0;
				6004	#ifdef DEBUG_PUSH
				6005	xmlGenericError(xmlGenericErrorContext,
				6006	"HPP: entering CONTENT\n");
				6007	#endif
				6008	break;
				6009	case XML_PARSER_PUBLIC_LITERAL:
				6010	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				6011	"HPP: internal error, state == XML_PARSER_LITERAL\n",
				6012	NULL, NULL);
				6013	ctxt->instate = XML_PARSER_CONTENT;
				6014	ctxt->checkIndex = 0;
				6015	#ifdef DEBUG_PUSH
				6016	xmlGenericError(xmlGenericErrorContext,
				6017	"HPP: entering CONTENT\n");
				6018	#endif
				6019	break;
				6020
				6021	}
				6022	}
				6023	done:
				6024	if ((avail == 0) && (terminate)) {
				6025	htmlAutoCloseOnEnd(ctxt);
				6026	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				6027	/*
				6028	* SAX: end of the document processing.
				6029	*/
				6030	ctxt->instate = XML_PARSER_EOF;
				6031	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				6032	ctxt->sax->endDocument(ctxt->userData);
				6033	}
				6034	}
				6035	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
				6036	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				6037	(ctxt->instate == XML_PARSER_EPILOG))) {
				6038	xmlDtdPtr dtd;
				6039	dtd = xmlGetIntSubset(ctxt->myDoc);
				6040	if (dtd == NULL)
				6041	ctxt->myDoc->intSubset =
				6042	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
				6043	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				6044	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				6045	}
				6046	#ifdef DEBUG_PUSH
				6047	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				6048	#endif
				6049	return(ret);
				6050	}
				6051
				6052	/**
				6053	* htmlParseChunk:
				6054	* @ctxt: an HTML parser context
				6055	* @chunk: an char array
				6056	* @size: the size in byte of the chunk
				6057	* @terminate: last chunk indicator
				6058	*
				6059	* Parse a Chunk of memory
				6060	*
				6061	* Returns zero if no error, the xmlParserErrors otherwise.
				6062	*/
				6063	int
				6064	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				6065	int terminate) {
				6066	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
				6067	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
				6068	"htmlParseChunk: context error\n", NULL, NULL);
				6069	return(XML_ERR_INTERNAL_ERROR);
				6070	}
				6071	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				6072	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				6073	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
				6074	size_t cur = ctxt->input->cur - ctxt->input->base;
				6075	int res;
				6076
				6077	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	6078	xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6079	if (res < 0) {
				6080	ctxt->errNo = XML_PARSER_EOF;
				6081	ctxt->disableSAX = 1;
				6082	return (XML_PARSER_EOF);
				6083	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6084	#ifdef DEBUG_PUSH
				6085	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				6086	#endif
				6087
				6088	#if 0
				6089	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				6090	htmlParseTryOrFinish(ctxt, terminate);
				6091	#endif
				6092	} else if (ctxt->instate != XML_PARSER_EOF) {
				6093	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
				6094	xmlParserInputBufferPtr in = ctxt->input->buf;
				6095	if ((in->encoder != NULL) && (in->buffer != NULL) &&
				6096	(in->raw != NULL)) {
				6097	int nbchars;
				6098	size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
				6099	size_t current = ctxt->input->cur - ctxt->input->base;
				6100
				6101	nbchars = xmlCharEncInput(in, terminate);
Haibo Huang	cfd91dc	2020-07-30 23:01:33 -0700	[diff] [blame^]	6102	xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6103	if (nbchars < 0) {
				6104	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
				6105	"encoder error\n", NULL, NULL);
				6106	return(XML_ERR_INVALID_ENCODING);
				6107	}
Elliott Hughes	7fbecab	2019-01-10 16:42:03 -0800	[diff] [blame]	6108	}
				6109	}
				6110	}
				6111	htmlParseTryOrFinish(ctxt, terminate);
				6112	if (terminate) {
				6113	if ((ctxt->instate != XML_PARSER_EOF) &&
				6114	(ctxt->instate != XML_PARSER_EPILOG) &&
				6115	(ctxt->instate != XML_PARSER_MISC)) {
				6116	ctxt->errNo = XML_ERR_DOCUMENT_END;
				6117	ctxt->wellFormed = 0;
				6118	}
				6119	if (ctxt->instate != XML_PARSER_EOF) {
				6120	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				6121	ctxt->sax->endDocument(ctxt->userData);
				6122	}
				6123	ctxt->instate = XML_PARSER_EOF;
				6124	}
				6125	return((xmlParserErrors) ctxt->errNo);
				6126	}
				6127
				6128	/************************************************************************
				6129	* *
				6130	* User entry points *
				6131	* *
				6132	************************************************************************/
				6133
				6134	/**
				6135	* htmlCreatePushParserCtxt:
				6136	* @sax: a SAX handler
				6137	* @user_data: The user data returned on SAX callbacks
				6138	* @chunk: a pointer to an array of chars
				6139	* @size: number of chars in the array
				6140	* @filename: an optional file name or URI
				6141	* @enc: an optional encoding
				6142	*
				6143	* Create a parser context for using the HTML parser in push mode
				6144	* The value of @filename is used for fetching external entities
				6145	* and error/warning reports.
				6146	*
				6147	* Returns the new parser context or NULL
				6148	*/
				6149	htmlParserCtxtPtr
				6150	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				6151	const char chunk, int size, const char filename,
				6152	xmlCharEncoding enc) {
				6153	htmlParserCtxtPtr ctxt;
				6154	htmlParserInputPtr inputStream;
				6155	xmlParserInputBufferPtr buf;
				6156
				6157	xmlInitParser();
				6158
				6159	buf = xmlAllocParserInputBuffer(enc);
				6160	if (buf == NULL) return(NULL);
				6161
				6162	ctxt = htmlNewParserCtxt();
				6163	if (ctxt == NULL) {
				6164	xmlFreeParserInputBuffer(buf);
				6165	return(NULL);
				6166	}
				6167	if(enc==XML_CHAR_ENCODING_UTF8 \|\| buf->encoder)
				6168	ctxt->charset=XML_CHAR_ENCODING_UTF8;
				6169	if (sax != NULL) {
				6170	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
				6171	xmlFree(ctxt->sax);
				6172	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				6173	if (ctxt->sax == NULL) {
				6174	xmlFree(buf);
				6175	xmlFree(ctxt);
				6176	return(NULL);
				6177	}
				6178	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				6179	if (user_data != NULL)
				6180	ctxt->userData = user_data;
				6181	}
				6182	if (filename == NULL) {
				6183	ctxt->directory = NULL;
				6184	} else {
				6185	ctxt->directory = xmlParserGetDirectory(filename);
				6186	}
				6187
				6188	inputStream = htmlNewInputStream(ctxt);
				6189	if (inputStream == NULL) {
				6190	xmlFreeParserCtxt(ctxt);
				6191	xmlFree(buf);
				6192	return(NULL);
				6193	}
				6194
				6195	if (filename == NULL)
				6196	inputStream->filename = NULL;
				6197	else
				6198	inputStream->filename = (char *)
				6199	xmlCanonicPath((const xmlChar *) filename);
				6200	inputStream->buf = buf;
				6201	xmlBufResetInput(buf->buffer, inputStream);
				6202
				6203	inputPush(ctxt, inputStream);
				6204
				6205	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				6206	(ctxt->input->buf != NULL)) {
				6207	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
				6208	size_t cur = ctxt->input->cur - ctxt->input->base;
				6209
				6210	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				6211
				6212	xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
				6213	#ifdef DEBUG_PUSH
				6214	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				6215	#endif
				6216	}
				6217	ctxt->progressive = 1;
				6218
				6219	return(ctxt);
				6220	}
				6221	#endif /* LIBXML_PUSH_ENABLED */
				6222
				6223	/**
				6224	* htmlSAXParseDoc:
				6225	* @cur: a pointer to an array of xmlChar
				6226	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6227	* @sax: the SAX handler block
				6228	* @userData: if using SAX, this pointer will be provided on callbacks.
				6229	*
				6230	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
				6231	* to handle parse events. If sax is NULL, fallback to the default DOM
				6232	* behavior and return a tree.
				6233	*
				6234	* Returns the resulting document tree unless SAX is NULL or the document is
				6235	* not well formed.
				6236	*/
				6237
				6238	htmlDocPtr
				6239	htmlSAXParseDoc(const xmlChar cur, const char encoding,
				6240	htmlSAXHandlerPtr sax, void *userData) {
				6241	htmlDocPtr ret;
				6242	htmlParserCtxtPtr ctxt;
				6243
				6244	xmlInitParser();
				6245
				6246	if (cur == NULL) return(NULL);
				6247
				6248
				6249	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				6250	if (ctxt == NULL) return(NULL);
				6251	if (sax != NULL) {
				6252	if (ctxt->sax != NULL) xmlFree (ctxt->sax);
				6253	ctxt->sax = sax;
				6254	ctxt->userData = userData;
				6255	}
				6256
				6257	htmlParseDocument(ctxt);
				6258	ret = ctxt->myDoc;
				6259	if (sax != NULL) {
				6260	ctxt->sax = NULL;
				6261	ctxt->userData = NULL;
				6262	}
				6263	htmlFreeParserCtxt(ctxt);
				6264
				6265	return(ret);
				6266	}
				6267
				6268	/**
				6269	* htmlParseDoc:
				6270	* @cur: a pointer to an array of xmlChar
				6271	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6272	*
				6273	* parse an HTML in-memory document and build a tree.
				6274	*
				6275	* Returns the resulting document tree
				6276	*/
				6277
				6278	htmlDocPtr
				6279	htmlParseDoc(const xmlChar cur, const char encoding) {
				6280	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				6281	}
				6282
				6283
				6284	/**
				6285	* htmlCreateFileParserCtxt:
				6286	* @filename: the filename
				6287	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6288	*
				6289	* Create a parser context for a file content.
				6290	* Automatic support for ZLIB/Compress compressed document is provided
				6291	* by default if found at compile-time.
				6292	*
				6293	* Returns the new parser context or NULL
				6294	*/
				6295	htmlParserCtxtPtr
				6296	htmlCreateFileParserCtxt(const char filename, const char encoding)
				6297	{
				6298	htmlParserCtxtPtr ctxt;
				6299	htmlParserInputPtr inputStream;
				6300	char *canonicFilename;
				6301	/* htmlCharEncoding enc; */
				6302	xmlChar content, content_line = (xmlChar *) "charset=";
				6303
				6304	if (filename == NULL)
				6305	return(NULL);
				6306
				6307	ctxt = htmlNewParserCtxt();
				6308	if (ctxt == NULL) {
				6309	return(NULL);
				6310	}
				6311	canonicFilename = (char ) xmlCanonicPath((const xmlChar ) filename);
				6312	if (canonicFilename == NULL) {
				6313	#ifdef LIBXML_SAX1_ENABLED
				6314	if (xmlDefaultSAXHandler.error != NULL) {
				6315	xmlDefaultSAXHandler.error(NULL, "out of memory\n");
				6316	}
				6317	#endif
				6318	xmlFreeParserCtxt(ctxt);
				6319	return(NULL);
				6320	}
				6321
				6322	inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
				6323	xmlFree(canonicFilename);
				6324	if (inputStream == NULL) {
				6325	xmlFreeParserCtxt(ctxt);
				6326	return(NULL);
				6327	}
				6328
				6329	inputPush(ctxt, inputStream);
				6330
				6331	/* set encoding */
				6332	if (encoding) {
				6333	size_t l = strlen(encoding);
				6334
				6335	if (l < 1000) {
				6336	content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
				6337	if (content) {
				6338	strcpy ((char )content, (char )content_line);
				6339	strcat ((char )content, (char )encoding);
				6340	htmlCheckEncoding (ctxt, content);
				6341	xmlFree (content);
				6342	}
				6343	}
				6344	}
				6345
				6346	return(ctxt);
				6347	}
				6348
				6349	/**
				6350	* htmlSAXParseFile:
				6351	* @filename: the filename
				6352	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6353	* @sax: the SAX handler block
				6354	* @userData: if using SAX, this pointer will be provided on callbacks.
				6355	*
				6356	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				6357	* compressed document is provided by default if found at compile-time.
				6358	* It use the given SAX function block to handle the parsing callback.
				6359	* If sax is NULL, fallback to the default DOM tree building routines.
				6360	*
				6361	* Returns the resulting document tree unless SAX is NULL or the document is
				6362	* not well formed.
				6363	*/
				6364
				6365	htmlDocPtr
				6366	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				6367	void *userData) {
				6368	htmlDocPtr ret;
				6369	htmlParserCtxtPtr ctxt;
				6370	htmlSAXHandlerPtr oldsax = NULL;
				6371
				6372	xmlInitParser();
				6373
				6374	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				6375	if (ctxt == NULL) return(NULL);
				6376	if (sax != NULL) {
				6377	oldsax = ctxt->sax;
				6378	ctxt->sax = sax;
				6379	ctxt->userData = userData;
				6380	}
				6381
				6382	htmlParseDocument(ctxt);
				6383
				6384	ret = ctxt->myDoc;
				6385	if (sax != NULL) {
				6386	ctxt->sax = oldsax;
				6387	ctxt->userData = NULL;
				6388	}
				6389	htmlFreeParserCtxt(ctxt);
				6390
				6391	return(ret);
				6392	}
				6393
				6394	/**
				6395	* htmlParseFile:
				6396	* @filename: the filename
				6397	* @encoding: a free form C string describing the HTML document encoding, or NULL
				6398	*
				6399	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				6400	* compressed document is provided by default if found at compile-time.
				6401	*
				6402	* Returns the resulting document tree
				6403	*/
				6404
				6405	htmlDocPtr
				6406	htmlParseFile(const char filename, const char encoding) {
				6407	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				6408	}
				6409
				6410	/**
				6411	* htmlHandleOmittedElem:
				6412	* @val: int 0 or 1
				6413	*
				6414	* Set and return the previous value for handling HTML omitted tags.
				6415	*
				6416	* Returns the last value for 0 for no handling, 1 for auto insertion.
				6417	*/
				6418
				6419	int
				6420	htmlHandleOmittedElem(int val) {
				6421	int old = htmlOmittedDefaultValue;
				6422
				6423	htmlOmittedDefaultValue = val;
				6424	return(old);
				6425	}
				6426
				6427	/**
				6428	* htmlElementAllowedHere:
				6429	* @parent: HTML parent element
				6430	* @elt: HTML element
				6431	*
				6432	* Checks whether an HTML element may be a direct child of a parent element.
				6433	* Note - doesn't check for deprecated elements
				6434	*
				6435	* Returns 1 if allowed; 0 otherwise.
				6436	*/
				6437	int
				6438	htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
				6439	const char** p ;
				6440
				6441	if ( ! elt \|\| ! parent \|\| ! parent->subelts )
				6442	return 0 ;
				6443
				6444	for ( p = parent->subelts; *p; ++p )
				6445	if ( !xmlStrcmp((const xmlChar )p, elt) )
				6446	return 1 ;
				6447
				6448	return 0 ;
				6449	}
				6450	/**
				6451	* htmlElementStatusHere:
				6452	* @parent: HTML parent element
				6453	* @elt: HTML element
				6454	*
				6455	* Checks whether an HTML element may be a direct child of a parent element.
				6456	* and if so whether it is valid or deprecated.
				6457	*
				6458	* Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
				6459	*/
				6460	htmlStatus
				6461	htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
				6462	if ( ! parent \|\| ! elt )
				6463	return HTML_INVALID ;
				6464	if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
				6465	return HTML_INVALID ;
				6466
				6467	return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
				6468	}
				6469	/**
				6470	* htmlAttrAllowed:
				6471	* @elt: HTML element
				6472	* @attr: HTML attribute
				6473	* @legacy: whether to allow deprecated attributes
				6474	*
				6475	* Checks whether an attribute is valid for an element
				6476	* Has full knowledge of Required and Deprecated attributes
				6477	*
				6478	* Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
				6479	*/
				6480	htmlStatus
				6481	htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
				6482	const char** p ;
				6483
				6484	if ( !elt \|\| ! attr )
				6485	return HTML_INVALID ;
				6486
				6487	if ( elt->attrs_req )
				6488	for ( p = elt->attrs_req; *p; ++p)
				6489	if ( !xmlStrcmp((const xmlChar)p, attr) )
				6490	return HTML_REQUIRED ;
				6491
				6492	if ( elt->attrs_opt )
				6493	for ( p = elt->attrs_opt; *p; ++p)
				6494	if ( !xmlStrcmp((const xmlChar)p, attr) )
				6495	return HTML_VALID ;
				6496
				6497	if ( legacy && elt->attrs_depr )
				6498	for ( p = elt->attrs_depr; *p; ++p)
				6499	if ( !xmlStrcmp((const xmlChar)p, attr) )
				6500	return HTML_DEPRECATED ;
				6501
				6502	return HTML_INVALID ;
				6503	}
				6504	/**
				6505	* htmlNodeStatus:
				6506	* @node: an htmlNodePtr in a tree
				6507	* @legacy: whether to allow deprecated elements (YES is faster here
				6508	* for Element nodes)
				6509	*
				6510	* Checks whether the tree node is valid. Experimental (the author
				6511	* only uses the HTML enhancements in a SAX parser)
				6512	*
				6513	* Return: for Element nodes, a return from htmlElementAllowedHere (if
				6514	* legacy allowed) or htmlElementStatusHere (otherwise).
				6515	* for Attribute nodes, a return from htmlAttrAllowed
				6516	* for other nodes, HTML_NA (no checks performed)
				6517	*/
				6518	htmlStatus
				6519	htmlNodeStatus(const htmlNodePtr node, int legacy) {
				6520	if ( ! node )
				6521	return HTML_INVALID ;
				6522
				6523	switch ( node->type ) {
				6524	case XML_ELEMENT_NODE:
				6525	return legacy
				6526	? ( htmlElementAllowedHere (
				6527	htmlTagLookup(node->parent->name) , node->name
				6528	) ? HTML_VALID : HTML_INVALID )
				6529	: htmlElementStatusHere(
				6530	htmlTagLookup(node->parent->name) ,
				6531	htmlTagLookup(node->name) )
				6532	;
				6533	case XML_ATTRIBUTE_NODE:
				6534	return htmlAttrAllowed(
				6535	htmlTagLookup(node->parent->name) , node->name, legacy) ;
				6536	default: return HTML_NA ;
				6537	}
				6538	}
				6539	/************************************************************************
				6540	* *
				6541	* New set (2.6.0) of simpler and more flexible APIs *
				6542	* *
				6543	************************************************************************/
				/**
				 * DICT_FREE:
				 * @str:  a string
				 *
				 * Free a string if it is not owned by the "dict" dictionary in the
				 * current scope. The macro reads a variable named dict from the
				 * calling scope and evaluates @str more than once, so @str must be
				 * free of side effects. It expands to a single statement and must be
				 * followed by a semicolon at the call site.
				 */
				#define DICT_FREE(str)						\
				    do {							\
				        if ((str) && ((!dict) ||				\
				            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
				            xmlFree((char *)(str));				\
				    } while (0)
				6555
				6556	/**
				6557	* htmlCtxtReset:
				6558	* @ctxt: an HTML parser context
				6559	*
				6560	* Reset a parser context
				6561	*/
				6562	void
				6563	htmlCtxtReset(htmlParserCtxtPtr ctxt)
				6564	{
				6565	xmlParserInputPtr input;
				6566	xmlDictPtr dict;
				6567
				6568	if (ctxt == NULL)
				6569	return;
				6570
				6571	xmlInitParser();
				6572	dict = ctxt->dict;
				6573
				6574	while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
				6575	xmlFreeInputStream(input);
				6576	}
				6577	ctxt->inputNr = 0;
				6578	ctxt->input = NULL;
				6579
				6580	ctxt->spaceNr = 0;
				6581	if (ctxt->spaceTab != NULL) {
				6582	ctxt->spaceTab[0] = -1;
				6583	ctxt->space = &ctxt->spaceTab[0];
				6584	} else {
				6585	ctxt->space = NULL;
				6586	}
				6587
				6588
				6589	ctxt->nodeNr = 0;
				6590	ctxt->node = NULL;
				6591
				6592	ctxt->nameNr = 0;
				6593	ctxt->name = NULL;
				6594
				6595	DICT_FREE(ctxt->version);
				6596	ctxt->version = NULL;
				6597	DICT_FREE(ctxt->encoding);
				6598	ctxt->encoding = NULL;
				6599	DICT_FREE(ctxt->directory);
				6600	ctxt->directory = NULL;
				6601	DICT_FREE(ctxt->extSubURI);
				6602	ctxt->extSubURI = NULL;
				6603	DICT_FREE(ctxt->extSubSystem);
				6604	ctxt->extSubSystem = NULL;
				6605	if (ctxt->myDoc != NULL)
				6606	xmlFreeDoc(ctxt->myDoc);
				6607	ctxt->myDoc = NULL;
				6608
				6609	ctxt->standalone = -1;
				6610	ctxt->hasExternalSubset = 0;
				6611	ctxt->hasPErefs = 0;
				6612	ctxt->html = 1;
				6613	ctxt->external = 0;
				6614	ctxt->instate = XML_PARSER_START;
				6615	ctxt->token = 0;
				6616
				6617	ctxt->wellFormed = 1;
				6618	ctxt->nsWellFormed = 1;
				6619	ctxt->disableSAX = 0;
				6620	ctxt->valid = 1;
				6621	ctxt->vctxt.userData = ctxt;
				6622	ctxt->vctxt.error = xmlParserValidityError;
				6623	ctxt->vctxt.warning = xmlParserValidityWarning;
				6624	ctxt->record_info = 0;
				6625	ctxt->nbChars = 0;
				6626	ctxt->checkIndex = 0;
				6627	ctxt->inSubset = 0;
				6628	ctxt->errNo = XML_ERR_OK;
				6629	ctxt->depth = 0;
				6630	ctxt->charset = XML_CHAR_ENCODING_NONE;
				6631	ctxt->catalogs = NULL;
				6632	xmlInitNodeInfoSeq(&ctxt->node_seq);
				6633
				6634	if (ctxt->attsDefault != NULL) {
				6635	xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
				6636	ctxt->attsDefault = NULL;
				6637	}
				6638	if (ctxt->attsSpecial != NULL) {
				6639	xmlHashFree(ctxt->attsSpecial, NULL);
				6640	ctxt->attsSpecial = NULL;
				6641	}
				6642	}
				6643
				6644	/**
				6645	* htmlCtxtUseOptions:
				6646	* @ctxt: an HTML parser context
				6647	* @options: a combination of htmlParserOption(s)
				6648	*
				6649	* Applies the options to the parser context
				6650	*
				6651	* Returns 0 in case of success, the set of unknown or unimplemented options
				6652	* in case of error.
				6653	*/
				6654	int
				6655	htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
				6656	{
				6657	if (ctxt == NULL)
				6658	return(-1);
				6659
				6660	if (options & HTML_PARSE_NOWARNING) {
				6661	ctxt->sax->warning = NULL;
				6662	ctxt->vctxt.warning = NULL;
				6663	options -= XML_PARSE_NOWARNING;
				6664	ctxt->options \|= XML_PARSE_NOWARNING;
				6665	}
				6666	if (options & HTML_PARSE_NOERROR) {
				6667	ctxt->sax->error = NULL;
				6668	ctxt->vctxt.error = NULL;
				6669	ctxt->sax->fatalError = NULL;
				6670	options -= XML_PARSE_NOERROR;
				6671	ctxt->options \|= XML_PARSE_NOERROR;
				6672	}
				6673	if (options & HTML_PARSE_PEDANTIC) {
				6674	ctxt->pedantic = 1;
				6675	options -= XML_PARSE_PEDANTIC;
				6676	ctxt->options \|= XML_PARSE_PEDANTIC;
				6677	} else
				6678	ctxt->pedantic = 0;
				6679	if (options & XML_PARSE_NOBLANKS) {
				6680	ctxt->keepBlanks = 0;
				6681	ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
				6682	options -= XML_PARSE_NOBLANKS;
				6683	ctxt->options \|= XML_PARSE_NOBLANKS;
				6684	} else
				6685	ctxt->keepBlanks = 1;
				6686	if (options & HTML_PARSE_RECOVER) {
				6687	ctxt->recovery = 1;
				6688	options -= HTML_PARSE_RECOVER;
				6689	} else
				6690	ctxt->recovery = 0;
				6691	if (options & HTML_PARSE_COMPACT) {
				6692	ctxt->options \|= HTML_PARSE_COMPACT;
				6693	options -= HTML_PARSE_COMPACT;
				6694	}
				6695	if (options & XML_PARSE_HUGE) {
				6696	ctxt->options \|= XML_PARSE_HUGE;
				6697	options -= XML_PARSE_HUGE;
				6698	}
				6699	if (options & HTML_PARSE_NODEFDTD) {
				6700	ctxt->options \|= HTML_PARSE_NODEFDTD;
				6701	options -= HTML_PARSE_NODEFDTD;
				6702	}
				6703	if (options & HTML_PARSE_IGNORE_ENC) {
				6704	ctxt->options \|= HTML_PARSE_IGNORE_ENC;
				6705	options -= HTML_PARSE_IGNORE_ENC;
				6706	}
				6707	if (options & HTML_PARSE_NOIMPLIED) {
				6708	ctxt->options \|= HTML_PARSE_NOIMPLIED;
				6709	options -= HTML_PARSE_NOIMPLIED;
				6710	}
				6711	ctxt->dictNames = 0;
				6712	return (options);
				6713	}
				6714
				6715	/**
				6716	* htmlDoRead:
				6717	* @ctxt: an HTML parser context
				6718	* @URL: the base URL to use for the document
				6719	* @encoding: the document encoding, or NULL
				6720	* @options: a combination of htmlParserOption(s)
				6721	* @reuse: keep the context for reuse
				6722	*
				6723	* Common front-end for the htmlRead functions
				6724	*
				6725	* Returns the resulting document tree or NULL
				6726	*/
				6727	static htmlDocPtr
				6728	htmlDoRead(htmlParserCtxtPtr ctxt, const char URL, const char encoding,
				6729	int options, int reuse)
				6730	{
				6731	htmlDocPtr ret;
				6732
				6733	htmlCtxtUseOptions(ctxt, options);
				6734	ctxt->html = 1;
				6735	if (encoding != NULL) {
				6736	xmlCharEncodingHandlerPtr hdlr;
				6737
				6738	hdlr = xmlFindCharEncodingHandler(encoding);
				6739	if (hdlr != NULL) {
				6740	xmlSwitchToEncoding(ctxt, hdlr);
				6741	if (ctxt->input->encoding != NULL)
				6742	xmlFree((xmlChar *) ctxt->input->encoding);
				6743	ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
				6744	}
				6745	}
				6746	if ((URL != NULL) && (ctxt->input != NULL) &&
				6747	(ctxt->input->filename == NULL))
				6748	ctxt->input->filename = (char ) xmlStrdup((const xmlChar ) URL);
				6749	htmlParseDocument(ctxt);
				6750	ret = ctxt->myDoc;
				6751	ctxt->myDoc = NULL;
				6752	if (!reuse) {
				6753	if ((ctxt->dictNames) &&
				6754	(ret != NULL) &&
				6755	(ret->dict == ctxt->dict))
				6756	ctxt->dict = NULL;
				6757	xmlFreeParserCtxt(ctxt);
				6758	}
				6759	return (ret);
				6760	}
				6761
				6762	/**
				6763	* htmlReadDoc:
				6764	* @cur: a pointer to a zero terminated string
				6765	* @URL: the base URL to use for the document
				6766	* @encoding: the document encoding, or NULL
				6767	* @options: a combination of htmlParserOption(s)
				6768	*
				6769	* parse an XML in-memory document and build a tree.
				6770	*
				6771	* Returns the resulting document tree
				6772	*/
				6773	htmlDocPtr
				6774	htmlReadDoc(const xmlChar * cur, const char URL, const char encoding, int options)
				6775	{
				6776	htmlParserCtxtPtr ctxt;
				6777
				6778	if (cur == NULL)
				6779	return (NULL);
				6780
				6781	xmlInitParser();
				6782	ctxt = htmlCreateDocParserCtxt(cur, NULL);
				6783	if (ctxt == NULL)
				6784	return (NULL);
				6785	return (htmlDoRead(ctxt, URL, encoding, options, 0));
				6786	}
				6787
				6788	/**
				6789	* htmlReadFile:
				6790	* @filename: a file or URL
				6791	* @encoding: the document encoding, or NULL
				6792	* @options: a combination of htmlParserOption(s)
				6793	*
				6794	* parse an XML file from the filesystem or the network.
				6795	*
				6796	* Returns the resulting document tree
				6797	*/
				6798	htmlDocPtr
				6799	htmlReadFile(const char filename, const char encoding, int options)
				6800	{
				6801	htmlParserCtxtPtr ctxt;
				6802
				6803	xmlInitParser();
				6804	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				6805	if (ctxt == NULL)
				6806	return (NULL);
				6807	return (htmlDoRead(ctxt, NULL, NULL, options, 0));
				6808	}
				6809
				6810	/**
				6811	* htmlReadMemory:
				6812	* @buffer: a pointer to a char array
				6813	* @size: the size of the array
				6814	* @URL: the base URL to use for the document
				6815	* @encoding: the document encoding, or NULL
				6816	* @options: a combination of htmlParserOption(s)
				6817	*
				6818	* parse an XML in-memory document and build a tree.
				6819	*
				6820	* Returns the resulting document tree
				6821	*/
				6822	htmlDocPtr
				6823	htmlReadMemory(const char buffer, int size, const char URL, const char *encoding, int options)
				6824	{
				6825	htmlParserCtxtPtr ctxt;
				6826
				6827	xmlInitParser();
				6828	ctxt = xmlCreateMemoryParserCtxt(buffer, size);
				6829	if (ctxt == NULL)
				6830	return (NULL);
				6831	htmlDefaultSAXHandlerInit();
				6832	if (ctxt->sax != NULL)
				6833	memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
				6834	return (htmlDoRead(ctxt, URL, encoding, options, 0));
				6835	}
				6836
				6837	/**
				6838	* htmlReadFd:
				6839	* @fd: an open file descriptor
				6840	* @URL: the base URL to use for the document
				6841	* @encoding: the document encoding, or NULL
				6842	* @options: a combination of htmlParserOption(s)
				6843	*
				6844	* parse an XML from a file descriptor and build a tree.
				6845	*
				6846	* Returns the resulting document tree
				6847	*/
				6848	htmlDocPtr
				6849	htmlReadFd(int fd, const char URL, const char encoding, int options)
				6850	{
				6851	htmlParserCtxtPtr ctxt;
				6852	xmlParserInputBufferPtr input;
				6853	xmlParserInputPtr stream;
				6854
				6855	if (fd < 0)
				6856	return (NULL);
				6857	xmlInitParser();
				6858
				6859	xmlInitParser();
				6860	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
				6861	if (input == NULL)
				6862	return (NULL);
				6863	ctxt = xmlNewParserCtxt();
				6864	if (ctxt == NULL) {
				6865	xmlFreeParserInputBuffer(input);
				6866	return (NULL);
				6867	}
				6868	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				6869	if (stream == NULL) {
				6870	xmlFreeParserInputBuffer(input);
				6871	xmlFreeParserCtxt(ctxt);
				6872	return (NULL);
				6873	}
				6874	inputPush(ctxt, stream);
				6875	return (htmlDoRead(ctxt, URL, encoding, options, 0));
				6876	}
				6877
				6878	/**
				6879	* htmlReadIO:
				6880	* @ioread: an I/O read function
				6881	* @ioclose: an I/O close function
				6882	* @ioctx: an I/O handler
				6883	* @URL: the base URL to use for the document
				6884	* @encoding: the document encoding, or NULL
				6885	* @options: a combination of htmlParserOption(s)
				6886	*
				6887	* parse an HTML document from I/O functions and source and build a tree.
				6888	*
				6889	* Returns the resulting document tree
				6890	*/
				6891	htmlDocPtr
				6892	htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
				6893	void ioctx, const char URL, const char *encoding, int options)
				6894	{
				6895	htmlParserCtxtPtr ctxt;
				6896	xmlParserInputBufferPtr input;
				6897	xmlParserInputPtr stream;
				6898
				6899	if (ioread == NULL)
				6900	return (NULL);
				6901	xmlInitParser();
				6902
				6903	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
				6904	XML_CHAR_ENCODING_NONE);
				6905	if (input == NULL) {
				6906	if (ioclose != NULL)
				6907	ioclose(ioctx);
				6908	return (NULL);
				6909	}
				6910	ctxt = htmlNewParserCtxt();
				6911	if (ctxt == NULL) {
				6912	xmlFreeParserInputBuffer(input);
				6913	return (NULL);
				6914	}
				6915	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				6916	if (stream == NULL) {
				6917	xmlFreeParserInputBuffer(input);
				6918	xmlFreeParserCtxt(ctxt);
				6919	return (NULL);
				6920	}
				6921	inputPush(ctxt, stream);
				6922	return (htmlDoRead(ctxt, URL, encoding, options, 0));
				6923	}
				6924
				6925	/**
				6926	* htmlCtxtReadDoc:
				6927	* @ctxt: an HTML parser context
				6928	* @cur: a pointer to a zero terminated string
				6929	* @URL: the base URL to use for the document
				6930	* @encoding: the document encoding, or NULL
				6931	* @options: a combination of htmlParserOption(s)
				6932	*
				6933	* parse an XML in-memory document and build a tree.
				6934	* This reuses the existing @ctxt parser context
				6935	*
				6936	* Returns the resulting document tree
				6937	*/
				6938	htmlDocPtr
				6939	htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
				6940	const char URL, const char encoding, int options)
				6941	{
				6942	xmlParserInputPtr stream;
				6943
				6944	if (cur == NULL)
				6945	return (NULL);
				6946	if (ctxt == NULL)
				6947	return (NULL);
				6948	xmlInitParser();
				6949
				6950	htmlCtxtReset(ctxt);
				6951
				6952	stream = xmlNewStringInputStream(ctxt, cur);
				6953	if (stream == NULL) {
				6954	return (NULL);
				6955	}
				6956	inputPush(ctxt, stream);
				6957	return (htmlDoRead(ctxt, URL, encoding, options, 1));
				6958	}
				6959
				6960	/**
				6961	* htmlCtxtReadFile:
				6962	* @ctxt: an HTML parser context
				6963	* @filename: a file or URL
				6964	* @encoding: the document encoding, or NULL
				6965	* @options: a combination of htmlParserOption(s)
				6966	*
				6967	* parse an XML file from the filesystem or the network.
				6968	* This reuses the existing @ctxt parser context
				6969	*
				6970	* Returns the resulting document tree
				6971	*/
				6972	htmlDocPtr
				6973	htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
				6974	const char *encoding, int options)
				6975	{
				6976	xmlParserInputPtr stream;
				6977
				6978	if (filename == NULL)
				6979	return (NULL);
				6980	if (ctxt == NULL)
				6981	return (NULL);
				6982	xmlInitParser();
				6983
				6984	htmlCtxtReset(ctxt);
				6985
				6986	stream = xmlLoadExternalEntity(filename, NULL, ctxt);
				6987	if (stream == NULL) {
				6988	return (NULL);
				6989	}
				6990	inputPush(ctxt, stream);
				6991	return (htmlDoRead(ctxt, NULL, encoding, options, 1));
				6992	}
				6993
				6994	/**
				6995	* htmlCtxtReadMemory:
				6996	* @ctxt: an HTML parser context
				6997	* @buffer: a pointer to a char array
				6998	* @size: the size of the array
				6999	* @URL: the base URL to use for the document
				7000	* @encoding: the document encoding, or NULL
				7001	* @options: a combination of htmlParserOption(s)
				7002	*
				7003	* parse an XML in-memory document and build a tree.
				7004	* This reuses the existing @ctxt parser context
				7005	*
				7006	* Returns the resulting document tree
				7007	*/
				7008	htmlDocPtr
				7009	htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
				7010	const char URL, const char encoding, int options)
				7011	{
				7012	xmlParserInputBufferPtr input;
				7013	xmlParserInputPtr stream;
				7014
				7015	if (ctxt == NULL)
				7016	return (NULL);
				7017	if (buffer == NULL)
				7018	return (NULL);
				7019	xmlInitParser();
				7020
				7021	htmlCtxtReset(ctxt);
				7022
				7023	input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
				7024	if (input == NULL) {
				7025	return(NULL);
				7026	}
				7027
				7028	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				7029	if (stream == NULL) {
				7030	xmlFreeParserInputBuffer(input);
				7031	return(NULL);
				7032	}
				7033
				7034	inputPush(ctxt, stream);
				7035	return (htmlDoRead(ctxt, URL, encoding, options, 1));
				7036	}
				7037
				7038	/**
				7039	* htmlCtxtReadFd:
				7040	* @ctxt: an HTML parser context
				7041	* @fd: an open file descriptor
				7042	* @URL: the base URL to use for the document
				7043	* @encoding: the document encoding, or NULL
				7044	* @options: a combination of htmlParserOption(s)
				7045	*
				7046	* parse an XML from a file descriptor and build a tree.
				7047	* This reuses the existing @ctxt parser context
				7048	*
				7049	* Returns the resulting document tree
				7050	*/
				7051	htmlDocPtr
				7052	htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
				7053	const char URL, const char encoding, int options)
				7054	{
				7055	xmlParserInputBufferPtr input;
				7056	xmlParserInputPtr stream;
				7057
				7058	if (fd < 0)
				7059	return (NULL);
				7060	if (ctxt == NULL)
				7061	return (NULL);
				7062	xmlInitParser();
				7063
				7064	htmlCtxtReset(ctxt);
				7065
				7066
				7067	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
				7068	if (input == NULL)
				7069	return (NULL);
				7070	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				7071	if (stream == NULL) {
				7072	xmlFreeParserInputBuffer(input);
				7073	return (NULL);
				7074	}
				7075	inputPush(ctxt, stream);
				7076	return (htmlDoRead(ctxt, URL, encoding, options, 1));
				7077	}
				7078
				7079	/**
				7080	* htmlCtxtReadIO:
				7081	* @ctxt: an HTML parser context
				7082	* @ioread: an I/O read function
				7083	* @ioclose: an I/O close function
				7084	* @ioctx: an I/O handler
				7085	* @URL: the base URL to use for the document
				7086	* @encoding: the document encoding, or NULL
				7087	* @options: a combination of htmlParserOption(s)
				7088	*
				7089	* parse an HTML document from I/O functions and source and build a tree.
				7090	* This reuses the existing @ctxt parser context
				7091	*
				7092	* Returns the resulting document tree
				7093	*/
				7094	htmlDocPtr
				7095	htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
				7096	xmlInputCloseCallback ioclose, void *ioctx,
				7097	const char *URL,
				7098	const char *encoding, int options)
				7099	{
				7100	xmlParserInputBufferPtr input;
				7101	xmlParserInputPtr stream;
				7102
				7103	if (ioread == NULL)
				7104	return (NULL);
				7105	if (ctxt == NULL)
				7106	return (NULL);
				7107	xmlInitParser();
				7108
				7109	htmlCtxtReset(ctxt);
				7110
				7111	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
				7112	XML_CHAR_ENCODING_NONE);
				7113	if (input == NULL) {
				7114	if (ioclose != NULL)
				7115	ioclose(ioctx);
				7116	return (NULL);
				7117	}
				7118	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
				7119	if (stream == NULL) {
				7120	xmlFreeParserInputBuffer(input);
				7121	return (NULL);
				7122	}
				7123	inputPush(ctxt, stream);
				7124	return (htmlDoRead(ctxt, URL, encoding, options, 1));
				7125	}
				7126
				7127	#define bottom_HTMLparser
				7128	#include "elfgcchack.h"
				7129	#endif /* LIBXML_HTML_ENABLED */