Blame - HTMLparser.c - platform/external/libxml2

blob: a83b669bb5b338fc6e2656cc04d1df58fa7224a6 [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
				6	* Daniel.Veillard@w3.org
				7	*/
				8
				9	#ifdef WIN32
				10	#include "win32config.h"
				11	#else
				12	#include "config.h"
				13	#endif
				14
				15	#include <libxml/xmlversion.h>
				16	#ifdef LIBXML_HTML_ENABLED
				17	#include <stdio.h>
				18	#include <string.h>
				19	#ifdef HAVE_CTYPE_H
				20	#include <ctype.h>
				21	#endif
				22	#ifdef HAVE_STDLIB_H
				23	#include <stdlib.h>
				24	#endif
				25	#ifdef HAVE_SYS_STAT_H
				26	#include <sys/stat.h>
				27	#endif
				28	#ifdef HAVE_FCNTL_H
				29	#include <fcntl.h>
				30	#endif
				31	#ifdef HAVE_UNISTD_H
				32	#include <unistd.h>
				33	#endif
				34	#ifdef HAVE_ZLIB_H
				35	#include <zlib.h>
				36	#endif
				37
				38	#include <libxml/xmlmemory.h>
				39	#include <libxml/tree.h>
				40	#include <libxml/parser.h>
				41	#include <libxml/parserInternals.h>
				42	#include <libxml/xmlerror.h>
				43	#include <libxml/HTMLparser.h>
				44	#include <libxml/entities.h>
				45	#include <libxml/encoding.h>
				46	#include <libxml/valid.h>
				47	#include <libxml/xmlIO.h>
				48
				49	#define HTML_MAX_NAMELEN 1000
				50	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				51	#define HTML_PARSER_BUFFER_SIZE 100
				52
				53	/* #define DEBUG */
				54	/* #define DEBUG_PUSH */
				55
				56	int htmlOmittedDefaultValue = 1;
				57
				58	/************************************************************************
				59	* *
				60	* Parser stacks related functions and macros *
				61	* *
				62	************************************************************************/
				63
				64	/*
				65	* Generic function for accessing stacks in the Parser Context
				66	*/
				67
				68	#define PUSH_AND_POP(scope, type, name) \
				69	scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				70	if (ctxt->name##Nr >= ctxt->name##Max) { \
				71	ctxt->name##Max *= 2; \
				72	ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
				73	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				74	if (ctxt->name##Tab == NULL) { \
				75	xmlGenericError(xmlGenericErrorContext, \
				76	"realloc failed !\n"); \
				77	return(0); \
				78	} \
				79	} \
				80	ctxt->name##Tab[ctxt->name##Nr] = value; \
				81	ctxt->name = value; \
				82	return(ctxt->name##Nr++); \
				83	} \
				84	scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				85	type ret; \
				86	if (ctxt->name##Nr < 0) return(0); \
				87	ctxt->name##Nr--; \
				88	if (ctxt->name##Nr < 0) return(0); \
				89	if (ctxt->name##Nr > 0) \
				90	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				91	else \
				92	ctxt->name = NULL; \
				93	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				94	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				95	return(ret); \
				96	} \
				97
				98	PUSH_AND_POP(extern, xmlNodePtr, node)
				99	PUSH_AND_POP(extern, xmlChar*, name)
				100
/*
 * Macros for accessing the content. These should be used only by the parser,
 * and must not be exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them:
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values, otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare against ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare against ASCII based substrings.
 *   SKIP(n) skips n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding:
 *
 *   CURRENT returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    skips to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   COPY(to) copies one char to *to, incrementing CUR_PTR and to accordingly.
 */
				128
				129	#define UPPER (toupper(*ctxt->input->cur))
				130
				131	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				132
				133	#define NXT(val) ctxt->input->cur[(val)]
				134
				135	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				136
				137	#define CUR_PTR ctxt->input->cur
				138
				139	#define SHRINK xmlParserInputShrink(ctxt->input)
				140
				141	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				142
				143	#define CURRENT ((int) (*ctxt->input->cur))
				144
				145	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				146
				147	/* Inported from XML */
				148
				149	/* #define CUR (ctxt->token ? ctxt->token : (int) (ctxt->input->cur)) /
				150	#define CUR ((int) (*ctxt->input->cur))
				151	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				152
				153	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
				154	#define NXT(val) ctxt->input->cur[(val)]
				155	#define CUR_PTR ctxt->input->cur
				156
				157
				158	#define NEXTL(l) do { \
				159	if (*(ctxt->input->cur) == '\n') { \
				160	ctxt->input->line++; ctxt->input->col = 1; \
				161	} else ctxt->input->col++; \
				162	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				163	} while (0)
				164
				165	/************
				166	\
				167	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				168	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				169	************/
				170
				171	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				172	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				173
				174	#define COPY_BUF(l,b,i,v) \
				175	if (l == 1) b[i++] = (xmlChar) v; \
				176	else i += xmlCopyChar(l,&b[i],v)
				177
				178	/**
				179	* htmlCurrentChar:
				180	* @ctxt: the HTML parser context
				181	* @len: pointer to the length of the char read
				182	*
				183	* The current char value, if using UTF-8 this may actaully span multiple
				184	* bytes in the input buffer. Implement the end of line normalization:
				185	* 2.11 End-of-Line Handling
				186	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				187	* char, then the encoding converter is plugged in automatically.
				188	*
				189	* Returns the current char value and its lenght
				190	*/
				191
				192	int
				193	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				194	if (ctxt->instate == XML_PARSER_EOF)
				195	return(0);
				196
				197	if (ctxt->token != 0) {
				198	*len = 0;
				199	return(ctxt->token);
				200	}
				201	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				202	/*
				203	* We are supposed to handle UTF8, check it's valid
				204	* From rfc2044: encoding of the Unicode values on UTF-8:
				205	*
				206	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				207	* 0000 0000-0000 007F 0xxxxxxx
				208	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				209	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				210	*
				211	* Check for the 0x110000 limit too
				212	*/
				213	const unsigned char *cur = ctxt->input->cur;
				214	unsigned char c;
				215	unsigned int val;
				216
				217	c = *cur;
				218	if (c & 0x80) {
				219	if (cur[1] == 0)
				220	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				221	if ((cur[1] & 0xc0) != 0x80)
				222	goto encoding_error;
				223	if ((c & 0xe0) == 0xe0) {
				224
				225	if (cur[2] == 0)
				226	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				227	if ((cur[2] & 0xc0) != 0x80)
				228	goto encoding_error;
				229	if ((c & 0xf0) == 0xf0) {
				230	if (cur[3] == 0)
				231	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				232	if (((c & 0xf8) != 0xf0) \|\|
				233	((cur[3] & 0xc0) != 0x80))
				234	goto encoding_error;
				235	/* 4-byte code */
				236	*len = 4;
				237	val = (cur[0] & 0x7) << 18;
				238	val \|= (cur[1] & 0x3f) << 12;
				239	val \|= (cur[2] & 0x3f) << 6;
				240	val \|= cur[3] & 0x3f;
				241	} else {
				242	/* 3-byte code */
				243	*len = 3;
				244	val = (cur[0] & 0xf) << 12;
				245	val \|= (cur[1] & 0x3f) << 6;
				246	val \|= cur[2] & 0x3f;
				247	}
				248	} else {
				249	/* 2-byte code */
				250	*len = 2;
				251	val = (cur[0] & 0x1f) << 6;
				252	val \|= cur[1] & 0x3f;
				253	}
				254	if (!IS_CHAR(val)) {
				255	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				256	if ((ctxt->sax != NULL) &&
				257	(ctxt->sax->error != NULL))
				258	ctxt->sax->error(ctxt->userData,
				259	"Char 0x%X out of allowed range\n", val);
				260	ctxt->wellFormed = 0;
				261	ctxt->disableSAX = 1;
				262	}
				263	return(val);
				264	} else {
				265	/* 1-byte code */
				266	*len = 1;
				267	return((int) *ctxt->input->cur);
				268	}
				269	}
				270	/*
				271	* Assume it's a fixed lenght encoding (1) with
				272	* a compatibke encoding for the ASCII set, since
				273	* XML constructs only use < 128 chars
				274	*/
				275	*len = 1;
				276	if ((int) *ctxt->input->cur < 0x80)
				277	return((int) *ctxt->input->cur);
				278
				279	/*
				280	* Humm this is bad, do an automatic flow conversion
				281	*/
				282	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				283	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				284	return(xmlCurrentChar(ctxt, len));
				285
				286	encoding_error:
				287	/*
				288	* If we detect an UTF8 error that probably mean that the
				289	* input encoding didn't get properly advertized in the
				290	* declaration header. Report the error and switch the encoding
				291	* to ISO-Latin-1 (if you don't like this policy, just declare the
				292	* encoding !)
				293	*/
				294	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				295	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				296	ctxt->sax->error(ctxt->userData,
				297	"Input is not proper UTF-8, indicate encoding !\n");
				298	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				299	ctxt->input->cur[0], ctxt->input->cur[1],
				300	ctxt->input->cur[2], ctxt->input->cur[3]);
				301	}
				302
				303	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				304	*len = 1;
				305	return((int) *ctxt->input->cur);
				306	}
				307
				308	/**
				309	* htmlNextChar:
				310	* @ctxt: the HTML parser context
				311	*
				312	* Skip to the next char input char.
				313	*/
				314
				315	void
				316	htmlNextChar(htmlParserCtxtPtr ctxt) {
				317	if (ctxt->instate == XML_PARSER_EOF)
				318	return;
				319	if ((*ctxt->input->cur == 0) &&
				320	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				321	xmlPopInput(ctxt);
				322	} else {
				323	if (*(ctxt->input->cur) == '\n') {
				324	ctxt->input->line++; ctxt->input->col = 1;
				325	} else ctxt->input->col++;
				326	ctxt->input->cur++;
				327	ctxt->nbChars++;
				328	if (*ctxt->input->cur == 0)
				329	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				330	}
				331	}
				332
				333	/**
				334	* htmlSkipBlankChars:
				335	* @ctxt: the HTML parser context
				336	*
				337	* skip all blanks character found at that point in the input streams.
				338	*
				339	* Returns the number of space chars skipped
				340	*/
				341
				342	int
				343	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				344	int res = 0;
				345
				346	while (IS_BLANK(*(ctxt->input->cur))) {
				347	if ((*ctxt->input->cur == 0) &&
				348	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				349	xmlPopInput(ctxt);
				350	} else {
				351	if (*(ctxt->input->cur) == '\n') {
				352	ctxt->input->line++; ctxt->input->col = 1;
				353	} else ctxt->input->col++;
				354	ctxt->input->cur++;
				355	ctxt->nbChars++;
				356	if (*ctxt->input->cur == 0)
				357	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				358	}
				359	res++;
				360	}
				361	return(res);
				362	}
				363
				364
				365
				366	/************************************************************************
				367	* *
				368	* The list of HTML elements and their properties *
				369	* *
				370	************************************************************************/
				371
/*
 * Description of the HTML 4.0 elements, one row per element, sorted by
 * element name. htmlTagLookup() scans this table linearly and compares
 * against the first column, so names are stored in lowercase.
 *
 * Columns, in order:
 *   Name        : the element name
 *   Start Tag   : 1 means the start tag can be omitted
 *   End Tag     : 1 means the end tag can be omitted
 *                 2 means it's forbidden (empty elements)
 *   Save End    : NOTE(review): meaning not visible in this part of the
 *                 file; it is 2 on every empty element and 1 on li and p
 *   Empty       : 1 on the rows whose end tag is forbidden
 *   Depr.       : this element is deprecated
 *   DTD         : 1 means that this element is valid only in the Loose DTD
 *                 2 means that this element is valid only in the Frameset DTD
 *   Description : short human readable description
 */
htmlElemDesc html40ElementTable[] = {
{ "a", 0, 0, 0, 0, 0, 0, "anchor " },
{ "abbr", 0, 0, 0, 0, 0, 0, "abbreviated form" },
{ "acronym", 0, 0, 0, 0, 0, 0, "" },
{ "address", 0, 0, 0, 0, 0, 0, "information on author " },
{ "applet", 0, 0, 0, 0, 1, 1, "java applet " },
{ "area", 0, 2, 2, 1, 0, 0, "client-side image map area " },
{ "b", 0, 0, 0, 0, 0, 0, "bold text style" },
{ "base", 0, 2, 2, 1, 0, 0, "document base uri " },
{ "basefont", 0, 2, 2, 1, 1, 1, "base font size " },
{ "bdo", 0, 0, 0, 0, 0, 0, "i18n bidi over-ride " },
{ "big", 0, 0, 0, 0, 0, 0, "large text style" },
{ "blockquote", 0, 0, 0, 0, 0, 0, "long quotation " },
{ "body", 1, 1, 0, 0, 0, 0, "document body " },
{ "br", 0, 2, 2, 1, 0, 0, "forced line break " },
{ "button", 0, 0, 0, 0, 0, 0, "push button " },
{ "caption", 0, 0, 0, 0, 0, 0, "table caption " },
{ "center", 0, 0, 0, 0, 1, 1, "shorthand for div align=center " },
{ "cite", 0, 0, 0, 0, 0, 0, "citation" },
{ "code", 0, 0, 0, 0, 0, 0, "computer code fragment" },
{ "col", 0, 2, 2, 1, 0, 0, "table column " },
{ "colgroup", 0, 1, 0, 0, 0, 0, "table column group " },
{ "dd", 0, 1, 0, 0, 0, 0, "definition description " },
{ "del", 0, 0, 0, 0, 0, 0, "deleted text " },
{ "dfn", 0, 0, 0, 0, 0, 0, "instance definition" },
{ "dir", 0, 0, 0, 0, 1, 1, "directory list" },
{ "div", 0, 0, 0, 0, 0, 0, "generic language/style container"},
{ "dl", 0, 0, 0, 0, 0, 0, "definition list " },
{ "dt", 0, 1, 0, 0, 0, 0, "definition term " },
{ "em", 0, 0, 0, 0, 0, 0, "emphasis" },
{ "fieldset", 0, 0, 0, 0, 0, 0, "form control group " },
{ "font", 0, 0, 0, 0, 1, 1, "local change to font " },
{ "form", 0, 0, 0, 0, 0, 0, "interactive form " },
{ "frame", 0, 2, 2, 1, 0, 2, "subwindow " },
{ "frameset", 0, 0, 0, 0, 0, 2, "window subdivision" },
{ "h1", 0, 0, 0, 0, 0, 0, "heading " },
{ "h2", 0, 0, 0, 0, 0, 0, "heading " },
{ "h3", 0, 0, 0, 0, 0, 0, "heading " },
{ "h4", 0, 0, 0, 0, 0, 0, "heading " },
{ "h5", 0, 0, 0, 0, 0, 0, "heading " },
{ "h6", 0, 0, 0, 0, 0, 0, "heading " },
{ "head", 1, 1, 0, 0, 0, 0, "document head " },
{ "hr", 0, 2, 2, 1, 0, 0, "horizontal rule " },
{ "html", 1, 1, 0, 0, 0, 0, "document root element " },
{ "i", 0, 0, 0, 0, 0, 0, "italic text style" },
{ "iframe", 0, 0, 0, 0, 0, 1, "inline subwindow " },
{ "img", 0, 2, 2, 1, 0, 0, "embedded image " },
{ "input", 0, 2, 2, 1, 0, 0, "form control " },
{ "ins", 0, 0, 0, 0, 0, 0, "inserted text" },
{ "isindex", 0, 2, 2, 1, 1, 1, "single line prompt " },
{ "kbd", 0, 0, 0, 0, 0, 0, "text to be entered by the user" },
{ "label", 0, 0, 0, 0, 0, 0, "form field label text " },
{ "legend", 0, 0, 0, 0, 0, 0, "fieldset legend " },
{ "li", 0, 1, 1, 0, 0, 0, "list item " },
{ "link", 0, 2, 2, 1, 0, 0, "a media-independent link " },
{ "map", 0, 0, 0, 0, 0, 0, "client-side image map " },
{ "menu", 0, 0, 0, 0, 1, 1, "menu list " },
{ "meta", 0, 2, 2, 1, 0, 0, "generic metainformation " },
{ "noframes", 0, 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
{ "noscript", 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
{ "object", 0, 0, 0, 0, 0, 0, "generic embedded object " },
{ "ol", 0, 0, 0, 0, 0, 0, "ordered list " },
{ "optgroup", 0, 0, 0, 0, 0, 0, "option group " },
{ "option", 0, 1, 0, 0, 0, 0, "selectable choice " },
{ "p", 0, 1, 1, 0, 0, 0, "paragraph " },
{ "param", 0, 2, 2, 1, 0, 0, "named property value " },
{ "pre", 0, 0, 0, 0, 0, 0, "preformatted text " },
{ "q", 0, 0, 0, 0, 0, 0, "short inline quotation " },
{ "s", 0, 0, 0, 0, 1, 1, "strike-through text style" },
{ "samp", 0, 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
{ "script", 0, 0, 0, 0, 0, 0, "script statements " },
{ "select", 0, 0, 0, 0, 0, 0, "option selector " },
{ "small", 0, 0, 0, 0, 0, 0, "small text style" },
{ "span", 0, 0, 0, 0, 0, 0, "generic language/style container " },
{ "strike", 0, 0, 0, 0, 1, 1, "strike-through text" },
{ "strong", 0, 0, 0, 0, 0, 0, "strong emphasis" },
{ "style", 0, 0, 0, 0, 0, 0, "style info " },
{ "sub", 0, 0, 0, 0, 0, 0, "subscript" },
{ "sup", 0, 0, 0, 0, 0, 0, "superscript " },
{ "table", 0, 0, 0, 0, 0, 0, " " },
{ "tbody", 1, 0, 0, 0, 0, 0, "table body " },
{ "td", 0, 0, 0, 0, 0, 0, "table data cell" },
{ "textarea", 0, 0, 0, 0, 0, 0, "multi-line text field " },
{ "tfoot", 0, 1, 0, 0, 0, 0, "table footer " },
{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
{ "title", 0, 0, 0, 0, 0, 0, "document title " },
{ "tr", 0, 1, 0, 0, 0, 0, "table row " },
{ "tt", 0, 0, 0, 0, 0, 0, "teletype or monospaced text style" },
{ "u", 0, 0, 0, 0, 1, 1, "underlined text style" },
{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
{ "var", 0, 0, 0, 0, 0, 0, "instance of a variable or program argument" },
};
				475
/*
 * Start tags that imply the end of a current element.
 *
 * The array is a sequence of NULL-terminated groups (one group per source
 * line) closed by an extra NULL: any tag of a group implies the end of the
 * current element if the type of that element is in the same group.
 */
char *htmlEquEnd[] = {
"dt", "dd", "li", "option", NULL,
"h1", "h2", "h3", "h4", "h5", "h6", NULL,
"ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
NULL
};
/*
 * According to the HTML DTD, HR should be added to the 2nd line above, as
 * it is not allowed within a H1, H2, H3, etc. But we should tolerate that
 * case because many documents contain rules in headings...
 */
				492
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups closed by an extra
 * NULL. In each group the first name is the start tag being opened and the
 * names after it are the currently open elements that this start tag
 * closes implicitly (this is how htmlCheckAutoClose() reads the table).
 * htmlInitAutoClose() records the address of the first entry of each group
 * in htmlStartCloseIndex.
 */
char *htmlStartClose[] = {
"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
"dl", "ul", "ol", "menu", "dir", "address", "pre",
"listing", "xmp", "head", NULL,
"head", "p", NULL,
"title", "p", NULL,
"body", "head", "style", "link", "title", "p", NULL,
"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
"pre", "listing", "xmp", "head", "li", NULL,
"hr", "p", "head", NULL,
"h1", "p", "head", NULL,
"h2", "p", "head", NULL,
"h3", "p", "head", NULL,
"h4", "p", "head", NULL,
"h5", "p", "head", NULL,
"h6", "p", "head", NULL,
"dir", "p", "head", NULL,
"address", "p", "head", "ul", NULL,
"pre", "p", "head", "ul", NULL,
"listing", "p", "head", NULL,
"xmp", "p", "head", NULL,
"blockquote", "p", "head", NULL,
"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
"xmp", "head", NULL,
"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
"head", "dd", NULL,
"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
"head", "dt", NULL,
"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
"listing", "xmp", NULL,
"ol", "p", "head", "ul", NULL,
"menu", "p", "head", "ul", NULL,
"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div", "p", "head", NULL,
"noscript", "p", "head", NULL,
"center", "font", "b", "i", "p", "head", NULL,
"a", "a", NULL,
"caption", "p", NULL,
"colgroup", "caption", "colgroup", "col", "p", NULL,
"col", "caption", "col", "p", NULL,
"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
"listing", "xmp", "a", NULL,
"th", "th", "td", NULL,
"td", "th", "td", "p", NULL,
"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead", "caption", "col", "colgroup", NULL,
"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
"tbody", "p", NULL,
"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
"tfoot", "tbody", "p", NULL,
"optgroup", "option", NULL,
"option", "option", NULL,
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
"pre", "listing", "xmp", "a", NULL,
NULL
};
				552
				553	/*
				554	* The list of HTML elements which are supposed not to have
				555	* CDATA content and where a p element will be implied
				556	*
				557	* TODO: extend that list by reading the HTML SGML DtD on
				558	* implied paragraph
				559	*/
				560	static char *htmlNoContentElements[] = {
				561	"html",
				562	"head",
				563	"body",
				564	NULL
				565	};
				566
				567	/*
				568	* The list of HTML attributes which are of content %Script;
				569	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				570	* it assumes the name starts with 'on'
				571	*/
				572	static char *htmlScriptAttributes[] = {
				573	"onclick",
				574	"ondblclick",
				575	"onmousedown",
				576	"onmouseup",
				577	"onmouseover",
				578	"onmousemove",
				579	"onmouseout",
				580	"onkeypress",
				581	"onkeydown",
				582	"onkeyup",
				583	"onload",
				584	"onunload",
				585	"onfocus",
				586	"onblur",
				587	"onsubmit",
				588	"onrest",
				589	"onchange",
				590	"onselect"
				591	};
				592
				593
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * entry of one group of that table, unused slots are NULL. It is filled
 * once by htmlInitAutoClose(), which then raises the flag below.
 */
static char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
				596
				597	/************************************************************************
				598	* *
				599	* functions to handle HTML specific data *
				600	* *
				601	************************************************************************/
				602
				603	/**
				604	* htmlInitAutoClose:
				605	*
				606	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				607	* This is not reentrant. Call xmlInitParser() once before processing in
				608	* case of use in multithreaded programs.
				609	*/
				610	void
				611	htmlInitAutoClose(void) {
				612	int index, i = 0;
				613
				614	if (htmlStartCloseIndexinitialized) return;
				615
				616	for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
				617	index = 0;
				618	while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
				619	htmlStartCloseIndex[index++] = &htmlStartClose[i];
				620	while (htmlStartClose[i] != NULL) i++;
				621	i++;
				622	}
				623	htmlStartCloseIndexinitialized = 1;
				624	}
				625
				626	/**
				627	* htmlTagLookup:
				628	* @tag: The tag name in lowercase
				629	*
				630	* Lookup the HTML tag in the ElementTable
				631	*
				632	* Returns the related htmlElemDescPtr or NULL if not found.
				633	*/
				634	htmlElemDescPtr
				635	htmlTagLookup(const xmlChar *tag) {
				636	int i;
				637
				638	for (i = 0; i < (sizeof(html40ElementTable) /
				639	sizeof(html40ElementTable[0]));i++) {
				640	if (xmlStrEqual(tag, BAD_CAST html40ElementTable[i].name))
				641	return(&html40ElementTable[i]);
				642	}
				643	return(NULL);
				644	}
				645
				646	/**
				647	* htmlCheckAutoClose:
				648	* @newtag: The new tag name
				649	* @oldtag: The old tag name
				650	*
				651	* Checks wether the new tag is one of the registered valid tags for closing old.
				652	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				653	*
				654	* Returns 0 if no, 1 if yes.
				655	*/
				656	int
				657	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
				658	int i, index;
				659	char **close = NULL;
				660
				661	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				662
				663	/* inefficient, but not a big deal */
				664	for (index = 0; index < 100;index++) {
				665	close = htmlStartCloseIndex[index];
				666	if (close == NULL) return(0);
				667	if (xmlStrEqual(BAD_CAST *close, newtag)) break;
				668	}
				669
				670	i = close - htmlStartClose;
				671	i++;
				672	while (htmlStartClose[i] != NULL) {
				673	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				674	return(1);
				675	}
				676	i++;
				677	}
				678	return(0);
				679	}
				680
				681	/**
				682	* htmlAutoCloseOnClose:
				683	* @ctxt: an HTML parser context
				684	* @newtag: The new tag name
				685	*
				686	* The HTmL DtD allows an ending tag to implicitely close other tags.
				687	*/
				688	void
				689	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				690	htmlElemDescPtr info;
				691	xmlChar *oldname;
				692	int i;
				693
				694	#ifdef DEBUG
				695	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				696	for (i = 0;i < ctxt->nameNr;i++)
				697	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				698	#endif
				699
				700	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				701	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
				702	}
				703	if (i < 0) return;
				704
				705	while (!xmlStrEqual(newtag, ctxt->name)) {
				706	info = htmlTagLookup(ctxt->name);
				707	if ((info == NULL) \|\| (info->endTag == 1)) {
				708	#ifdef DEBUG
				709	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				710	#endif
				711	} else {
				712	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				713	ctxt->sax->error(ctxt->userData,
				714	"Opening and ending tag mismatch: %s and %s\n",
				715	newtag, ctxt->name);
				716	ctxt->wellFormed = 0;
				717	}
				718	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				719	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				720	oldname = htmlnamePop(ctxt);
				721	if (oldname != NULL) {
				722	#ifdef DEBUG
				723	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				724	#endif
				725	xmlFree(oldname);
				726	}
				727	}
				728	}
				729
				730	/**
				731	* htmlAutoClose:
				732	* @ctxt: an HTML parser context
				733	* @newtag: The new tag name or NULL
				734	*
				735	* The HTmL DtD allows a tag to implicitely close other tags.
				736	* The list is kept in htmlStartClose array. This function is
				737	* called when a new tag has been detected and generates the
				738	* appropriates closes if possible/needed.
				739	* If newtag is NULL this mean we are at the end of the resource
				740	* and we should check
				741	*/
				742	void
				743	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				744	xmlChar *oldname;
				745	while ((newtag != NULL) && (ctxt->name != NULL) &&
				746	(htmlCheckAutoClose(newtag, ctxt->name))) {
				747	#ifdef DEBUG
				748	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				749	#endif
				750	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				751	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				752	oldname = htmlnamePop(ctxt);
				753	if (oldname != NULL) {
				754	#ifdef DEBUG
				755	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				756	#endif
				757	xmlFree(oldname);
				758	}
				759	}
				760	if (newtag == NULL) {
				761	htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
				762	htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
				763	htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
				764	}
				765	while ((newtag == NULL) && (ctxt->name != NULL) &&
				766	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				767	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				768	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				769	#ifdef DEBUG
				770	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				771	#endif
				772	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				773	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				774	oldname = htmlnamePop(ctxt);
				775	if (oldname != NULL) {
				776	#ifdef DEBUG
				777	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				778	#endif
				779	xmlFree(oldname);
				780	}
				781	}
				782
				783	}
				784
				785	/**
				786	* htmlAutoCloseTag:
				787	* @doc: the HTML document
				788	* @name: The tag name
				789	* @elem: the HTML element
				790	*
				791	* The HTmL DtD allows a tag to implicitely close other tags.
				792	* The list is kept in htmlStartClose array. This function checks
				793	* if the element or one of it's children would autoclose the
				794	* given tag.
				795	*
				796	* Returns 1 if autoclose, 0 otherwise
				797	*/
				798	int
				799	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				800	htmlNodePtr child;
				801
				802	if (elem == NULL) return(1);
				803	if (xmlStrEqual(name, elem->name)) return(0);
				804	if (htmlCheckAutoClose(elem->name, name)) return(1);
				805	child = elem->children;
				806	while (child != NULL) {
				807	if (htmlAutoCloseTag(doc, name, child)) return(1);
				808	child = child->next;
				809	}
				810	return(0);
				811	}
				812
				813	/**
				814	* htmlIsAutoClosed:
				815	* @doc: the HTML document
				816	* @elem: the HTML element
				817	*
				818	* The HTmL DtD allows a tag to implicitely close other tags.
				819	* The list is kept in htmlStartClose array. This function checks
				820	* if a tag is autoclosed by one of it's child
				821	*
				822	* Returns 1 if autoclosed, 0 otherwise
				823	*/
				824	int
				825	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				826	htmlNodePtr child;
				827
				828	if (elem == NULL) return(1);
				829	child = elem->children;
				830	while (child != NULL) {
				831	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				832	child = child->next;
				833	}
				834	return(0);
				835	}
				836
				837	/**
				838	* htmlCheckImplied:
				839	* @ctxt: an HTML parser context
				840	* @newtag: The new tag name
				841	*
				842	* The HTML DtD allows a tag to exists only implicitely
				843	* called when a new tag has been detected and generates the
				844	* appropriates implicit tags if missing
				845	*/
				846	void
				847	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				848	if (!htmlOmittedDefaultValue)
				849	return;
				850	if (xmlStrEqual(newtag, BAD_CAST"html"))
				851	return;
				852	if (ctxt->nameNr <= 0) {
				853	#ifdef DEBUG
				854	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				855	#endif
				856	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				857	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				858	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				859	}
				860	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				861	return;
				862	if ((ctxt->nameNr <= 1) &&
				863	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				864	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				865	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				866	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				867	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				868	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				869	/*
				870	* dropped OBJECT ... i you put it first BODY will be
				871	* assumed !
				872	*/
				873	#ifdef DEBUG
				874	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				875	#endif
				876	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				877	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				878	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				879	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				880	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				881	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				882	int i;
				883	for (i = 0;i < ctxt->nameNr;i++) {
				884	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				885	return;
				886	}
				887	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				888	return;
				889	}
				890	}
				891
				892	#ifdef DEBUG
				893	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				894	#endif
				895	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				896	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				897	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				898	}
				899	}
				900
				901	/**
				902	* htmlCheckParagraph
				903	* @ctxt: an HTML parser context
				904	*
				905	* Check whether a p element need to be implied before inserting
				906	* characters in the current element.
				907	*
				908	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				909	* in case of error.
				910	*/
				911
				912	int
				913	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				914	const xmlChar *tag;
				915	int i;
				916
				917	if (ctxt == NULL)
				918	return(-1);
				919	tag = ctxt->name;
				920	if (tag == NULL) {
				921	htmlAutoClose(ctxt, BAD_CAST"p");
				922	htmlCheckImplied(ctxt, BAD_CAST"p");
				923	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				924	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				925	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				926	return(1);
				927	}
				928	if (!htmlOmittedDefaultValue)
				929	return(0);
				930	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				931	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				932	#ifdef DEBUG
				933	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				934	#endif
				935	htmlAutoClose(ctxt, BAD_CAST"p");
				936	htmlCheckImplied(ctxt, BAD_CAST"p");
				937	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				938	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				939	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				940	return(1);
				941	}
				942	}
				943	return(0);
				944	}
				945
				946	/**
				947	* htmlIsScriptAttribute:
				948	* @name: an attribute name
				949	*
				950	* Check if an attribute is of content type Script
				951	*
				952	* Returns 1 is the attribute is a script 0 otherwise
				953	*/
				954	int
				955	htmlIsScriptAttribute(const xmlChar *name) {
				956	int i;
				957
				958	if (name == NULL)
				959	return(0);
				960	/*
				961	* all script attributes start with 'on'
				962	*/
				963	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				964	return(0);
				965	for (i = 0;
				966	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				967	i++) {
				968	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				969	return(1);
				970	}
				971	return(0);
				972	}
				973
				974	/************************************************************************
				975	* *
				976	* The list of HTML predefined entities *
				977	* *
				978	************************************************************************/
				979
				980
				981	htmlEntityDesc html40EntitiesTable[] = {
				982	/*
				983	* the 4 absolute ones, plus apostrophe.
				984	*/
				985	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				986	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				987	{ 39, "apos", "single quote" },
				988	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				989	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				990
				991	/*
				992	* A bunch still in the 128-255 range
				993	* Replacing them depend really on the charset used.
				994	*/
				995	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				996	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				997	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				998	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				999	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1000	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1001	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1002	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1003	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1004	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1005	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1006	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1007	{ 172, "not", "not sign, U+00AC ISOnum" },
				1008	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1009	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1010	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1011	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1012	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1013	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1014	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1015	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1016	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1017	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1018	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1019	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1020	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1021	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1022	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1023	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1024	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1025	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1026	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1027	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1028	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1029	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1030	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1031	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1032	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1033	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1034	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1035	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1036	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1037	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1038	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1039	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1040	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1041	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1042	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1043	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1044	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1045	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1046	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1047	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1048	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1049	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1050	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1051	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1052	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1053	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1054	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1055	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1056	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1057	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1058	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1059	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1060	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1061	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1062	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1063	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1064	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1065	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1066	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1067	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1068	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1069	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1070	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1071	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1072	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1073	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1074	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1075	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1076	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1077	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1078	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1079	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1080	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1081	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1082	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1083	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1084	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1085	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1086	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1087	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1088	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1089	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1090	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1091
				1092	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1093	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1094	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1095	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1096	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1097
				1098	/*
				1099	* Anything below should really be kept as entities references
				1100	*/
				1101	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1102
				1103	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1104	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1105
				1106	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1107	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1108	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1109	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1110	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1111	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1112	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1113	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1114	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1115	{ 922, "Kappa","greek capital letter kappa, U+039A" },
				1116	{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
				1117	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1118	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1119	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1120	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1121	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1122	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1123	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1124	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1125	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1126	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1127	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1128	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1129	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1130
				1131	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1132	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1133	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1134	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1135	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1136	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1137	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1138	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1139	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1140	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1141	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1142	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1143	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1144	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1145	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1146	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1147	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1148	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1149	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1150	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1151	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1152	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1153	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1154	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1155	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1156	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1157	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1158	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1159
				1160	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1161	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1162	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1163	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1164	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1165	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1166	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1167	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1168	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1169	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1170	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1171	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1172	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1173	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1174	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1175	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1176	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1177
				1178	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1179	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1180
				1181	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1182
				1183	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1184	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1185
				1186	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1187	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1188
				1189	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1190	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1191
				1192	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1193
				1194	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1195	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1196	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1197	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1198	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1199	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1200	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1201	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1202	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1203	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1204	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1205	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1206	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1207	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1208	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1209	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1210
				1211	{ 8704, "forall","for all, U+2200 ISOtech" },
				1212	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1213	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1214	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1215	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1216	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1217	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1218	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1219	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1220	{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
				1221	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1222	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1223	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1224	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1225	{ 8734, "infin","infinity, U+221E ISOtech" },
				1226	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1227	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1228	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1229	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1230	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1231	{ 8747, "int", "integral, U+222B ISOtech" },
				1232	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1233	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1234	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1235	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1236	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1237	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1238	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1239	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1240	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1241	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1242	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1243	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1244	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1245	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1246	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1247	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1248	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1249	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1250	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1251	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1252	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1253	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1254	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1255	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1256
				1257	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1258	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1259	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1260	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1261
				1262	};
				1263
				1264	/************************************************************************
				1265	* *
				1266	* Commodity functions to handle entities *
				1267	* *
				1268	************************************************************************/
				1269
				1270	/*
				1271	* Macro used to grow the current buffer.
				1272	*/
				1273	#define growBuffer(buffer) { \
				1274	buffer##_size *= 2; \
				1275	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1276	if (buffer == NULL) { \
				1277	perror("realloc failed"); \
				1278	return(NULL); \
				1279	} \
				1280	}
				1281
				1282	/**
				1283	* htmlEntityLookup:
				1284	* @name: the entity name
				1285	*
				1286	* Lookup the given entity in EntitiesTable
				1287	*
				1288	* TODO: the linear scan is really ugly, an hash table is really needed.
				1289	*
				1290	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1291	*/
				1292	htmlEntityDescPtr
				1293	htmlEntityLookup(const xmlChar *name) {
				1294	int i;
				1295
				1296	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1297	sizeof(html40EntitiesTable[0]));i++) {
				1298	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1299	#ifdef DEBUG
				1300	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1301	#endif
				1302	return(&html40EntitiesTable[i]);
				1303	}
				1304	}
				1305	return(NULL);
				1306	}
				1307
				1308	/**
				1309	* htmlEntityValueLookup:
				1310	* @value: the entity's unicode value
				1311	*
				1312	* Lookup the given entity in EntitiesTable
				1313	*
				1314	* TODO: the linear scan is really ugly, an hash table is really needed.
				1315	*
				1316	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1317	*/
				1318	htmlEntityDescPtr
				1319	htmlEntityValueLookup(int value) {
				1320	int i;
				1321	#ifdef DEBUG
				1322	int lv = 0;
				1323	#endif
				1324
				1325	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1326	sizeof(html40EntitiesTable[0]));i++) {
				1327	if ((unsigned int) html40EntitiesTable[i].value >= value) {
				1328	if ((unsigned int) html40EntitiesTable[i].value > value)
				1329	break;
				1330	#ifdef DEBUG
				1331	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1332	#endif
				1333	return(&html40EntitiesTable[i]);
				1334	}
				1335	#ifdef DEBUG
				1336	if (lv > html40EntitiesTable[i].value) {
				1337	xmlGenericError(xmlGenericErrorContext,
				1338	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1339	lv, html40EntitiesTable[i].value);
				1340	}
				1341	lv = html40EntitiesTable[i].value;
				1342	#endif
				1343	}
				1344	return(NULL);
				1345	}
				1346
				1347	/**
				1348	* UTF8ToHtml:
				1349	* @out: a pointer to an array of bytes to store the result
				1350	* @outlen: the length of @out
				1351	* @in: a pointer to an array of UTF-8 chars
				1352	* @inlen: the length of @in
				1353	*
				1354	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1355	* plus HTML entities block of chars out.
				1356	*
				1357	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1358	* The value of @inlen after return is the number of octets consumed
				1359	* as the return value is positive, else unpredictiable.
				1360	* The value of @outlen after return is the number of octets consumed.
				1361	*/
				1362	int
				1363	UTF8ToHtml(unsigned char* out, int *outlen,
				1364	const unsigned char* in, int *inlen) {
				1365	const unsigned char* processed = in;
				1366	const unsigned char* outend;
				1367	const unsigned char* outstart = out;
				1368	const unsigned char* instart = in;
				1369	const unsigned char* inend;
				1370	unsigned int c, d;
				1371	int trailing;
				1372
				1373	if (in == NULL) {
				1374	/*
				1375	* initialization nothing to do
				1376	*/
				1377	*outlen = 0;
				1378	*inlen = 0;
				1379	return(0);
				1380	}
				1381	inend = in + (*inlen);
				1382	outend = out + (*outlen);
				1383	while (in < inend) {
				1384	d = *in++;
				1385	if (d < 0x80) { c= d; trailing= 0; }
				1386	else if (d < 0xC0) {
				1387	/* trailing byte in leading position */
				1388	*outlen = out - outstart;
				1389	*inlen = processed - instart;
				1390	return(-2);
				1391	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1392	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1393	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1394	else {
				1395	/* no chance for this in Ascii */
				1396	*outlen = out - outstart;
				1397	*inlen = processed - instart;
				1398	return(-2);
				1399	}
				1400
				1401	if (inend - in < trailing) {
				1402	break;
				1403	}
				1404
				1405	for ( ; trailing; trailing--) {
				1406	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1407	break;
				1408	c <<= 6;
				1409	c \|= d & 0x3F;
				1410	}
				1411
				1412	/* assertion: c is a single UTF-4 value */
				1413	if (c < 0x80) {
				1414	if (out + 1 >= outend)
				1415	break;
				1416	*out++ = c;
				1417	} else {
				1418	int len;
				1419	htmlEntityDescPtr ent;
				1420
				1421	/*
				1422	* Try to lookup a predefined HTML entity for it
				1423	*/
				1424
				1425	ent = htmlEntityValueLookup(c);
				1426	if (ent == NULL) {
				1427	/* no chance for this in Ascii */
				1428	*outlen = out - outstart;
				1429	*inlen = processed - instart;
				1430	return(-2);
				1431	}
				1432	len = strlen(ent->name);
				1433	if (out + 2 + len >= outend)
				1434	break;
				1435	*out++ = '&';
				1436	memcpy(out, ent->name, len);
				1437	out += len;
				1438	*out++ = ';';
				1439	}
				1440	processed = in;
				1441	}
				1442	*outlen = out - outstart;
				1443	*inlen = processed - instart;
				1444	return(0);
				1445	}
				1446
				1447	/**
				1448	* htmlEncodeEntities:
				1449	* @out: a pointer to an array of bytes to store the result
				1450	* @outlen: the length of @out
				1451	* @in: a pointer to an array of UTF-8 chars
				1452	* @inlen: the length of @in
				1453	* @quoteChar: the quote character to escape (' or ") or zero.
				1454	*
				1455	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1456	* plus HTML entities block of chars out.
				1457	*
				1458	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1459	* The value of @inlen after return is the number of octets consumed
				1460	* as the return value is positive, else unpredictiable.
				1461	* The value of @outlen after return is the number of octets consumed.
				1462	*/
				1463	int
				1464	htmlEncodeEntities(unsigned char* out, int *outlen,
				1465	const unsigned char* in, int *inlen, int quoteChar) {
				1466	const unsigned char* processed = in;
				1467	const unsigned char* outend = out + (*outlen);
				1468	const unsigned char* outstart = out;
				1469	const unsigned char* instart = in;
				1470	const unsigned char* inend = in + (*inlen);
				1471	unsigned int c, d;
				1472	int trailing;
				1473
				1474	while (in < inend) {
				1475	d = *in++;
				1476	if (d < 0x80) { c= d; trailing= 0; }
				1477	else if (d < 0xC0) {
				1478	/* trailing byte in leading position */
				1479	*outlen = out - outstart;
				1480	*inlen = processed - instart;
				1481	return(-2);
				1482	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1483	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1484	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1485	else {
				1486	/* no chance for this in Ascii */
				1487	*outlen = out - outstart;
				1488	*inlen = processed - instart;
				1489	return(-2);
				1490	}
				1491
				1492	if (inend - in < trailing)
				1493	break;
				1494
				1495	while (trailing--) {
				1496	if (((d= *in++) & 0xC0) != 0x80) {
				1497	*outlen = out - outstart;
				1498	*inlen = processed - instart;
				1499	return(-2);
				1500	}
				1501	c <<= 6;
				1502	c \|= d & 0x3F;
				1503	}
				1504
				1505	/* assertion: c is a single UTF-4 value */
				1506	if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
				1507	if (out >= outend)
				1508	break;
				1509	*out++ = c;
				1510	} else {
				1511	htmlEntityDescPtr ent;
				1512	const char *cp;
				1513	char nbuf[16];
				1514	int len;
				1515
				1516	/*
				1517	* Try to lookup a predefined HTML entity for it
				1518	*/
				1519	ent = htmlEntityValueLookup(c);
				1520	if (ent == NULL) {
				1521	sprintf(nbuf, "#%u", c);
				1522	cp = nbuf;
				1523	}
				1524	else
				1525	cp = ent->name;
				1526	len = strlen(cp);
				1527	if (out + 2 + len > outend)
				1528	break;
				1529	*out++ = '&';
				1530	memcpy(out, cp, len);
				1531	out += len;
				1532	*out++ = ';';
				1533	}
				1534	processed = in;
				1535	}
				1536	*outlen = out - outstart;
				1537	*inlen = processed - instart;
				1538	return(0);
				1539	}
				1540
				1541	/**
				1542	* htmlDecodeEntities:
				1543	* @ctxt: the parser context
				1544	* @len: the len to decode (in bytes !), -1 for no size limit
				1545	* @end: an end marker xmlChar, 0 if none
				1546	* @end2: an end marker xmlChar, 0 if none
				1547	* @end3: an end marker xmlChar, 0 if none
				1548	*
				1549	* Subtitute the HTML entities by their value
				1550	*
				1551	* DEPRECATED !!!!
				1552	*
				1553	* Returns A newly allocated string with the substitution done. The caller
				1554	* must deallocate it !
				1555	*/
				1556	xmlChar *
				1557	htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				1558	xmlChar end, xmlChar end2, xmlChar end3) {
				1559	xmlChar *name = NULL;
				1560	xmlChar *buffer = NULL;
				1561	unsigned int buffer_size = 0;
				1562	unsigned int nbchars = 0;
				1563	htmlEntityDescPtr ent;
				1564	unsigned int max = (unsigned int) len;
				1565	int c,l;
				1566
				1567	if (ctxt->depth > 40) {
				1568	ctxt->errNo = XML_ERR_ENTITY_LOOP;
				1569	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1570	ctxt->sax->error(ctxt->userData,
				1571	"Detected entity reference loop\n");
				1572	ctxt->wellFormed = 0;
				1573	ctxt->disableSAX = 1;
				1574	return(NULL);
				1575	}
				1576
				1577	/*
				1578	* allocate a translation buffer.
				1579	*/
				1580	buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
				1581	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1582	if (buffer == NULL) {
				1583	perror("xmlDecodeEntities: malloc failed");
				1584	return(NULL);
				1585	}
				1586
				1587	/*
				1588	* Ok loop until we reach one of the ending char or a size limit.
				1589	*/
				1590	c = CUR_CHAR(l);
				1591	while ((nbchars < max) && (c != end) &&
				1592	(c != end2) && (c != end3)) {
				1593
				1594	if (c == 0) break;
				1595	if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
				1596	int val = htmlParseCharRef(ctxt);
				1597	COPY_BUF(0,buffer,nbchars,val);
				1598	NEXTL(l);
				1599	} else if ((c == '&') && (ctxt->token != '&')) {
				1600	ent = htmlParseEntityRef(ctxt, &name);
				1601	if (name != NULL) {
				1602	if (ent != NULL) {
				1603	int val = ent->value;
				1604	COPY_BUF(0,buffer,nbchars,val);
				1605	NEXTL(l);
				1606	} else {
				1607	const xmlChar *cur = name;
				1608
				1609	buffer[nbchars++] = '&';
				1610	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1611	growBuffer(buffer);
				1612	}
				1613	while (*cur != 0) {
				1614	buffer[nbchars++] = *cur++;
				1615	}
				1616	buffer[nbchars++] = ';';
				1617	}
				1618	}
				1619	} else {
				1620	COPY_BUF(l,buffer,nbchars,c);
				1621	NEXTL(l);
				1622	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1623	growBuffer(buffer);
				1624	}
				1625	}
				1626	c = CUR_CHAR(l);
				1627	}
				1628	buffer[nbchars++] = 0;
				1629	return(buffer);
				1630	}
				1631
				1632	/************************************************************************
				1633	* *
				1634	* Commodity functions to handle streams *
				1635	* *
				1636	************************************************************************/
				1637
				1638	/**
				1639	* htmlFreeInputStream:
				1640	* @input: an htmlParserInputPtr
				1641	*
				1642	* Free up an input stream.
				1643	*/
				1644	void
				1645	htmlFreeInputStream(htmlParserInputPtr input) {
				1646	if (input == NULL) return;
				1647
				1648	if (input->filename != NULL) xmlFree((char *) input->filename);
				1649	if (input->directory != NULL) xmlFree((char *) input->directory);
				1650	if ((input->free != NULL) && (input->base != NULL))
				1651	input->free((xmlChar *) input->base);
				1652	if (input->buf != NULL)
				1653	xmlFreeParserInputBuffer(input->buf);
Daniel Veillard	48b2f89	2001-02-25 16:11:03 +0000	[diff] [blame]	1654	MEM_CLEANUP(input, sizeof(htmlParserInput));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1655	xmlFree(input);
				1656	}
				1657
				1658	/**
				1659	* htmlNewInputStream:
				1660	* @ctxt: an HTML parser context
				1661	*
				1662	* Create a new input stream structure
				1663	* Returns the new input stream or NULL
				1664	*/
				1665	htmlParserInputPtr
				1666	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1667	htmlParserInputPtr input;
				1668
				1669	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1670	if (input == NULL) {
				1671	ctxt->errNo = XML_ERR_NO_MEMORY;
				1672	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1673	ctxt->sax->error(ctxt->userData,
				1674	"malloc: couldn't allocate a new input stream\n");
				1675	return(NULL);
				1676	}
				1677	memset(input, 0, sizeof(htmlParserInput));
				1678	input->filename = NULL;
				1679	input->directory = NULL;
				1680	input->base = NULL;
				1681	input->cur = NULL;
				1682	input->buf = NULL;
				1683	input->line = 1;
				1684	input->col = 1;
				1685	input->buf = NULL;
				1686	input->free = NULL;
				1687	input->version = NULL;
				1688	input->consumed = 0;
				1689	input->length = 0;
				1690	return(input);
				1691	}
				1692
				1693
				1694	/************************************************************************
				1695	* *
				1696	* Commodity functions, cleanup needed ? *
				1697	* *
				1698	************************************************************************/
				1699
				1700	/**
				1701	* areBlanks:
				1702	* @ctxt: an HTML parser context
				1703	* @str: a xmlChar *
				1704	* @len: the size of @str
				1705	*
				1706	* Is this a sequence of blank chars that one can ignore ?
				1707	*
				1708	* Returns 1 if ignorable 0 otherwise.
				1709	*/
				1710
				1711	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				1712	int i;
				1713	xmlNodePtr lastChild;
				1714
				1715	for (i = 0;i < len;i++)
				1716	if (!(IS_BLANK(str[i]))) return(0);
				1717
				1718	if (CUR == 0) return(1);
				1719	if (CUR != '<') return(0);
				1720	if (ctxt->name == NULL)
				1721	return(1);
				1722	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1723	return(1);
				1724	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1725	return(1);
				1726	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1727	return(1);
				1728	if (ctxt->node == NULL) return(0);
				1729	lastChild = xmlGetLastChild(ctxt->node);
				1730	if (lastChild == NULL) {
				1731	if (ctxt->node->content != NULL) return(0);
				1732	} else if (xmlNodeIsText(lastChild)) {
				1733	return(0);
				1734	} else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
				1735	return(0);
				1736	} else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
				1737	return(0);
				1738	} else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
				1739	return(0);
				1740	}
				1741	return(1);
				1742	}
				1743
				1744	/**
				1745	* htmlHandleEntity:
				1746	* @ctxt: an HTML parser context
				1747	* @entity: an XML entity pointer.
				1748	*
				1749	* Default handling of an HTML entity, call the parser with the
				1750	* substitution string
				1751	*/
				1752
				1753	void
				1754	htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
				1755	int len;
				1756
				1757	if (entity->content == NULL) {
				1758	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1759	ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
				1760	entity->name);
				1761	ctxt->wellFormed = 0;
				1762	return;
				1763	}
				1764	len = xmlStrlen(entity->content);
				1765
				1766	/*
				1767	* Just handle the content as a set of chars.
				1768	*/
				1769	htmlCheckParagraph(ctxt);
				1770	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				1771	ctxt->sax->characters(ctxt->userData, entity->content, len);
				1772
				1773	}
				1774
				1775	/**
				1776	* htmlNewDocNoDtD:
				1777	* @URI: URI for the dtd, or NULL
				1778	* @ExternalID: the external ID of the DTD, or NULL
				1779	*
				1780	* Returns a new document, do not intialize the DTD if not provided
				1781	*/
				1782	htmlDocPtr
				1783	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1784	xmlDocPtr cur;
				1785
				1786	/*
				1787	* Allocate a new document and fill the fields.
				1788	*/
				1789	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1790	if (cur == NULL) {
				1791	xmlGenericError(xmlGenericErrorContext,
				1792	"xmlNewDoc : malloc failed\n");
				1793	return(NULL);
				1794	}
				1795	memset(cur, 0, sizeof(xmlDoc));
				1796
				1797	cur->type = XML_HTML_DOCUMENT_NODE;
				1798	cur->version = NULL;
				1799	cur->intSubset = NULL;
				1800	if ((ExternalID != NULL) \|\|
				1801	(URI != NULL))
				1802	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
				1803	cur->doc = cur;
				1804	cur->name = NULL;
				1805	cur->children = NULL;
				1806	cur->extSubset = NULL;
				1807	cur->oldNs = NULL;
				1808	cur->encoding = NULL;
				1809	cur->standalone = 1;
				1810	cur->compression = 0;
				1811	cur->ids = NULL;
				1812	cur->refs = NULL;
				1813	#ifndef XML_WITHOUT_CORBA
				1814	cur->_private = NULL;
				1815	#endif
				1816	return(cur);
				1817	}
				1818
				1819	/**
				1820	* htmlNewDoc:
				1821	* @URI: URI for the dtd, or NULL
				1822	* @ExternalID: the external ID of the DTD, or NULL
				1823	*
				1824	* Returns a new document
				1825	*/
				1826	htmlDocPtr
				1827	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1828	if ((URI == NULL) && (ExternalID == NULL))
				1829	return(htmlNewDocNoDtD(
				1830	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				1831	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
				1832
				1833	return(htmlNewDocNoDtD(URI, ExternalID));
				1834	}
				1835
				1836
				1837	/************************************************************************
				1838	* *
				1839	* The parser itself *
				1840	* Relates to http://www.w3.org/TR/html40 *
				1841	* *
				1842	************************************************************************/
				1843
				1844	/************************************************************************
				1845	* *
				1846	* The parser itself *
				1847	* *
				1848	************************************************************************/
				1849
				1850	/**
				1851	* htmlParseHTMLName:
				1852	* @ctxt: an HTML parser context
				1853	*
				1854	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1855	* since HTML names are not case-sensitive.
				1856	*
				1857	* Returns the Tag Name parsed or NULL
				1858	*/
				1859
				1860	xmlChar *
				1861	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1862	xmlChar *ret = NULL;
				1863	int i = 0;
				1864	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1865
				1866	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1867	(CUR != ':')) return(NULL);
				1868
				1869	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1870	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1871	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1872	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1873	else loc[i] = CUR;
				1874	i++;
				1875
				1876	NEXT;
				1877	}
				1878
				1879	ret = xmlStrndup(loc, i);
				1880
				1881	return(ret);
				1882	}
				1883
				1884	/**
				1885	* htmlParseName:
				1886	* @ctxt: an HTML parser context
				1887	*
				1888	* parse an HTML name, this routine is case sensistive.
				1889	*
				1890	* Returns the Name parsed or NULL
				1891	*/
				1892
				1893	xmlChar *
				1894	htmlParseName(htmlParserCtxtPtr ctxt) {
				1895	xmlChar buf[HTML_MAX_NAMELEN];
				1896	int len = 0;
				1897
				1898	GROW;
				1899	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1900	return(NULL);
				1901	}
				1902
				1903	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1904	(CUR == '.') \|\| (CUR == '-') \|\|
				1905	(CUR == '_') \|\| (CUR == ':') \|\|
				1906	(IS_COMBINING(CUR)) \|\|
				1907	(IS_EXTENDER(CUR))) {
				1908	buf[len++] = CUR;
				1909	NEXT;
				1910	if (len >= HTML_MAX_NAMELEN) {
				1911	xmlGenericError(xmlGenericErrorContext,
				1912	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1913	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1914	(CUR == '.') \|\| (CUR == '-') \|\|
				1915	(CUR == '_') \|\| (CUR == ':') \|\|
				1916	(IS_COMBINING(CUR)) \|\|
				1917	(IS_EXTENDER(CUR)))
				1918	NEXT;
				1919	break;
				1920	}
				1921	}
				1922	return(xmlStrndup(buf, len));
				1923	}
				1924
				1925	/**
				1926	* htmlParseHTMLAttribute:
				1927	* @ctxt: an HTML parser context
				1928	* @stop: a char stop value
				1929	*
				1930	* parse an HTML attribute value till the stop (quote), if
				1931	* stop is 0 then it stops at the first space
				1932	*
				1933	* Returns the attribute parsed or NULL
				1934	*/
				1935
				1936	xmlChar *
				1937	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1938	xmlChar *buffer = NULL;
				1939	int buffer_size = 0;
				1940	xmlChar *out = NULL;
				1941	xmlChar *name = NULL;
				1942
				1943	xmlChar *cur = NULL;
				1944	htmlEntityDescPtr ent;
				1945
				1946	/*
				1947	* allocate a translation buffer.
				1948	*/
				1949	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1950	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1951	if (buffer == NULL) {
				1952	perror("htmlParseHTMLAttribute: malloc failed");
				1953	return(NULL);
				1954	}
				1955	out = buffer;
				1956
				1957	/*
				1958	* Ok loop until we reach one of the ending chars
				1959	*/
				1960	while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
				1961	if ((stop == 0) && (IS_BLANK(CUR))) break;
				1962	if (CUR == '&') {
				1963	if (NXT(1) == '#') {
				1964	unsigned int c;
				1965	int bits;
				1966
				1967	c = htmlParseCharRef(ctxt);
				1968	if (c < 0x80)
				1969	{ *out++ = c; bits= -6; }
				1970	else if (c < 0x800)
				1971	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1972	else if (c < 0x10000)
				1973	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1974	else
				1975	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1976
				1977	for ( ; bits >= 0; bits-= 6) {
				1978	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				1979	}
				1980	} else {
				1981	ent = htmlParseEntityRef(ctxt, &name);
				1982	if (name == NULL) {
				1983	*out++ = '&';
				1984	if (out - buffer > buffer_size - 100) {
				1985	int index = out - buffer;
				1986
				1987	growBuffer(buffer);
				1988	out = &buffer[index];
				1989	}
				1990	} else if (ent == NULL) {
				1991	*out++ = '&';
				1992	cur = name;
				1993	while (*cur != 0) {
				1994	if (out - buffer > buffer_size - 100) {
				1995	int index = out - buffer;
				1996
				1997	growBuffer(buffer);
				1998	out = &buffer[index];
				1999	}
				2000	out++ = cur++;
				2001	}
				2002	xmlFree(name);
				2003	} else {
				2004	unsigned int c;
				2005	int bits;
				2006
				2007	if (out - buffer > buffer_size - 100) {
				2008	int index = out - buffer;
				2009
				2010	growBuffer(buffer);
				2011	out = &buffer[index];
				2012	}
				2013	c = (xmlChar)ent->value;
				2014	if (c < 0x80)
				2015	{ *out++ = c; bits= -6; }
				2016	else if (c < 0x800)
				2017	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2018	else if (c < 0x10000)
				2019	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2020	else
				2021	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2022
				2023	for ( ; bits >= 0; bits-= 6) {
				2024	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2025	}
				2026	xmlFree(name);
				2027	}
				2028	}
				2029	} else {
				2030	unsigned int c;
				2031	int bits, l;
				2032
				2033	if (out - buffer > buffer_size - 100) {
				2034	int index = out - buffer;
				2035
				2036	growBuffer(buffer);
				2037	out = &buffer[index];
				2038	}
				2039	c = CUR_CHAR(l);
				2040	if (c < 0x80)
				2041	{ *out++ = c; bits= -6; }
				2042	else if (c < 0x800)
				2043	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2044	else if (c < 0x10000)
				2045	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2046	else
				2047	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2048
				2049	for ( ; bits >= 0; bits-= 6) {
				2050	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2051	}
				2052	NEXT;
				2053	}
				2054	}
				2055	*out++ = 0;
				2056	return(buffer);
				2057	}
				2058
				2059	/**
				2060	* htmlParseNmtoken:
				2061	* @ctxt: an HTML parser context
				2062	*
				2063	* parse an HTML Nmtoken.
				2064	*
				2065	* Returns the Nmtoken parsed or NULL
				2066	*/
				2067
				2068	xmlChar *
				2069	htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
				2070	xmlChar buf[HTML_MAX_NAMELEN];
				2071	int len = 0;
				2072
				2073	GROW;
				2074	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				2075	(CUR == '.') \|\| (CUR == '-') \|\|
				2076	(CUR == '_') \|\| (CUR == ':') \|\|
				2077	(IS_COMBINING(CUR)) \|\|
				2078	(IS_EXTENDER(CUR))) {
				2079	buf[len++] = CUR;
				2080	NEXT;
				2081	if (len >= HTML_MAX_NAMELEN) {
				2082	xmlGenericError(xmlGenericErrorContext,
				2083	"htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
				2084	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				2085	(CUR == '.') \|\| (CUR == '-') \|\|
				2086	(CUR == '_') \|\| (CUR == ':') \|\|
				2087	(IS_COMBINING(CUR)) \|\|
				2088	(IS_EXTENDER(CUR)))
				2089	NEXT;
				2090	break;
				2091	}
				2092	}
				2093	return(xmlStrndup(buf, len));
				2094	}
				2095
				2096	/**
				2097	* htmlParseEntityRef:
				2098	* @ctxt: an HTML parser context
				2099	* @str: location to store the entity name
				2100	*
				2101	* parse an HTML ENTITY references
				2102	*
				2103	* [68] EntityRef ::= '&' Name ';'
				2104	*
				2105	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2106	* if non-NULL *str will have to be freed by the caller.
				2107	*/
				2108	htmlEntityDescPtr
				2109	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2110	xmlChar *name;
				2111	htmlEntityDescPtr ent = NULL;
				2112	*str = NULL;
				2113
				2114	if (CUR == '&') {
				2115	NEXT;
				2116	name = htmlParseName(ctxt);
				2117	if (name == NULL) {
				2118	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2119	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2120	ctxt->wellFormed = 0;
				2121	} else {
				2122	GROW;
				2123	if (CUR == ';') {
				2124	*str = name;
				2125
				2126	/*
				2127	* Lookup the entity in the table.
				2128	*/
				2129	ent = htmlEntityLookup(name);
				2130	if (ent != NULL) /* OK that's ugly !!! */
				2131	NEXT;
				2132	} else {
				2133	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2134	ctxt->sax->error(ctxt->userData,
				2135	"htmlParseEntityRef: expecting ';'\n");
				2136	*str = name;
				2137	}
				2138	}
				2139	}
				2140	return(ent);
				2141	}
				2142
				2143	/**
				2144	* htmlParseAttValue:
				2145	* @ctxt: an HTML parser context
				2146	*
				2147	* parse a value for an attribute
				2148	* Note: the parser won't do substitution of entities here, this
				2149	* will be handled later in xmlStringGetNodeList, unless it was
				2150	* asked for ctxt->replaceEntities != 0
				2151	*
				2152	* Returns the AttValue parsed or NULL.
				2153	*/
				2154
				2155	xmlChar *
				2156	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2157	xmlChar *ret = NULL;
				2158
				2159	if (CUR == '"') {
				2160	NEXT;
				2161	ret = htmlParseHTMLAttribute(ctxt, '"');
				2162	if (CUR != '"') {
				2163	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2164	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2165	ctxt->wellFormed = 0;
				2166	} else
				2167	NEXT;
				2168	} else if (CUR == '\'') {
				2169	NEXT;
				2170	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2171	if (CUR != '\'') {
				2172	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2173	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2174	ctxt->wellFormed = 0;
				2175	} else
				2176	NEXT;
				2177	} else {
				2178	/*
				2179	* That's an HTMLism, the attribute value may not be quoted
				2180	*/
				2181	ret = htmlParseHTMLAttribute(ctxt, 0);
				2182	if (ret == NULL) {
				2183	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2184	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2185	ctxt->wellFormed = 0;
				2186	}
				2187	}
				2188	return(ret);
				2189	}
				2190
				2191	/**
				2192	* htmlParseSystemLiteral:
				2193	* @ctxt: an HTML parser context
				2194	*
				2195	* parse an HTML Literal
				2196	*
				2197	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2198	*
				2199	* Returns the SystemLiteral parsed or NULL
				2200	*/
				2201
				2202	xmlChar *
				2203	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2204	const xmlChar *q;
				2205	xmlChar *ret = NULL;
				2206
				2207	if (CUR == '"') {
				2208	NEXT;
				2209	q = CUR_PTR;
				2210	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2211	NEXT;
				2212	if (!IS_CHAR(CUR)) {
				2213	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2214	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2215	ctxt->wellFormed = 0;
				2216	} else {
				2217	ret = xmlStrndup(q, CUR_PTR - q);
				2218	NEXT;
				2219	}
				2220	} else if (CUR == '\'') {
				2221	NEXT;
				2222	q = CUR_PTR;
				2223	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2224	NEXT;
				2225	if (!IS_CHAR(CUR)) {
				2226	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2227	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2228	ctxt->wellFormed = 0;
				2229	} else {
				2230	ret = xmlStrndup(q, CUR_PTR - q);
				2231	NEXT;
				2232	}
				2233	} else {
				2234	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2235	ctxt->sax->error(ctxt->userData,
				2236	"SystemLiteral \" or ' expected\n");
				2237	ctxt->wellFormed = 0;
				2238	}
				2239
				2240	return(ret);
				2241	}
				2242
				2243	/**
				2244	* htmlParsePubidLiteral:
				2245	* @ctxt: an HTML parser context
				2246	*
				2247	* parse an HTML public literal
				2248	*
				2249	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2250	*
				2251	* Returns the PubidLiteral parsed or NULL.
				2252	*/
				2253
				2254	xmlChar *
				2255	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2256	const xmlChar *q;
				2257	xmlChar *ret = NULL;
				2258	/*
				2259	* Name ::= (Letter \| '_') (NameChar)*
				2260	*/
				2261	if (CUR == '"') {
				2262	NEXT;
				2263	q = CUR_PTR;
				2264	while (IS_PUBIDCHAR(CUR)) NEXT;
				2265	if (CUR != '"') {
				2266	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2267	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2268	ctxt->wellFormed = 0;
				2269	} else {
				2270	ret = xmlStrndup(q, CUR_PTR - q);
				2271	NEXT;
				2272	}
				2273	} else if (CUR == '\'') {
				2274	NEXT;
				2275	q = CUR_PTR;
				2276	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2277	NEXT;
				2278	if (!IS_LETTER(CUR)) {
				2279	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2280	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2281	ctxt->wellFormed = 0;
				2282	} else {
				2283	ret = xmlStrndup(q, CUR_PTR - q);
				2284	NEXT;
				2285	}
				2286	} else {
				2287	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2288	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2289	ctxt->wellFormed = 0;
				2290	}
				2291
				2292	return(ret);
				2293	}
				2294
				2295	/**
				2296	* htmlParseScript:
				2297	* @ctxt: an HTML parser context
				2298	*
				2299	* parse the content of an HTML SCRIPT or STYLE element
				2300	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2301	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2302	* http://www.w3.org/TR/html4/types.html#type-script
				2303	* http://www.w3.org/TR/html4/types.html#h-6.15
				2304	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2305	*
				2306	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2307	* element and the value of intrinsic event attributes. User agents must
				2308	* not evaluate script data as HTML markup but instead must pass it on as
				2309	* data to a script engine.
				2310	* NOTES:
				2311	* - The content is passed like CDATA
				2312	* - the attributes for style and scripting "onXXX" are also described
				2313	* as CDATA but SGML allows entities references in attributes so their
				2314	* processing is identical as other attributes
				2315	*/
				2316	void
				2317	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2318	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2319	int nbchar = 0;
				2320	xmlChar cur;
				2321
				2322	SHRINK;
				2323	cur = CUR;
				2324	while (IS_CHAR(cur)) {
				2325	if ((cur == '<') && (NXT(1) == '/')) {
				2326	/*
				2327	* One should break here, the specification is clear:
				2328	* Authors should therefore escape "</" within the content.
				2329	* Escape mechanisms are specific to each scripting or
				2330	* style sheet language.
				2331	*/
				2332	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2333	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2334	break; /* while */
				2335	}
				2336	buf[nbchar++] = cur;
				2337	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2338	if (ctxt->sax->cdataBlock!= NULL) {
				2339	/*
				2340	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2341	*/
				2342	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2343	}
				2344	nbchar = 0;
				2345	}
				2346	NEXT;
				2347	cur = CUR;
				2348	}
				2349	if (!(IS_CHAR(cur))) {
				2350	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2351	ctxt->sax->error(ctxt->userData,
				2352	"Invalid char in CDATA 0x%X\n", cur);
				2353	ctxt->wellFormed = 0;
				2354	NEXT;
				2355	}
				2356
				2357	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2358	if (ctxt->sax->cdataBlock!= NULL) {
				2359	/*
				2360	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2361	*/
				2362	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2363	}
				2364	}
				2365	}
				2366
				2367
				2368	/**
				2369	* htmlParseCharData:
				2370	* @ctxt: an HTML parser context
				2371	* @cdata: int indicating whether we are within a CDATA section
				2372	*
				2373	* parse a CharData section.
				2374	* if we are within a CDATA section ']]>' marks an end of section.
				2375	*
				2376	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2377	*/
				2378
				2379	void
				2380	htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
				2381	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2382	int nbchar = 0;
				2383	int cur, l;
				2384
				2385	SHRINK;
				2386	cur = CUR_CHAR(l);
				2387	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2388	((cur != '&') \|\| (ctxt->token == '&')) &&
				2389	(IS_CHAR(cur))) {
				2390	COPY_BUF(l,buf,nbchar,cur);
				2391	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2392	/*
				2393	* Ok the segment is to be consumed as chars.
				2394	*/
				2395	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2396	if (areBlanks(ctxt, buf, nbchar)) {
				2397	if (ctxt->sax->ignorableWhitespace != NULL)
				2398	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2399	buf, nbchar);
				2400	} else {
				2401	htmlCheckParagraph(ctxt);
				2402	if (ctxt->sax->characters != NULL)
				2403	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2404	}
				2405	}
				2406	nbchar = 0;
				2407	}
				2408	NEXTL(l);
				2409	cur = CUR_CHAR(l);
				2410	}
				2411	if (nbchar != 0) {
				2412	/*
				2413	* Ok the segment is to be consumed as chars.
				2414	*/
				2415	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2416	if (areBlanks(ctxt, buf, nbchar)) {
				2417	if (ctxt->sax->ignorableWhitespace != NULL)
				2418	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2419	} else {
				2420	htmlCheckParagraph(ctxt);
				2421	if (ctxt->sax->characters != NULL)
				2422	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2423	}
				2424	}
				2425	}
				2426	}
				2427
				2428	/**
				2429	* htmlParseExternalID:
				2430	* @ctxt: an HTML parser context
				2431	* @publicID: a xmlChar** receiving PubidLiteral
				2432	* @strict: indicate whether we should restrict parsing to only
				2433	* production [75], see NOTE below
				2434	*
				2435	* Parse an External ID or a Public ID
				2436	*
				2437	* NOTE: Productions [75] and [83] interract badly since [75] can generate
				2438	* 'PUBLIC' S PubidLiteral S SystemLiteral
				2439	*
				2440	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2441	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2442	*
				2443	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2444	*
				2445	* Returns the function returns SystemLiteral and in the second
				2446	* case publicID receives PubidLiteral, is strict is off
				2447	* it is possible to return NULL and have publicID set.
				2448	*/
				2449
				2450	xmlChar *
				2451	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
				2452	xmlChar *URI = NULL;
				2453
				2454	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2455	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2456	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2457	SKIP(6);
				2458	if (!IS_BLANK(CUR)) {
				2459	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2460	ctxt->sax->error(ctxt->userData,
				2461	"Space required after 'SYSTEM'\n");
				2462	ctxt->wellFormed = 0;
				2463	}
				2464	SKIP_BLANKS;
				2465	URI = htmlParseSystemLiteral(ctxt);
				2466	if (URI == NULL) {
				2467	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2468	ctxt->sax->error(ctxt->userData,
				2469	"htmlParseExternalID: SYSTEM, no URI\n");
				2470	ctxt->wellFormed = 0;
				2471	}
				2472	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2473	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2474	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2475	SKIP(6);
				2476	if (!IS_BLANK(CUR)) {
				2477	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2478	ctxt->sax->error(ctxt->userData,
				2479	"Space required after 'PUBLIC'\n");
				2480	ctxt->wellFormed = 0;
				2481	}
				2482	SKIP_BLANKS;
				2483	*publicID = htmlParsePubidLiteral(ctxt);
				2484	if (*publicID == NULL) {
				2485	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2486	ctxt->sax->error(ctxt->userData,
				2487	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2488	ctxt->wellFormed = 0;
				2489	}
				2490	SKIP_BLANKS;
				2491	if ((CUR == '"') \|\| (CUR == '\'')) {
				2492	URI = htmlParseSystemLiteral(ctxt);
				2493	}
				2494	}
				2495	return(URI);
				2496	}
				2497
				2498	/**
				2499	* htmlParseComment:
				2500	* @ctxt: an HTML parser context
				2501	*
				2502	* Parse an XML (SGML) comment <!-- .... -->
				2503	*
				2504	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2505	*/
				2506	void
				2507	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2508	xmlChar *buf = NULL;
				2509	int len;
				2510	int size = HTML_PARSER_BUFFER_SIZE;
				2511	int q, ql;
				2512	int r, rl;
				2513	int cur, l;
				2514	xmlParserInputState state;
				2515
				2516	/*
				2517	* Check that there is a comment right here.
				2518	*/
				2519	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2520	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2521
				2522	state = ctxt->instate;
				2523	ctxt->instate = XML_PARSER_COMMENT;
				2524	SHRINK;
				2525	SKIP(4);
				2526	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2527	if (buf == NULL) {
				2528	xmlGenericError(xmlGenericErrorContext,
				2529	"malloc of %d byte failed\n", size);
				2530	ctxt->instate = state;
				2531	return;
				2532	}
				2533	q = CUR_CHAR(ql);
				2534	NEXTL(ql);
				2535	r = CUR_CHAR(rl);
				2536	NEXTL(rl);
				2537	cur = CUR_CHAR(l);
				2538	len = 0;
				2539	while (IS_CHAR(cur) &&
				2540	((cur != '>') \|\|
				2541	(r != '-') \|\| (q != '-'))) {
				2542	if (len + 5 >= size) {
				2543	size *= 2;
				2544	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2545	if (buf == NULL) {
				2546	xmlGenericError(xmlGenericErrorContext,
				2547	"realloc of %d byte failed\n", size);
				2548	ctxt->instate = state;
				2549	return;
				2550	}
				2551	}
				2552	COPY_BUF(ql,buf,len,q);
				2553	q = r;
				2554	ql = rl;
				2555	r = cur;
				2556	rl = l;
				2557	NEXTL(l);
				2558	cur = CUR_CHAR(l);
				2559	if (cur == 0) {
				2560	SHRINK;
				2561	GROW;
				2562	cur = CUR_CHAR(l);
				2563	}
				2564	}
				2565	buf[len] = 0;
				2566	if (!IS_CHAR(cur)) {
				2567	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2568	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2569	ctxt->sax->error(ctxt->userData,
				2570	"Comment not terminated \n<!--%.50s\n", buf);
				2571	ctxt->wellFormed = 0;
				2572	xmlFree(buf);
				2573	} else {
				2574	NEXT;
				2575	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2576	(!ctxt->disableSAX))
				2577	ctxt->sax->comment(ctxt->userData, buf);
				2578	xmlFree(buf);
				2579	}
				2580	ctxt->instate = state;
				2581	}
				2582
				2583	/**
				2584	* htmlParseCharRef:
				2585	* @ctxt: an HTML parser context
				2586	*
				2587	* parse Reference declarations
				2588	*
				2589	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2590	* '&#x' [0-9a-fA-F]+ ';'
				2591	*
				2592	* Returns the value parsed (as an int)
				2593	*/
				2594	int
				2595	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2596	int val = 0;
				2597
				2598	if ((CUR == '&') && (NXT(1) == '#') &&
				2599	(NXT(2) == 'x')) {
				2600	SKIP(3);
				2601	while (CUR != ';') {
				2602	if ((CUR >= '0') && (CUR <= '9'))
				2603	val = val * 16 + (CUR - '0');
				2604	else if ((CUR >= 'a') && (CUR <= 'f'))
				2605	val = val * 16 + (CUR - 'a') + 10;
				2606	else if ((CUR >= 'A') && (CUR <= 'F'))
				2607	val = val * 16 + (CUR - 'A') + 10;
				2608	else {
				2609	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2610	ctxt->sax->error(ctxt->userData,
				2611	"htmlParseCharRef: invalid hexadecimal value\n");
				2612	ctxt->wellFormed = 0;
				2613	return(0);
				2614	}
				2615	NEXT;
				2616	}
				2617	if (CUR == ';')
				2618	NEXT;
				2619	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2620	SKIP(2);
				2621	while (CUR != ';') {
				2622	if ((CUR >= '0') && (CUR <= '9'))
				2623	val = val * 10 + (CUR - '0');
				2624	else {
				2625	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2626	ctxt->sax->error(ctxt->userData,
				2627	"htmlParseCharRef: invalid decimal value\n");
				2628	ctxt->wellFormed = 0;
				2629	return(0);
				2630	}
				2631	NEXT;
				2632	}
				2633	if (CUR == ';')
				2634	NEXT;
				2635	} else {
				2636	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2637	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2638	ctxt->wellFormed = 0;
				2639	}
				2640	/*
				2641	* Check the value IS_CHAR ...
				2642	*/
				2643	if (IS_CHAR(val)) {
				2644	return(val);
				2645	} else {
				2646	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2647	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2648	val);
				2649	ctxt->wellFormed = 0;
				2650	}
				2651	return(0);
				2652	}
				2653
				2654
				2655	/**
				2656	* htmlParseDocTypeDecl :
				2657	* @ctxt: an HTML parser context
				2658	*
				2659	* parse a DOCTYPE declaration
				2660	*
				2661	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2662	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2663	*/
				2664
				2665	void
				2666	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2667	xmlChar *name;
				2668	xmlChar *ExternalID = NULL;
				2669	xmlChar *URI = NULL;
				2670
				2671	/*
				2672	* We know that '<!DOCTYPE' has been detected.
				2673	*/
				2674	SKIP(9);
				2675
				2676	SKIP_BLANKS;
				2677
				2678	/*
				2679	* Parse the DOCTYPE name.
				2680	*/
				2681	name = htmlParseName(ctxt);
				2682	if (name == NULL) {
				2683	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2684	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2685	ctxt->wellFormed = 0;
				2686	}
				2687	/*
				2688	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2689	*/
				2690
				2691	SKIP_BLANKS;
				2692
				2693	/*
				2694	* Check for SystemID and ExternalID
				2695	*/
				2696	URI = htmlParseExternalID(ctxt, &ExternalID, 0);
				2697	SKIP_BLANKS;
				2698
				2699	/*
				2700	* We should be at the end of the DOCTYPE declaration.
				2701	*/
				2702	if (CUR != '>') {
				2703	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2704	ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
				2705	ctxt->wellFormed = 0;
				2706	/* We shouldn't try to resynchronize ... */
				2707	}
				2708	NEXT;
				2709
				2710	/*
				2711	* Create or update the document accordingly to the DOCTYPE
				2712	*/
				2713	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2714	(!ctxt->disableSAX))
				2715	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2716
				2717	/*
				2718	* Cleanup, since we don't use all those identifiers
				2719	*/
				2720	if (URI != NULL) xmlFree(URI);
				2721	if (ExternalID != NULL) xmlFree(ExternalID);
				2722	if (name != NULL) xmlFree(name);
				2723	}
				2724
				2725	/**
				2726	* htmlParseAttribute:
				2727	* @ctxt: an HTML parser context
				2728	* @value: a xmlChar ** used to store the value of the attribute
				2729	*
				2730	* parse an attribute
				2731	*
				2732	* [41] Attribute ::= Name Eq AttValue
				2733	*
				2734	* [25] Eq ::= S? '=' S?
				2735	*
				2736	* With namespace:
				2737	*
				2738	* [NS 11] Attribute ::= QName Eq AttValue
				2739	*
				2740	* Also the case QName == xmlns:??? is handled independently as a namespace
				2741	* definition.
				2742	*
				2743	* Returns the attribute name, and the value in *value.
				2744	*/
				2745
				2746	xmlChar *
				2747	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2748	xmlChar name, val = NULL;
				2749
				2750	*value = NULL;
				2751	name = htmlParseHTMLName(ctxt);
				2752	if (name == NULL) {
				2753	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2754	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2755	ctxt->wellFormed = 0;
				2756	return(NULL);
				2757	}
				2758
				2759	/*
				2760	* read the value
				2761	*/
				2762	SKIP_BLANKS;
				2763	if (CUR == '=') {
				2764	NEXT;
				2765	SKIP_BLANKS;
				2766	val = htmlParseAttValue(ctxt);
				2767	/******
				2768	} else {
				2769	* TODO : some attribute must have values, some may not
				2770	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2771	ctxt->sax->warning(ctxt->userData,
				2772	"No value for attribute %s\n", name); */
				2773	}
				2774
				2775	*value = val;
				2776	return(name);
				2777	}
				2778
				2779	/**
				2780	* htmlCheckEncoding:
				2781	* @ctxt: an HTML parser context
				2782	* @attvalue: the attribute value
				2783	*
				2784	* Checks an http-equiv attribute from a Meta tag to detect
				2785	* the encoding
				2786	* If a new encoding is detected the parser is switched to decode
				2787	* it and pass UTF8
				2788	*/
				2789	void
				2790	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2791	const xmlChar *encoding;
				2792
				2793	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2794	return;
				2795
				2796	/* do not change encoding */
				2797	if (ctxt->input->encoding != NULL)
				2798	return;
				2799
				2800	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2801	if (encoding != NULL) {
				2802	encoding += 8;
				2803	} else {
				2804	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2805	if (encoding != NULL)
				2806	encoding += 9;
				2807	}
				2808	if (encoding != NULL) {
				2809	xmlCharEncoding enc;
				2810	xmlCharEncodingHandlerPtr handler;
				2811
				2812	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2813
				2814	if (ctxt->input->encoding != NULL)
				2815	xmlFree((xmlChar *) ctxt->input->encoding);
				2816	ctxt->input->encoding = xmlStrdup(encoding);
				2817
				2818	enc = xmlParseCharEncoding((const char *) encoding);
				2819	/*
				2820	* registered set of known encodings
				2821	*/
				2822	if (enc != XML_CHAR_ENCODING_ERROR) {
				2823	xmlSwitchEncoding(ctxt, enc);
				2824	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2825	} else {
				2826	/*
				2827	* fallback for unknown encodings
				2828	*/
				2829	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2830	if (handler != NULL) {
				2831	xmlSwitchToEncoding(ctxt, handler);
				2832	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2833	} else {
				2834	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2835	}
				2836	}
				2837
				2838	if ((ctxt->input->buf != NULL) &&
				2839	(ctxt->input->buf->encoder != NULL) &&
				2840	(ctxt->input->buf->raw != NULL) &&
				2841	(ctxt->input->buf->buffer != NULL)) {
				2842	int nbchars;
				2843	int processed;
				2844
				2845	/*
				2846	* convert as much as possible to the parser reading buffer.
				2847	*/
				2848	processed = ctxt->input->cur - ctxt->input->base;
				2849	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2850	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2851	ctxt->input->buf->buffer,
				2852	ctxt->input->buf->raw);
				2853	if (nbchars < 0) {
				2854	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2855	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2856	ctxt->sax->error(ctxt->userData,
				2857	"htmlCheckEncoding: encoder error\n");
				2858	}
				2859	ctxt->input->base =
				2860	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2861	}
				2862	}
				2863	}
				2864
				2865	/**
				2866	* htmlCheckMeta:
				2867	* @ctxt: an HTML parser context
				2868	* @atts: the attributes values
				2869	*
				2870	* Checks an attributes from a Meta tag
				2871	*/
				2872	void
				2873	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2874	int i;
				2875	const xmlChar att, value;
				2876	int http = 0;
				2877	const xmlChar *content = NULL;
				2878
				2879	if ((ctxt == NULL) \|\| (atts == NULL))
				2880	return;
				2881
				2882	i = 0;
				2883	att = atts[i++];
				2884	while (att != NULL) {
				2885	value = atts[i++];
				2886	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2887	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2888	http = 1;
				2889	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2890	content = value;
				2891	att = atts[i++];
				2892	}
				2893	if ((http) && (content != NULL))
				2894	htmlCheckEncoding(ctxt, content);
				2895
				2896	}
				2897
				2898	/**
				2899	* htmlParseStartTag:
				2900	* @ctxt: an HTML parser context
				2901	*
				2902	* parse a start of tag either for rule element or
				2903	* EmptyElement. In both case we don't parse the tag closing chars.
				2904	*
				2905	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2906	*
				2907	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2908	*
				2909	* With namespace:
				2910	*
				2911	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2912	*
				2913	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2914	*
				2915	*/
				2916
				2917	void
				2918	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2919	xmlChar *name;
				2920	xmlChar *attname;
				2921	xmlChar *attvalue;
				2922	const xmlChar **atts = NULL;
				2923	int nbatts = 0;
				2924	int maxatts = 0;
				2925	int meta = 0;
				2926	int i;
				2927
				2928	if (CUR != '<') return;
				2929	NEXT;
				2930
				2931	GROW;
				2932	name = htmlParseHTMLName(ctxt);
				2933	if (name == NULL) {
				2934	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2935	ctxt->sax->error(ctxt->userData,
				2936	"htmlParseStartTag: invalid element name\n");
				2937	ctxt->wellFormed = 0;
				2938	/* Dump the bogus tag like browsers do */
				2939	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2940	NEXT;
				2941	return;
				2942	}
				2943	if (xmlStrEqual(name, BAD_CAST"meta"))
				2944	meta = 1;
				2945
				2946	/*
				2947	* Check for auto-closure of HTML elements.
				2948	*/
				2949	htmlAutoClose(ctxt, name);
				2950
				2951	/*
				2952	* Check for implied HTML elements.
				2953	*/
				2954	htmlCheckImplied(ctxt, name);
				2955
				2956	/*
				2957	* Avoid html at any level > 0, head at any level != 1
				2958	* or any attempt to recurse body
				2959	*/
				2960	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2961	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2962	ctxt->sax->error(ctxt->userData,
				2963	"htmlParseStartTag: misplaced <html> tag\n");
				2964	ctxt->wellFormed = 0;
				2965	xmlFree(name);
				2966	return;
				2967	}
				2968	if ((ctxt->nameNr != 1) &&
				2969	(xmlStrEqual(name, BAD_CAST"head"))) {
				2970	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2971	ctxt->sax->error(ctxt->userData,
				2972	"htmlParseStartTag: misplaced <head> tag\n");
				2973	ctxt->wellFormed = 0;
				2974	xmlFree(name);
				2975	return;
				2976	}
				2977	if (xmlStrEqual(name, BAD_CAST"body")) {
				2978	int i;
				2979	for (i = 0;i < ctxt->nameNr;i++) {
				2980	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				2981	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2982	ctxt->sax->error(ctxt->userData,
				2983	"htmlParseStartTag: misplaced <body> tag\n");
				2984	ctxt->wellFormed = 0;
				2985	xmlFree(name);
				2986	return;
				2987	}
				2988	}
				2989	}
				2990
				2991	/*
				2992	* Now parse the attributes, it ends up with the ending
				2993	*
				2994	* (S Attribute)* S?
				2995	*/
				2996	SKIP_BLANKS;
				2997	while ((IS_CHAR(CUR)) &&
				2998	(CUR != '>') &&
				2999	((CUR != '/') \|\| (NXT(1) != '>'))) {
				3000	long cons = ctxt->nbChars;
				3001
				3002	GROW;
				3003	attname = htmlParseAttribute(ctxt, &attvalue);
				3004	if (attname != NULL) {
				3005
				3006	/*
				3007	* Well formedness requires at most one declaration of an attribute
				3008	*/
				3009	for (i = 0; i < nbatts;i += 2) {
				3010	if (xmlStrEqual(atts[i], attname)) {
				3011	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3012	ctxt->sax->error(ctxt->userData,
				3013	"Attribute %s redefined\n",
				3014	attname);
				3015	ctxt->wellFormed = 0;
				3016	xmlFree(attname);
				3017	if (attvalue != NULL)
				3018	xmlFree(attvalue);
				3019	goto failed;
				3020	}
				3021	}
				3022
				3023	/*
				3024	* Add the pair to atts
				3025	*/
				3026	if (atts == NULL) {
				3027	maxatts = 10;
				3028	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				3029	if (atts == NULL) {
				3030	xmlGenericError(xmlGenericErrorContext,
				3031	"malloc of %ld byte failed\n",
				3032	maxatts * (long)sizeof(xmlChar *));
				3033	if (name != NULL) xmlFree(name);
				3034	return;
				3035	}
				3036	} else if (nbatts + 4 > maxatts) {
				3037	maxatts *= 2;
				3038	atts = (const xmlChar *) xmlRealloc((void ) atts,
				3039	maxatts * sizeof(xmlChar *));
				3040	if (atts == NULL) {
				3041	xmlGenericError(xmlGenericErrorContext,
				3042	"realloc of %ld byte failed\n",
				3043	maxatts * (long)sizeof(xmlChar *));
				3044	if (name != NULL) xmlFree(name);
				3045	return;
				3046	}
				3047	}
				3048	atts[nbatts++] = attname;
				3049	atts[nbatts++] = attvalue;
				3050	atts[nbatts] = NULL;
				3051	atts[nbatts + 1] = NULL;
				3052	}
				3053	else {
				3054	/* Dump the bogus attribute string up to the next blank or
				3055	* the end of the tag. */
				3056	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				3057	&& ((CUR != '/') \|\| (NXT(1) != '>')))
				3058	NEXT;
				3059	}
				3060
				3061	failed:
				3062	SKIP_BLANKS;
				3063	if (cons == ctxt->nbChars) {
				3064	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3065	ctxt->sax->error(ctxt->userData,
				3066	"htmlParseStartTag: problem parsing attributes\n");
				3067	ctxt->wellFormed = 0;
				3068	break;
				3069	}
				3070	}
				3071
				3072	/*
				3073	* Handle specific association to the META tag
				3074	*/
				3075	if (meta)
				3076	htmlCheckMeta(ctxt, atts);
				3077
				3078	/*
				3079	* SAX: Start of Element !
				3080	*/
				3081	htmlnamePush(ctxt, xmlStrdup(name));
				3082	#ifdef DEBUG
				3083	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3084	#endif
				3085	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3086	ctxt->sax->startElement(ctxt->userData, name, atts);
				3087
				3088	if (atts != NULL) {
				3089	for (i = 0;i < nbatts;i++) {
				3090	if (atts[i] != NULL)
				3091	xmlFree((xmlChar *) atts[i]);
				3092	}
				3093	xmlFree((void *) atts);
				3094	}
				3095	if (name != NULL) xmlFree(name);
				3096	}
				3097
				3098	/**
				3099	* htmlParseEndTag:
				3100	* @ctxt: an HTML parser context
				3101	*
				3102	* parse an end of tag
				3103	*
				3104	* [42] ETag ::= '</' Name S? '>'
				3105	*
				3106	* With namespace
				3107	*
				3108	* [NS 9] ETag ::= '</' QName S? '>'
				3109	*/
				3110
				3111	void
				3112	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3113	xmlChar *name;
				3114	xmlChar *oldname;
				3115	int i;
				3116
				3117	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3118	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3119	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3120	ctxt->wellFormed = 0;
				3121	return;
				3122	}
				3123	SKIP(2);
				3124
				3125	name = htmlParseHTMLName(ctxt);
				3126	if (name == NULL) return;
				3127
				3128	/*
				3129	* We should definitely be at the ending "S? '>'" part
				3130	*/
				3131	SKIP_BLANKS;
				3132	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3133	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3134	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3135	ctxt->wellFormed = 0;
				3136	} else
				3137	NEXT;
				3138
				3139	/*
				3140	* If the name read is not one of the element in the parsing stack
				3141	* then return, it's just an error.
				3142	*/
				3143	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3144	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3145	}
				3146	if (i < 0) {
				3147	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3148	ctxt->sax->error(ctxt->userData,
				3149	"Unexpected end tag : %s\n", name);
				3150	xmlFree(name);
				3151	ctxt->wellFormed = 0;
				3152	return;
				3153	}
				3154
				3155
				3156	/*
				3157	* Check for auto-closure of HTML elements.
				3158	*/
				3159
				3160	htmlAutoCloseOnClose(ctxt, name);
				3161
				3162	/*
				3163	* Well formedness constraints, opening and closing must match.
				3164	* With the exception that the autoclose may have popped stuff out
				3165	* of the stack.
				3166	*/
				3167	if (!xmlStrEqual(name, ctxt->name)) {
				3168	#ifdef DEBUG
				3169	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3170	#endif
				3171	if ((ctxt->name != NULL) &&
				3172	(!xmlStrEqual(ctxt->name, name))) {
				3173	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3174	ctxt->sax->error(ctxt->userData,
				3175	"Opening and ending tag mismatch: %s and %s\n",
				3176	name, ctxt->name);
				3177	ctxt->wellFormed = 0;
				3178	}
				3179	}
				3180
				3181	/*
				3182	* SAX: End of Tag
				3183	*/
				3184	oldname = ctxt->name;
				3185	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3186	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3187	ctxt->sax->endElement(ctxt->userData, name);
				3188	oldname = htmlnamePop(ctxt);
				3189	if (oldname != NULL) {
				3190	#ifdef DEBUG
				3191	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3192	#endif
				3193	xmlFree(oldname);
				3194	#ifdef DEBUG
				3195	} else {
				3196	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3197	#endif
				3198	}
				3199	}
				3200
				3201	if (name != NULL)
				3202	xmlFree(name);
				3203
				3204	return;
				3205	}
				3206
				3207
				3208	/**
				3209	* htmlParseReference:
				3210	* @ctxt: an HTML parser context
				3211	*
				3212	* parse and handle entity references in content,
				3213	* this will end-up in a call to character() since this is either a
				3214	* CharRef, or a predefined entity.
				3215	*/
				3216	void
				3217	htmlParseReference(htmlParserCtxtPtr ctxt) {
				3218	htmlEntityDescPtr ent;
				3219	xmlChar out[6];
				3220	xmlChar *name;
				3221	if (CUR != '&') return;
				3222
				3223	if (NXT(1) == '#') {
				3224	unsigned int c;
				3225	int bits, i = 0;
				3226
				3227	c = htmlParseCharRef(ctxt);
				3228	if (c == 0)
				3229	return;
				3230
				3231	if (c < 0x80) { out[i++]= c; bits= -6; }
				3232	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3233	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3234	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3235
				3236	for ( ; bits >= 0; bits-= 6) {
				3237	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3238	}
				3239	out[i] = 0;
				3240
				3241	htmlCheckParagraph(ctxt);
				3242	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3243	ctxt->sax->characters(ctxt->userData, out, i);
				3244	} else {
				3245	ent = htmlParseEntityRef(ctxt, &name);
				3246	if (name == NULL) {
				3247	htmlCheckParagraph(ctxt);
				3248	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3249	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3250	return;
				3251	}
				3252	if ((ent == NULL) \|\| (ent->value <= 0)) {
				3253	htmlCheckParagraph(ctxt);
				3254	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3255	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3256	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3257	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3258	}
				3259	} else {
				3260	unsigned int c;
				3261	int bits, i = 0;
				3262
				3263	c = ent->value;
				3264	if (c < 0x80)
				3265	{ out[i++]= c; bits= -6; }
				3266	else if (c < 0x800)
				3267	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3268	else if (c < 0x10000)
				3269	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3270	else
				3271	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3272
				3273	for ( ; bits >= 0; bits-= 6) {
				3274	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3275	}
				3276	out[i] = 0;
				3277
				3278	htmlCheckParagraph(ctxt);
				3279	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3280	ctxt->sax->characters(ctxt->userData, out, i);
				3281	}
				3282	xmlFree(name);
				3283	}
				3284	}
				3285
				3286	/**
				3287	* htmlParseContent:
				3288	* @ctxt: an HTML parser context
				3289	* @name: the node name
				3290	*
				3291	* Parse a content: comment, sub-element, reference or text.
				3292	*
				3293	*/
				3294
				3295	void
				3296	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3297	xmlChar *currentNode;
				3298	int depth;
				3299
				3300	currentNode = xmlStrdup(ctxt->name);
				3301	depth = ctxt->nameNr;
				3302	while (1) {
				3303	long cons = ctxt->nbChars;
				3304
				3305	GROW;
				3306	/*
				3307	* Our tag or one of it's parent or children is ending.
				3308	*/
				3309	if ((CUR == '<') && (NXT(1) == '/')) {
				3310	htmlParseEndTag(ctxt);
				3311	if (currentNode != NULL) xmlFree(currentNode);
				3312	return;
				3313	}
				3314
				3315	/*
				3316	* Has this node been popped out during parsing of
				3317	* the next element
				3318	*/
				3319	if ((!xmlStrEqual(currentNode, ctxt->name)) &&
				3320	(depth >= ctxt->nameNr)) {
				3321	if (currentNode != NULL) xmlFree(currentNode);
				3322	return;
				3323	}
				3324
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame^]	3325	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3326	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3327	/*
				3328	* Handle SCRIPT/STYLE separately
				3329	*/
				3330	htmlParseScript(ctxt);
				3331	} else {
				3332	/*
				3333	* Sometimes DOCTYPE arrives in the middle of the document
				3334	*/
				3335	if ((CUR == '<') && (NXT(1) == '!') &&
				3336	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3337	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3338	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3339	(UPP(8) == 'E')) {
				3340	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3341	ctxt->sax->error(ctxt->userData,
				3342	"Misplaced DOCTYPE declaration\n");
				3343	ctxt->wellFormed = 0;
				3344	htmlParseDocTypeDecl(ctxt);
				3345	}
				3346
				3347	/*
				3348	* First case : a comment
				3349	*/
				3350	if ((CUR == '<') && (NXT(1) == '!') &&
				3351	(NXT(2) == '-') && (NXT(3) == '-')) {
				3352	htmlParseComment(ctxt);
				3353	}
				3354
				3355	/*
				3356	* Second case : a sub-element.
				3357	*/
				3358	else if (CUR == '<') {
				3359	htmlParseElement(ctxt);
				3360	}
				3361
				3362	/*
				3363	* Third case : a reference. If if has not been resolved,
				3364	* parsing returns it's Name, create the node
				3365	*/
				3366	else if (CUR == '&') {
				3367	htmlParseReference(ctxt);
				3368	}
				3369
				3370	/*
				3371	* Fourth : end of the resource
				3372	*/
				3373	else if (CUR == 0) {
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame^]	3374	int level = ctxt->nodeNr;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3375	htmlAutoClose(ctxt, NULL);
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame^]	3376	if (level == ctxt->nodeNr)
				3377	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3378	}
				3379
				3380	/*
				3381	* Last case, text. Note that References are handled directly.
				3382	*/
				3383	else {
				3384	htmlParseCharData(ctxt, 0);
				3385	}
				3386
				3387	if (cons == ctxt->nbChars) {
				3388	if (ctxt->node != NULL) {
				3389	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3390	ctxt->sax->error(ctxt->userData,
				3391	"detected an error in element content\n");
				3392	ctxt->wellFormed = 0;
				3393	}
				3394	break;
				3395	}
				3396	}
				3397	GROW;
				3398	}
				3399	if (currentNode != NULL) xmlFree(currentNode);
				3400	}
				3401
				3402	/**
				3403	* htmlParseElement:
				3404	* @ctxt: an HTML parser context
				3405	*
				3406	* parse an HTML element, this is highly recursive
				3407	*
				3408	* [39] element ::= EmptyElemTag \| STag content ETag
				3409	*
				3410	* [41] Attribute ::= Name Eq AttValue
				3411	*/
				3412
				3413	void
				3414	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3415	xmlChar *name;
				3416	xmlChar *currentNode = NULL;
				3417	htmlElemDescPtr info;
				3418	htmlParserNodeInfo node_info;
				3419	xmlChar *oldname;
				3420	int depth = ctxt->nameNr;
				3421
				3422	/* Capture start position */
				3423	if (ctxt->record_info) {
				3424	node_info.begin_pos = ctxt->input->consumed +
				3425	(CUR_PTR - ctxt->input->base);
				3426	node_info.begin_line = ctxt->input->line;
				3427	}
				3428
				3429	oldname = xmlStrdup(ctxt->name);
				3430	htmlParseStartTag(ctxt);
				3431	name = ctxt->name;
				3432	#ifdef DEBUG
				3433	if (oldname == NULL)
				3434	xmlGenericError(xmlGenericErrorContext,
				3435	"Start of element %s\n", name);
				3436	else if (name == NULL)
				3437	xmlGenericError(xmlGenericErrorContext,
				3438	"Start of element failed, was %s\n", oldname);
				3439	else
				3440	xmlGenericError(xmlGenericErrorContext,
				3441	"Start of element %s, was %s\n", name, oldname);
				3442	#endif
				3443	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3444	(name == NULL)) {
				3445	if (CUR == '>')
				3446	NEXT;
				3447	if (oldname != NULL)
				3448	xmlFree(oldname);
				3449	return;
				3450	}
				3451	if (oldname != NULL)
				3452	xmlFree(oldname);
				3453
				3454	/*
				3455	* Lookup the info for that element.
				3456	*/
				3457	info = htmlTagLookup(name);
				3458	if (info == NULL) {
				3459	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3460	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3461	name);
				3462	ctxt->wellFormed = 0;
				3463	} else if (info->depr) {
				3464	/***************************
				3465	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3466	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3467	name);
				3468	***************************/
				3469	}
				3470
				3471	/*
				3472	* Check for an Empty Element labelled the XML/SGML way
				3473	*/
				3474	if ((CUR == '/') && (NXT(1) == '>')) {
				3475	SKIP(2);
				3476	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3477	ctxt->sax->endElement(ctxt->userData, name);
				3478	oldname = htmlnamePop(ctxt);
				3479	#ifdef DEBUG
				3480	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3481	#endif
				3482	if (oldname != NULL)
				3483	xmlFree(oldname);
				3484	return;
				3485	}
				3486
				3487	if (CUR == '>') {
				3488	NEXT;
				3489	} else {
				3490	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3491	ctxt->sax->error(ctxt->userData,
				3492	"Couldn't find end of Start Tag %s\n",
				3493	name);
				3494	ctxt->wellFormed = 0;
				3495
				3496	/*
				3497	* end of parsing of this node.
				3498	*/
				3499	if (xmlStrEqual(name, ctxt->name)) {
				3500	nodePop(ctxt);
				3501	oldname = htmlnamePop(ctxt);
				3502	#ifdef DEBUG
				3503	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3504	#endif
				3505	if (oldname != NULL)
				3506	xmlFree(oldname);
				3507	}
				3508
				3509	/*
				3510	* Capture end position and add node
				3511	*/
				3512	if ( currentNode != NULL && ctxt->record_info ) {
				3513	node_info.end_pos = ctxt->input->consumed +
				3514	(CUR_PTR - ctxt->input->base);
				3515	node_info.end_line = ctxt->input->line;
				3516	node_info.node = ctxt->node;
				3517	xmlParserAddNodeInfo(ctxt, &node_info);
				3518	}
				3519	return;
				3520	}
				3521
				3522	/*
				3523	* Check for an Empty Element from DTD definition
				3524	*/
				3525	if ((info != NULL) && (info->empty)) {
				3526	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3527	ctxt->sax->endElement(ctxt->userData, name);
				3528	oldname = htmlnamePop(ctxt);
				3529	#ifdef DEBUG
				3530	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3531	#endif
				3532	if (oldname != NULL)
				3533	xmlFree(oldname);
				3534	return;
				3535	}
				3536
				3537	/*
				3538	* Parse the content of the element:
				3539	*/
				3540	currentNode = xmlStrdup(ctxt->name);
				3541	depth = ctxt->nameNr;
				3542	while (IS_CHAR(CUR)) {
				3543	htmlParseContent(ctxt);
				3544	if (ctxt->nameNr < depth) break;
				3545	}
				3546
				3547	if (!IS_CHAR(CUR)) {
				3548	/************
				3549	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3550	ctxt->sax->error(ctxt->userData,
				3551	"Premature end of data in tag %s\n", currentNode);
				3552	ctxt->wellFormed = 0;
				3553	*************/
				3554
				3555	/*
				3556	* end of parsing of this node.
				3557	*/
				3558	nodePop(ctxt);
				3559	oldname = htmlnamePop(ctxt);
				3560	#ifdef DEBUG
				3561	xmlGenericError(xmlGenericErrorContext,"Premature end of tag %s : popping out %s\n", name, oldname);
				3562	#endif
				3563	if (oldname != NULL)
				3564	xmlFree(oldname);
				3565	if (currentNode != NULL)
				3566	xmlFree(currentNode);
				3567	return;
				3568	}
				3569
				3570	/*
				3571	* Capture end position and add node
				3572	*/
				3573	if ( currentNode != NULL && ctxt->record_info ) {
				3574	node_info.end_pos = ctxt->input->consumed +
				3575	(CUR_PTR - ctxt->input->base);
				3576	node_info.end_line = ctxt->input->line;
				3577	node_info.node = ctxt->node;
				3578	xmlParserAddNodeInfo(ctxt, &node_info);
				3579	}
				3580	if (currentNode != NULL)
				3581	xmlFree(currentNode);
				3582	}
				3583
				3584	/**
				3585	* htmlParseDocument :
				3586	* @ctxt: an HTML parser context
				3587	*
				3588	* parse an HTML document (and build a tree if using the standard SAX
				3589	* interface).
				3590	*
				3591	* Returns 0, -1 in case of error. the parser context is augmented
				3592	* as a result of the parsing.
				3593	*/
				3594
				3595	int
				3596	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3597	xmlDtdPtr dtd;
				3598
				3599	htmlDefaultSAXHandlerInit();
				3600	ctxt->html = 1;
				3601
				3602	GROW;
				3603	/*
				3604	* SAX: beginning of the document processing.
				3605	*/
				3606	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3607	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3608
				3609	/*
				3610	* Wipe out everything which is before the first '<'
				3611	*/
				3612	SKIP_BLANKS;
				3613	if (CUR == 0) {
				3614	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3615	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3616	ctxt->wellFormed = 0;
				3617	}
				3618
				3619	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3620	ctxt->sax->startDocument(ctxt->userData);
				3621
				3622
				3623	/*
				3624	* Parse possible comments before any content
				3625	*/
				3626	while ((CUR == '<') && (NXT(1) == '!') &&
				3627	(NXT(2) == '-') && (NXT(3) == '-')) {
				3628	htmlParseComment(ctxt);
				3629	SKIP_BLANKS;
				3630	}
				3631
				3632
				3633	/*
				3634	* Then possibly doc type declaration(s) and more Misc
				3635	* (doctypedecl Misc*)?
				3636	*/
				3637	if ((CUR == '<') && (NXT(1) == '!') &&
				3638	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3639	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3640	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3641	(UPP(8) == 'E')) {
				3642	htmlParseDocTypeDecl(ctxt);
				3643	}
				3644	SKIP_BLANKS;
				3645
				3646	/*
				3647	* Parse possible comments before any content
				3648	*/
				3649	while ((CUR == '<') && (NXT(1) == '!') &&
				3650	(NXT(2) == '-') && (NXT(3) == '-')) {
				3651	htmlParseComment(ctxt);
				3652	SKIP_BLANKS;
				3653	}
				3654
				3655	/*
				3656	* Time to start parsing the tree itself
				3657	*/
				3658	htmlParseContent(ctxt);
				3659
				3660	/*
				3661	* autoclose
				3662	*/
				3663	if (CUR == 0)
				3664	htmlAutoClose(ctxt, NULL);
				3665
				3666
				3667	/*
				3668	* SAX: end of the document processing.
				3669	*/
				3670	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3671	ctxt->sax->endDocument(ctxt->userData);
				3672
				3673	if (ctxt->myDoc != NULL) {
				3674	dtd = xmlGetIntSubset(ctxt->myDoc);
				3675	if (dtd == NULL)
				3676	ctxt->myDoc->intSubset =
				3677	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3678	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3679	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3680	}
				3681	if (! ctxt->wellFormed) return(-1);
				3682	return(0);
				3683	}
				3684
				3685
				3686	/************************************************************************
				3687	* *
				3688	* Parser contexts handling *
				3689	* *
				3690	************************************************************************/
				3691
				3692	/**
				3693	* xmlInitParserCtxt:
				3694	* @ctxt: an HTML parser context
				3695	*
				3696	* Initialize a parser context
				3697	*/
				3698
				3699	void
				3700	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3701	{
				3702	htmlSAXHandler *sax;
				3703
				3704	if (ctxt == NULL) return;
				3705	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3706
				3707	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3708	if (sax == NULL) {
				3709	xmlGenericError(xmlGenericErrorContext,
				3710	"htmlInitParserCtxt: out of memory\n");
				3711	}
				3712	else
				3713	memset(sax, 0, sizeof(htmlSAXHandler));
				3714
				3715	/* Allocate the Input stack */
				3716	ctxt->inputTab = (htmlParserInputPtr *)
				3717	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3718	if (ctxt->inputTab == NULL) {
				3719	xmlGenericError(xmlGenericErrorContext,
				3720	"htmlInitParserCtxt: out of memory\n");
				3721	ctxt->inputNr = 0;
				3722	ctxt->inputMax = 0;
				3723	ctxt->input = NULL;
				3724	return;
				3725	}
				3726	ctxt->inputNr = 0;
				3727	ctxt->inputMax = 5;
				3728	ctxt->input = NULL;
				3729	ctxt->version = NULL;
				3730	ctxt->encoding = NULL;
				3731	ctxt->standalone = -1;
				3732	ctxt->instate = XML_PARSER_START;
				3733
				3734	/* Allocate the Node stack */
				3735	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3736	if (ctxt->nodeTab == NULL) {
				3737	xmlGenericError(xmlGenericErrorContext,
				3738	"htmlInitParserCtxt: out of memory\n");
				3739	ctxt->nodeNr = 0;
				3740	ctxt->nodeMax = 0;
				3741	ctxt->node = NULL;
				3742	ctxt->inputNr = 0;
				3743	ctxt->inputMax = 0;
				3744	ctxt->input = NULL;
				3745	return;
				3746	}
				3747	ctxt->nodeNr = 0;
				3748	ctxt->nodeMax = 10;
				3749	ctxt->node = NULL;
				3750
				3751	/* Allocate the Name stack */
				3752	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3753	if (ctxt->nameTab == NULL) {
				3754	xmlGenericError(xmlGenericErrorContext,
				3755	"htmlInitParserCtxt: out of memory\n");
				3756	ctxt->nameNr = 0;
				3757	ctxt->nameMax = 10;
				3758	ctxt->name = NULL;
				3759	ctxt->nodeNr = 0;
				3760	ctxt->nodeMax = 0;
				3761	ctxt->node = NULL;
				3762	ctxt->inputNr = 0;
				3763	ctxt->inputMax = 0;
				3764	ctxt->input = NULL;
				3765	return;
				3766	}
				3767	ctxt->nameNr = 0;
				3768	ctxt->nameMax = 10;
				3769	ctxt->name = NULL;
				3770
				3771	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3772	else {
				3773	ctxt->sax = sax;
				3774	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3775	}
				3776	ctxt->userData = ctxt;
				3777	ctxt->myDoc = NULL;
				3778	ctxt->wellFormed = 1;
				3779	ctxt->replaceEntities = 0;
				3780	ctxt->html = 1;
				3781	ctxt->record_info = 0;
				3782	ctxt->validate = 0;
				3783	ctxt->nbChars = 0;
				3784	ctxt->checkIndex = 0;
				3785	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3786	}
				3787
				3788	/**
				3789	* htmlFreeParserCtxt:
				3790	* @ctxt: an HTML parser context
				3791	*
				3792	* Free all the memory used by a parser context. However the parsed
				3793	* document in ctxt->myDoc is not freed.
				3794	*/
				3795
				3796	void
				3797	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3798	{
				3799	xmlFreeParserCtxt(ctxt);
				3800	}
				3801
				3802	/**
				3803	* htmlCreateDocParserCtxt :
				3804	* @cur: a pointer to an array of xmlChar
				3805	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3806	*
				3807	* Create a parser context for an HTML document.
				3808	*
				3809	* Returns the new parser context or NULL
				3810	*/
				3811	htmlParserCtxtPtr
				3812	htmlCreateDocParserCtxt(xmlChar cur, const char encoding) {
				3813	htmlParserCtxtPtr ctxt;
				3814	htmlParserInputPtr input;
				3815	/* htmlCharEncoding enc; */
				3816
				3817	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				3818	if (ctxt == NULL) {
				3819	perror("malloc");
				3820	return(NULL);
				3821	}
				3822	htmlInitParserCtxt(ctxt);
				3823	input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				3824	if (input == NULL) {
				3825	perror("malloc");
				3826	xmlFree(ctxt);
				3827	return(NULL);
				3828	}
				3829	memset(input, 0, sizeof(htmlParserInput));
				3830
				3831	input->line = 1;
				3832	input->col = 1;
				3833	input->base = cur;
				3834	input->cur = cur;
				3835
				3836	inputPush(ctxt, input);
				3837	return(ctxt);
				3838	}
				3839
				3840	/************************************************************************
				3841	* *
				3842	* Progressive parsing interfaces *
				3843	* *
				3844	************************************************************************/
				3845
				3846	/**
				3847	* htmlParseLookupSequence:
				3848	* @ctxt: an HTML parser context
				3849	* @first: the first char to lookup
				3850	* @next: the next char to lookup or zero
				3851	* @third: the next char to lookup or zero
				3852	*
				3853	* Try to find if a sequence (first, next, third) or just (first next) or
				3854	* (first) is available in the input stream.
				3855	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3856	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3857	* parser, do not use liberally.
				3858	* This is basically similar to xmlParseLookupSequence()
				3859	*
				3860	* Returns the index to the current parsing point if the full sequence
				3861	* is available, -1 otherwise.
				3862	*/
				3863	int
				3864	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3865	xmlChar next, xmlChar third) {
				3866	int base, len;
				3867	htmlParserInputPtr in;
				3868	const xmlChar *buf;
				3869
				3870	in = ctxt->input;
				3871	if (in == NULL) return(-1);
				3872	base = in->cur - in->base;
				3873	if (base < 0) return(-1);
				3874	if (ctxt->checkIndex > base)
				3875	base = ctxt->checkIndex;
				3876	if (in->buf == NULL) {
				3877	buf = in->base;
				3878	len = in->length;
				3879	} else {
				3880	buf = in->buf->buffer->content;
				3881	len = in->buf->buffer->use;
				3882	}
				3883	/* take into account the sequence length */
				3884	if (third) len -= 2;
				3885	else if (next) len --;
				3886	for (;base < len;base++) {
				3887	if (buf[base] == first) {
				3888	if (third != 0) {
				3889	if ((buf[base + 1] != next) \|\|
				3890	(buf[base + 2] != third)) continue;
				3891	} else if (next != 0) {
				3892	if (buf[base + 1] != next) continue;
				3893	}
				3894	ctxt->checkIndex = 0;
				3895	#ifdef DEBUG_PUSH
				3896	if (next == 0)
				3897	xmlGenericError(xmlGenericErrorContext,
				3898	"HPP: lookup '%c' found at %d\n",
				3899	first, base);
				3900	else if (third == 0)
				3901	xmlGenericError(xmlGenericErrorContext,
				3902	"HPP: lookup '%c%c' found at %d\n",
				3903	first, next, base);
				3904	else
				3905	xmlGenericError(xmlGenericErrorContext,
				3906	"HPP: lookup '%c%c%c' found at %d\n",
				3907	first, next, third, base);
				3908	#endif
				3909	return(base - (in->cur - in->base));
				3910	}
				3911	}
				3912	ctxt->checkIndex = base;
				3913	#ifdef DEBUG_PUSH
				3914	if (next == 0)
				3915	xmlGenericError(xmlGenericErrorContext,
				3916	"HPP: lookup '%c' failed\n", first);
				3917	else if (third == 0)
				3918	xmlGenericError(xmlGenericErrorContext,
				3919	"HPP: lookup '%c%c' failed\n", first, next);
				3920	else
				3921	xmlGenericError(xmlGenericErrorContext,
				3922	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				3923	#endif
				3924	return(-1);
				3925	}
				3926
				3927	/**
				3928	* htmlParseTryOrFinish:
				3929	* @ctxt: an HTML parser context
				3930	* @terminate: last chunk indicator
				3931	*
				3932	* Try to progress on parsing
				3933	*
				3934	* Returns zero if no parsing was possible
				3935	*/
				3936	int
				3937	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				3938	int ret = 0;
				3939	htmlParserInputPtr in;
				3940	int avail = 0;
				3941	xmlChar cur, next;
				3942
				3943	#ifdef DEBUG_PUSH
				3944	switch (ctxt->instate) {
				3945	case XML_PARSER_EOF:
				3946	xmlGenericError(xmlGenericErrorContext,
				3947	"HPP: try EOF\n"); break;
				3948	case XML_PARSER_START:
				3949	xmlGenericError(xmlGenericErrorContext,
				3950	"HPP: try START\n"); break;
				3951	case XML_PARSER_MISC:
				3952	xmlGenericError(xmlGenericErrorContext,
				3953	"HPP: try MISC\n");break;
				3954	case XML_PARSER_COMMENT:
				3955	xmlGenericError(xmlGenericErrorContext,
				3956	"HPP: try COMMENT\n");break;
				3957	case XML_PARSER_PROLOG:
				3958	xmlGenericError(xmlGenericErrorContext,
				3959	"HPP: try PROLOG\n");break;
				3960	case XML_PARSER_START_TAG:
				3961	xmlGenericError(xmlGenericErrorContext,
				3962	"HPP: try START_TAG\n");break;
				3963	case XML_PARSER_CONTENT:
				3964	xmlGenericError(xmlGenericErrorContext,
				3965	"HPP: try CONTENT\n");break;
				3966	case XML_PARSER_CDATA_SECTION:
				3967	xmlGenericError(xmlGenericErrorContext,
				3968	"HPP: try CDATA_SECTION\n");break;
				3969	case XML_PARSER_END_TAG:
				3970	xmlGenericError(xmlGenericErrorContext,
				3971	"HPP: try END_TAG\n");break;
				3972	case XML_PARSER_ENTITY_DECL:
				3973	xmlGenericError(xmlGenericErrorContext,
				3974	"HPP: try ENTITY_DECL\n");break;
				3975	case XML_PARSER_ENTITY_VALUE:
				3976	xmlGenericError(xmlGenericErrorContext,
				3977	"HPP: try ENTITY_VALUE\n");break;
				3978	case XML_PARSER_ATTRIBUTE_VALUE:
				3979	xmlGenericError(xmlGenericErrorContext,
				3980	"HPP: try ATTRIBUTE_VALUE\n");break;
				3981	case XML_PARSER_DTD:
				3982	xmlGenericError(xmlGenericErrorContext,
				3983	"HPP: try DTD\n");break;
				3984	case XML_PARSER_EPILOG:
				3985	xmlGenericError(xmlGenericErrorContext,
				3986	"HPP: try EPILOG\n");break;
				3987	case XML_PARSER_PI:
				3988	xmlGenericError(xmlGenericErrorContext,
				3989	"HPP: try PI\n");break;
				3990	case XML_PARSER_SYSTEM_LITERAL:
				3991	xmlGenericError(xmlGenericErrorContext,
				3992	"HPP: try SYSTEM_LITERAL\n");break;
				3993	}
				3994	#endif
				3995
				3996	while (1) {
				3997
				3998	in = ctxt->input;
				3999	if (in == NULL) break;
				4000	if (in->buf == NULL)
				4001	avail = in->length - (in->cur - in->base);
				4002	else
				4003	avail = in->buf->buffer->use - (in->cur - in->base);
				4004	if ((avail == 0) && (terminate)) {
				4005	htmlAutoClose(ctxt, NULL);
				4006	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4007	/*
				4008	* SAX: end of the document processing.
				4009	*/
				4010	ctxt->instate = XML_PARSER_EOF;
				4011	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4012	ctxt->sax->endDocument(ctxt->userData);
				4013	}
				4014	}
				4015	if (avail < 1)
				4016	goto done;
				4017	switch (ctxt->instate) {
				4018	case XML_PARSER_EOF:
				4019	/*
				4020	* Document parsing is done !
				4021	*/
				4022	goto done;
				4023	case XML_PARSER_START:
				4024	/*
				4025	* Very first chars read from the document flow.
				4026	*/
				4027	cur = in->cur[0];
				4028	if (IS_BLANK(cur)) {
				4029	SKIP_BLANKS;
				4030	if (in->buf == NULL)
				4031	avail = in->length - (in->cur - in->base);
				4032	else
				4033	avail = in->buf->buffer->use - (in->cur - in->base);
				4034	}
				4035	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				4036	ctxt->sax->setDocumentLocator(ctxt->userData,
				4037	&xmlDefaultSAXLocator);
				4038	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				4039	(!ctxt->disableSAX))
				4040	ctxt->sax->startDocument(ctxt->userData);
				4041
				4042	cur = in->cur[0];
				4043	next = in->cur[1];
				4044	if ((cur == '<') && (next == '!') &&
				4045	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4046	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4047	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4048	(UPP(8) == 'E')) {
				4049	if ((!terminate) &&
				4050	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4051	goto done;
				4052	#ifdef DEBUG_PUSH
				4053	xmlGenericError(xmlGenericErrorContext,
				4054	"HPP: Parsing internal subset\n");
				4055	#endif
				4056	htmlParseDocTypeDecl(ctxt);
				4057	ctxt->instate = XML_PARSER_PROLOG;
				4058	#ifdef DEBUG_PUSH
				4059	xmlGenericError(xmlGenericErrorContext,
				4060	"HPP: entering PROLOG\n");
				4061	#endif
				4062	} else {
				4063	ctxt->instate = XML_PARSER_MISC;
				4064	}
				4065	#ifdef DEBUG_PUSH
				4066	xmlGenericError(xmlGenericErrorContext,
				4067	"HPP: entering MISC\n");
				4068	#endif
				4069	break;
				4070	case XML_PARSER_MISC:
				4071	SKIP_BLANKS;
				4072	if (in->buf == NULL)
				4073	avail = in->length - (in->cur - in->base);
				4074	else
				4075	avail = in->buf->buffer->use - (in->cur - in->base);
				4076	if (avail < 2)
				4077	goto done;
				4078	cur = in->cur[0];
				4079	next = in->cur[1];
				4080	if ((cur == '<') && (next == '!') &&
				4081	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4082	if ((!terminate) &&
				4083	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4084	goto done;
				4085	#ifdef DEBUG_PUSH
				4086	xmlGenericError(xmlGenericErrorContext,
				4087	"HPP: Parsing Comment\n");
				4088	#endif
				4089	htmlParseComment(ctxt);
				4090	ctxt->instate = XML_PARSER_MISC;
				4091	} else if ((cur == '<') && (next == '!') &&
				4092	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4093	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4094	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4095	(UPP(8) == 'E')) {
				4096	if ((!terminate) &&
				4097	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4098	goto done;
				4099	#ifdef DEBUG_PUSH
				4100	xmlGenericError(xmlGenericErrorContext,
				4101	"HPP: Parsing internal subset\n");
				4102	#endif
				4103	htmlParseDocTypeDecl(ctxt);
				4104	ctxt->instate = XML_PARSER_PROLOG;
				4105	#ifdef DEBUG_PUSH
				4106	xmlGenericError(xmlGenericErrorContext,
				4107	"HPP: entering PROLOG\n");
				4108	#endif
				4109	} else if ((cur == '<') && (next == '!') &&
				4110	(avail < 9)) {
				4111	goto done;
				4112	} else {
				4113	ctxt->instate = XML_PARSER_START_TAG;
				4114	#ifdef DEBUG_PUSH
				4115	xmlGenericError(xmlGenericErrorContext,
				4116	"HPP: entering START_TAG\n");
				4117	#endif
				4118	}
				4119	break;
				4120	case XML_PARSER_PROLOG:
				4121	SKIP_BLANKS;
				4122	if (in->buf == NULL)
				4123	avail = in->length - (in->cur - in->base);
				4124	else
				4125	avail = in->buf->buffer->use - (in->cur - in->base);
				4126	if (avail < 2)
				4127	goto done;
				4128	cur = in->cur[0];
				4129	next = in->cur[1];
				4130	if ((cur == '<') && (next == '!') &&
				4131	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4132	if ((!terminate) &&
				4133	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4134	goto done;
				4135	#ifdef DEBUG_PUSH
				4136	xmlGenericError(xmlGenericErrorContext,
				4137	"HPP: Parsing Comment\n");
				4138	#endif
				4139	htmlParseComment(ctxt);
				4140	ctxt->instate = XML_PARSER_PROLOG;
				4141	} else if ((cur == '<') && (next == '!') &&
				4142	(avail < 4)) {
				4143	goto done;
				4144	} else {
				4145	ctxt->instate = XML_PARSER_START_TAG;
				4146	#ifdef DEBUG_PUSH
				4147	xmlGenericError(xmlGenericErrorContext,
				4148	"HPP: entering START_TAG\n");
				4149	#endif
				4150	}
				4151	break;
				4152	case XML_PARSER_EPILOG:
				4153	if (in->buf == NULL)
				4154	avail = in->length - (in->cur - in->base);
				4155	else
				4156	avail = in->buf->buffer->use - (in->cur - in->base);
				4157	if (avail < 1)
				4158	goto done;
				4159	cur = in->cur[0];
				4160	if (IS_BLANK(cur)) {
				4161	htmlParseCharData(ctxt, 0);
				4162	goto done;
				4163	}
				4164	if (avail < 2)
				4165	goto done;
				4166	next = in->cur[1];
				4167	if ((cur == '<') && (next == '!') &&
				4168	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4169	if ((!terminate) &&
				4170	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4171	goto done;
				4172	#ifdef DEBUG_PUSH
				4173	xmlGenericError(xmlGenericErrorContext,
				4174	"HPP: Parsing Comment\n");
				4175	#endif
				4176	htmlParseComment(ctxt);
				4177	ctxt->instate = XML_PARSER_EPILOG;
				4178	} else if ((cur == '<') && (next == '!') &&
				4179	(avail < 4)) {
				4180	goto done;
				4181	} else {
				4182	ctxt->errNo = XML_ERR_DOCUMENT_END;
				4183	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4184	ctxt->sax->error(ctxt->userData,
				4185	"Extra content at the end of the document\n");
				4186	ctxt->wellFormed = 0;
				4187	ctxt->instate = XML_PARSER_EOF;
				4188	#ifdef DEBUG_PUSH
				4189	xmlGenericError(xmlGenericErrorContext,
				4190	"HPP: entering EOF\n");
				4191	#endif
				4192	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4193	ctxt->sax->endDocument(ctxt->userData);
				4194	goto done;
				4195	}
				4196	break;
				4197	case XML_PARSER_START_TAG: {
				4198	xmlChar name, oldname;
				4199	int depth = ctxt->nameNr;
				4200	htmlElemDescPtr info;
				4201
				4202	if (avail < 2)
				4203	goto done;
				4204	cur = in->cur[0];
				4205	if (cur != '<') {
				4206	ctxt->instate = XML_PARSER_CONTENT;
				4207	#ifdef DEBUG_PUSH
				4208	xmlGenericError(xmlGenericErrorContext,
				4209	"HPP: entering CONTENT\n");
				4210	#endif
				4211	break;
				4212	}
				4213	if ((!terminate) &&
				4214	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4215	goto done;
				4216
				4217	oldname = xmlStrdup(ctxt->name);
				4218	htmlParseStartTag(ctxt);
				4219	name = ctxt->name;
				4220	#ifdef DEBUG
				4221	if (oldname == NULL)
				4222	xmlGenericError(xmlGenericErrorContext,
				4223	"Start of element %s\n", name);
				4224	else if (name == NULL)
				4225	xmlGenericError(xmlGenericErrorContext,
				4226	"Start of element failed, was %s\n",
				4227	oldname);
				4228	else
				4229	xmlGenericError(xmlGenericErrorContext,
				4230	"Start of element %s, was %s\n",
				4231	name, oldname);
				4232	#endif
				4233	if (((depth == ctxt->nameNr) &&
				4234	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4235	(name == NULL)) {
				4236	if (CUR == '>')
				4237	NEXT;
				4238	if (oldname != NULL)
				4239	xmlFree(oldname);
				4240	break;
				4241	}
				4242	if (oldname != NULL)
				4243	xmlFree(oldname);
				4244
				4245	/*
				4246	* Lookup the info for that element.
				4247	*/
				4248	info = htmlTagLookup(name);
				4249	if (info == NULL) {
				4250	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4251	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4252	name);
				4253	ctxt->wellFormed = 0;
				4254	} else if (info->depr) {
				4255	/***************************
				4256	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4257	ctxt->sax->warning(ctxt->userData,
				4258	"Tag %s is deprecated\n",
				4259	name);
				4260	***************************/
				4261	}
				4262
				4263	/*
				4264	* Check for an Empty Element labelled the XML/SGML way
				4265	*/
				4266	if ((CUR == '/') && (NXT(1) == '>')) {
				4267	SKIP(2);
				4268	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4269	ctxt->sax->endElement(ctxt->userData, name);
				4270	oldname = htmlnamePop(ctxt);
				4271	#ifdef DEBUG
				4272	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4273	oldname);
				4274	#endif
				4275	if (oldname != NULL)
				4276	xmlFree(oldname);
				4277	ctxt->instate = XML_PARSER_CONTENT;
				4278	#ifdef DEBUG_PUSH
				4279	xmlGenericError(xmlGenericErrorContext,
				4280	"HPP: entering CONTENT\n");
				4281	#endif
				4282	break;
				4283	}
				4284
				4285	if (CUR == '>') {
				4286	NEXT;
				4287	} else {
				4288	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4289	ctxt->sax->error(ctxt->userData,
				4290	"Couldn't find end of Start Tag %s\n",
				4291	name);
				4292	ctxt->wellFormed = 0;
				4293
				4294	/*
				4295	* end of parsing of this node.
				4296	*/
				4297	if (xmlStrEqual(name, ctxt->name)) {
				4298	nodePop(ctxt);
				4299	oldname = htmlnamePop(ctxt);
				4300	#ifdef DEBUG
				4301	xmlGenericError(xmlGenericErrorContext,
				4302	"End of start tag problem: popping out %s\n", oldname);
				4303	#endif
				4304	if (oldname != NULL)
				4305	xmlFree(oldname);
				4306	}
				4307
				4308	ctxt->instate = XML_PARSER_CONTENT;
				4309	#ifdef DEBUG_PUSH
				4310	xmlGenericError(xmlGenericErrorContext,
				4311	"HPP: entering CONTENT\n");
				4312	#endif
				4313	break;
				4314	}
				4315
				4316	/*
				4317	* Check for an Empty Element from DTD definition
				4318	*/
				4319	if ((info != NULL) && (info->empty)) {
				4320	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4321	ctxt->sax->endElement(ctxt->userData, name);
				4322	oldname = htmlnamePop(ctxt);
				4323	#ifdef DEBUG
				4324	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4325	#endif
				4326	if (oldname != NULL)
				4327	xmlFree(oldname);
				4328	}
				4329	ctxt->instate = XML_PARSER_CONTENT;
				4330	#ifdef DEBUG_PUSH
				4331	xmlGenericError(xmlGenericErrorContext,
				4332	"HPP: entering CONTENT\n");
				4333	#endif
				4334	break;
				4335	}
				4336	case XML_PARSER_CONTENT: {
				4337	long cons;
				4338	/*
				4339	* Handle preparsed entities and charRef
				4340	*/
				4341	if (ctxt->token != 0) {
				4342	xmlChar chr[2] = { 0 , 0 } ;
				4343
				4344	chr[0] = (xmlChar) ctxt->token;
				4345	htmlCheckParagraph(ctxt);
				4346	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4347	ctxt->sax->characters(ctxt->userData, chr, 1);
				4348	ctxt->token = 0;
				4349	ctxt->checkIndex = 0;
				4350	}
				4351	if ((avail == 1) && (terminate)) {
				4352	cur = in->cur[0];
				4353	if ((cur != '<') && (cur != '&')) {
				4354	if (ctxt->sax != NULL) {
				4355	if (IS_BLANK(cur)) {
				4356	if (ctxt->sax->ignorableWhitespace != NULL)
				4357	ctxt->sax->ignorableWhitespace(
				4358	ctxt->userData, &cur, 1);
				4359	} else {
				4360	htmlCheckParagraph(ctxt);
				4361	if (ctxt->sax->characters != NULL)
				4362	ctxt->sax->characters(
				4363	ctxt->userData, &cur, 1);
				4364	}
				4365	}
				4366	ctxt->token = 0;
				4367	ctxt->checkIndex = 0;
				4368	NEXT;
				4369	}
				4370	break;
				4371	}
				4372	if (avail < 2)
				4373	goto done;
				4374	cur = in->cur[0];
				4375	next = in->cur[1];
				4376	cons = ctxt->nbChars;
				4377	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4378	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4379	/*
				4380	* Handle SCRIPT/STYLE separately
				4381	*/
				4382	if ((!terminate) &&
				4383	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4384	goto done;
				4385	htmlParseScript(ctxt);
				4386	if ((cur == '<') && (next == '/')) {
				4387	ctxt->instate = XML_PARSER_END_TAG;
				4388	ctxt->checkIndex = 0;
				4389	#ifdef DEBUG_PUSH
				4390	xmlGenericError(xmlGenericErrorContext,
				4391	"HPP: entering END_TAG\n");
				4392	#endif
				4393	break;
				4394	}
				4395	} else {
				4396	/*
				4397	* Sometimes DOCTYPE arrives in the middle of the document
				4398	*/
				4399	if ((cur == '<') && (next == '!') &&
				4400	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4401	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4402	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4403	(UPP(8) == 'E')) {
				4404	if ((!terminate) &&
				4405	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4406	goto done;
				4407	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4408	ctxt->sax->error(ctxt->userData,
				4409	"Misplaced DOCTYPE declaration\n");
				4410	ctxt->wellFormed = 0;
				4411	htmlParseDocTypeDecl(ctxt);
				4412	} else if ((cur == '<') && (next == '!') &&
				4413	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4414	if ((!terminate) &&
				4415	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4416	goto done;
				4417	#ifdef DEBUG_PUSH
				4418	xmlGenericError(xmlGenericErrorContext,
				4419	"HPP: Parsing Comment\n");
				4420	#endif
				4421	htmlParseComment(ctxt);
				4422	ctxt->instate = XML_PARSER_CONTENT;
				4423	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4424	goto done;
				4425	} else if ((cur == '<') && (next == '/')) {
				4426	ctxt->instate = XML_PARSER_END_TAG;
				4427	ctxt->checkIndex = 0;
				4428	#ifdef DEBUG_PUSH
				4429	xmlGenericError(xmlGenericErrorContext,
				4430	"HPP: entering END_TAG\n");
				4431	#endif
				4432	break;
				4433	} else if (cur == '<') {
				4434	ctxt->instate = XML_PARSER_START_TAG;
				4435	ctxt->checkIndex = 0;
				4436	#ifdef DEBUG_PUSH
				4437	xmlGenericError(xmlGenericErrorContext,
				4438	"HPP: entering START_TAG\n");
				4439	#endif
				4440	break;
				4441	} else if (cur == '&') {
				4442	if ((!terminate) &&
				4443	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4444	goto done;
				4445	#ifdef DEBUG_PUSH
				4446	xmlGenericError(xmlGenericErrorContext,
				4447	"HPP: Parsing Reference\n");
				4448	#endif
				4449	/* TODO: check generation of subtrees if noent !!! */
				4450	htmlParseReference(ctxt);
				4451	} else {
				4452	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4453	/*
				4454	* Goal of the following test is :
				4455	* - minimize calls to the SAX 'character' callback
				4456	* when they are mergeable
				4457	*/
				4458	if ((ctxt->inputNr == 1) &&
				4459	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4460	if ((!terminate) &&
				4461	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4462	goto done;
				4463	}
				4464	ctxt->checkIndex = 0;
				4465	#ifdef DEBUG_PUSH
				4466	xmlGenericError(xmlGenericErrorContext,
				4467	"HPP: Parsing char data\n");
				4468	#endif
				4469	htmlParseCharData(ctxt, 0);
				4470	}
				4471	}
				4472	if (cons == ctxt->nbChars) {
				4473	if (ctxt->node != NULL) {
				4474	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4475	ctxt->sax->error(ctxt->userData,
				4476	"detected an error in element content\n");
				4477	ctxt->wellFormed = 0;
				4478	}
				4479	NEXT;
				4480	break;
				4481	}
				4482
				4483	break;
				4484	}
				4485	case XML_PARSER_END_TAG:
				4486	if (avail < 2)
				4487	goto done;
				4488	if ((!terminate) &&
				4489	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4490	goto done;
				4491	htmlParseEndTag(ctxt);
				4492	if (ctxt->nameNr == 0) {
				4493	ctxt->instate = XML_PARSER_EPILOG;
				4494	} else {
				4495	ctxt->instate = XML_PARSER_CONTENT;
				4496	}
				4497	ctxt->checkIndex = 0;
				4498	#ifdef DEBUG_PUSH
				4499	xmlGenericError(xmlGenericErrorContext,
				4500	"HPP: entering CONTENT\n");
				4501	#endif
				4502	break;
				4503	case XML_PARSER_CDATA_SECTION:
				4504	xmlGenericError(xmlGenericErrorContext,
				4505	"HPP: internal error, state == CDATA\n");
				4506	ctxt->instate = XML_PARSER_CONTENT;
				4507	ctxt->checkIndex = 0;
				4508	#ifdef DEBUG_PUSH
				4509	xmlGenericError(xmlGenericErrorContext,
				4510	"HPP: entering CONTENT\n");
				4511	#endif
				4512	break;
				4513	case XML_PARSER_DTD:
				4514	xmlGenericError(xmlGenericErrorContext,
				4515	"HPP: internal error, state == DTD\n");
				4516	ctxt->instate = XML_PARSER_CONTENT;
				4517	ctxt->checkIndex = 0;
				4518	#ifdef DEBUG_PUSH
				4519	xmlGenericError(xmlGenericErrorContext,
				4520	"HPP: entering CONTENT\n");
				4521	#endif
				4522	break;
				4523	case XML_PARSER_COMMENT:
				4524	xmlGenericError(xmlGenericErrorContext,
				4525	"HPP: internal error, state == COMMENT\n");
				4526	ctxt->instate = XML_PARSER_CONTENT;
				4527	ctxt->checkIndex = 0;
				4528	#ifdef DEBUG_PUSH
				4529	xmlGenericError(xmlGenericErrorContext,
				4530	"HPP: entering CONTENT\n");
				4531	#endif
				4532	break;
				4533	case XML_PARSER_PI:
				4534	xmlGenericError(xmlGenericErrorContext,
				4535	"HPP: internal error, state == PI\n");
				4536	ctxt->instate = XML_PARSER_CONTENT;
				4537	ctxt->checkIndex = 0;
				4538	#ifdef DEBUG_PUSH
				4539	xmlGenericError(xmlGenericErrorContext,
				4540	"HPP: entering CONTENT\n");
				4541	#endif
				4542	break;
				4543	case XML_PARSER_ENTITY_DECL:
				4544	xmlGenericError(xmlGenericErrorContext,
				4545	"HPP: internal error, state == ENTITY_DECL\n");
				4546	ctxt->instate = XML_PARSER_CONTENT;
				4547	ctxt->checkIndex = 0;
				4548	#ifdef DEBUG_PUSH
				4549	xmlGenericError(xmlGenericErrorContext,
				4550	"HPP: entering CONTENT\n");
				4551	#endif
				4552	break;
				4553	case XML_PARSER_ENTITY_VALUE:
				4554	xmlGenericError(xmlGenericErrorContext,
				4555	"HPP: internal error, state == ENTITY_VALUE\n");
				4556	ctxt->instate = XML_PARSER_CONTENT;
				4557	ctxt->checkIndex = 0;
				4558	#ifdef DEBUG_PUSH
				4559	xmlGenericError(xmlGenericErrorContext,
				4560	"HPP: entering DTD\n");
				4561	#endif
				4562	break;
				4563	case XML_PARSER_ATTRIBUTE_VALUE:
				4564	xmlGenericError(xmlGenericErrorContext,
				4565	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4566	ctxt->instate = XML_PARSER_START_TAG;
				4567	ctxt->checkIndex = 0;
				4568	#ifdef DEBUG_PUSH
				4569	xmlGenericError(xmlGenericErrorContext,
				4570	"HPP: entering START_TAG\n");
				4571	#endif
				4572	break;
				4573	case XML_PARSER_SYSTEM_LITERAL:
				4574	xmlGenericError(xmlGenericErrorContext,
				4575	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4576	ctxt->instate = XML_PARSER_CONTENT;
				4577	ctxt->checkIndex = 0;
				4578	#ifdef DEBUG_PUSH
				4579	xmlGenericError(xmlGenericErrorContext,
				4580	"HPP: entering CONTENT\n");
				4581	#endif
				4582	break;
				4583	case XML_PARSER_IGNORE:
				4584	xmlGenericError(xmlGenericErrorContext,
				4585	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4586	ctxt->instate = XML_PARSER_CONTENT;
				4587	ctxt->checkIndex = 0;
				4588	#ifdef DEBUG_PUSH
				4589	xmlGenericError(xmlGenericErrorContext,
				4590	"HPP: entering CONTENT\n");
				4591	#endif
				4592	break;
				4593	}
				4594	}
				4595	done:
				4596	if ((avail == 0) && (terminate)) {
				4597	htmlAutoClose(ctxt, NULL);
				4598	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4599	/*
				4600	* SAX: end of the document processing.
				4601	*/
				4602	ctxt->instate = XML_PARSER_EOF;
				4603	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4604	ctxt->sax->endDocument(ctxt->userData);
				4605	}
				4606	}
				4607	if ((ctxt->myDoc != NULL) &&
				4608	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4609	(ctxt->instate == XML_PARSER_EPILOG))) {
				4610	xmlDtdPtr dtd;
				4611	dtd = xmlGetIntSubset(ctxt->myDoc);
				4612	if (dtd == NULL)
				4613	ctxt->myDoc->intSubset =
				4614	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4615	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4616	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4617	}
				4618	#ifdef DEBUG_PUSH
				4619	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4620	#endif
				4621	return(ret);
				4622	}
				4623
				4624	/**
				4625	* htmlParseTry:
				4626	* @ctxt: an HTML parser context
				4627	*
				4628	* Try to progress on parsing
				4629	*
				4630	* Returns zero if no parsing was possible
				4631	*/
				4632	int
				4633	htmlParseTry(htmlParserCtxtPtr ctxt) {
				4634	return(htmlParseTryOrFinish(ctxt, 0));
				4635	}
				4636
				4637	/**
				4638	* htmlParseChunk:
				4639	* @ctxt: an XML parser context
				4640	* @chunk: an char array
				4641	* @size: the size in byte of the chunk
				4642	* @terminate: last chunk indicator
				4643	*
				4644	* Parse a Chunk of memory
				4645	*
				4646	* Returns zero if no error, the xmlParserErrors otherwise.
				4647	*/
				4648	int
				4649	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4650	int terminate) {
				4651	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4652	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4653	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4654	int cur = ctxt->input->cur - ctxt->input->base;
				4655
				4656	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4657	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4658	ctxt->input->cur = ctxt->input->base + cur;
				4659	#ifdef DEBUG_PUSH
				4660	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4661	#endif
				4662
				4663	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4664	htmlParseTryOrFinish(ctxt, terminate);
				4665	} else if (ctxt->instate != XML_PARSER_EOF) {
				4666	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4667	htmlParseTryOrFinish(ctxt, terminate);
				4668	}
				4669	if (terminate) {
				4670	if ((ctxt->instate != XML_PARSER_EOF) &&
				4671	(ctxt->instate != XML_PARSER_EPILOG) &&
				4672	(ctxt->instate != XML_PARSER_MISC)) {
				4673	ctxt->errNo = XML_ERR_DOCUMENT_END;
				4674	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4675	ctxt->sax->error(ctxt->userData,
				4676	"Extra content at the end of the document\n");
				4677	ctxt->wellFormed = 0;
				4678	}
				4679	if (ctxt->instate != XML_PARSER_EOF) {
				4680	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4681	ctxt->sax->endDocument(ctxt->userData);
				4682	}
				4683	ctxt->instate = XML_PARSER_EOF;
				4684	}
				4685	return((xmlParserErrors) ctxt->errNo);
				4686	}
				4687
				4688	/************************************************************************
				4689	* *
				4690	* User entry points *
				4691	* *
				4692	************************************************************************/
				4693
				4694	/**
				4695	* htmlCreatePushParserCtxt :
				4696	* @sax: a SAX handler
				4697	* @user_data: The user data returned on SAX callbacks
				4698	* @chunk: a pointer to an array of chars
				4699	* @size: number of chars in the array
				4700	* @filename: an optional file name or URI
				4701	* @enc: an optional encoding
				4702	*
				4703	* Create a parser context for using the HTML parser in push mode
				4704	* To allow content encoding detection, @size should be >= 4
				4705	* The value of @filename is used for fetching external entities
				4706	* and error/warning reports.
				4707	*
				4708	* Returns the new parser context or NULL
				4709	*/
				4710	htmlParserCtxtPtr
				4711	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4712	const char chunk, int size, const char filename,
				4713	xmlCharEncoding enc) {
				4714	htmlParserCtxtPtr ctxt;
				4715	htmlParserInputPtr inputStream;
				4716	xmlParserInputBufferPtr buf;
				4717
				4718	buf = xmlAllocParserInputBuffer(enc);
				4719	if (buf == NULL) return(NULL);
				4720
				4721	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4722	if (ctxt == NULL) {
				4723	xmlFree(buf);
				4724	return(NULL);
				4725	}
				4726	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4727	htmlInitParserCtxt(ctxt);
				4728	if (sax != NULL) {
				4729	if (ctxt->sax != &htmlDefaultSAXHandler)
				4730	xmlFree(ctxt->sax);
				4731	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4732	if (ctxt->sax == NULL) {
				4733	xmlFree(buf);
				4734	xmlFree(ctxt);
				4735	return(NULL);
				4736	}
				4737	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4738	if (user_data != NULL)
				4739	ctxt->userData = user_data;
				4740	}
				4741	if (filename == NULL) {
				4742	ctxt->directory = NULL;
				4743	} else {
				4744	ctxt->directory = xmlParserGetDirectory(filename);
				4745	}
				4746
				4747	inputStream = htmlNewInputStream(ctxt);
				4748	if (inputStream == NULL) {
				4749	xmlFreeParserCtxt(ctxt);
				4750	return(NULL);
				4751	}
				4752
				4753	if (filename == NULL)
				4754	inputStream->filename = NULL;
				4755	else
				4756	inputStream->filename = xmlMemStrdup(filename);
				4757	inputStream->buf = buf;
				4758	inputStream->base = inputStream->buf->buffer->content;
				4759	inputStream->cur = inputStream->buf->buffer->content;
				4760
				4761	inputPush(ctxt, inputStream);
				4762
				4763	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4764	(ctxt->input->buf != NULL)) {
				4765	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4766	#ifdef DEBUG_PUSH
				4767	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4768	#endif
				4769	}
				4770
				4771	return(ctxt);
				4772	}
				4773
				4774	/**
				4775	* htmlSAXParseDoc :
				4776	* @cur: a pointer to an array of xmlChar
				4777	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4778	* @sax: the SAX handler block
				4779	* @userData: if using SAX, this pointer will be provided on callbacks.
				4780	*
				4781	* parse an HTML in-memory document and build a tree.
				4782	* It use the given SAX function block to handle the parsing callback.
				4783	* If sax is NULL, fallback to the default DOM tree building routines.
				4784	*
				4785	* Returns the resulting document tree
				4786	*/
				4787
				4788	htmlDocPtr
				4789	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4790	htmlDocPtr ret;
				4791	htmlParserCtxtPtr ctxt;
				4792
				4793	if (cur == NULL) return(NULL);
				4794
				4795
				4796	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4797	if (ctxt == NULL) return(NULL);
				4798	if (sax != NULL) {
				4799	ctxt->sax = sax;
				4800	ctxt->userData = userData;
				4801	}
				4802
				4803	htmlParseDocument(ctxt);
				4804	ret = ctxt->myDoc;
				4805	if (sax != NULL) {
				4806	ctxt->sax = NULL;
				4807	ctxt->userData = NULL;
				4808	}
				4809	htmlFreeParserCtxt(ctxt);
				4810
				4811	return(ret);
				4812	}
				4813
				4814	/**
				4815	* htmlParseDoc :
				4816	* @cur: a pointer to an array of xmlChar
				4817	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4818	*
				4819	* parse an HTML in-memory document and build a tree.
				4820	*
				4821	* Returns the resulting document tree
				4822	*/
				4823
				4824	htmlDocPtr
				4825	htmlParseDoc(xmlChar cur, const char encoding) {
				4826	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4827	}
				4828
				4829
				4830	/**
				4831	* htmlCreateFileParserCtxt :
				4832	* @filename: the filename
				4833	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4834	*
				4835	* Create a parser context for a file content.
				4836	* Automatic support for ZLIB/Compress compressed document is provided
				4837	* by default if found at compile-time.
				4838	*
				4839	* Returns the new parser context or NULL
				4840	*/
				4841	htmlParserCtxtPtr
				4842	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4843	{
				4844	htmlParserCtxtPtr ctxt;
				4845	htmlParserInputPtr inputStream;
				4846	xmlParserInputBufferPtr buf;
				4847	/* htmlCharEncoding enc; */
				4848	xmlChar content, content_line = (xmlChar *) "charset=";
				4849
				4850	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4851	if (buf == NULL) return(NULL);
				4852
				4853	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4854	if (ctxt == NULL) {
				4855	perror("malloc");
				4856	return(NULL);
				4857	}
				4858	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4859	htmlInitParserCtxt(ctxt);
				4860	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4861	if (inputStream == NULL) {
				4862	perror("malloc");
				4863	xmlFree(ctxt);
				4864	return(NULL);
				4865	}
				4866	memset(inputStream, 0, sizeof(htmlParserInput));
				4867
				4868	inputStream->filename = xmlMemStrdup(filename);
				4869	inputStream->line = 1;
				4870	inputStream->col = 1;
				4871	inputStream->buf = buf;
				4872	inputStream->directory = NULL;
				4873
				4874	inputStream->base = inputStream->buf->buffer->content;
				4875	inputStream->cur = inputStream->buf->buffer->content;
				4876	inputStream->free = NULL;
				4877
				4878	inputPush(ctxt, inputStream);
				4879
				4880	/* set encoding */
				4881	if (encoding) {
				4882	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4883	if (content) {
				4884	strcpy ((char )content, (char )content_line);
				4885	strcat ((char )content, (char )encoding);
				4886	htmlCheckEncoding (ctxt, content);
				4887	xmlFree (content);
				4888	}
				4889	}
				4890
				4891	return(ctxt);
				4892	}
				4893
				4894	/**
				4895	* htmlSAXParseFile :
				4896	* @filename: the filename
				4897	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4898	* @sax: the SAX handler block
				4899	* @userData: if using SAX, this pointer will be provided on callbacks.
				4900	*
				4901	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4902	* compressed document is provided by default if found at compile-time.
				4903	* It use the given SAX function block to handle the parsing callback.
				4904	* If sax is NULL, fallback to the default DOM tree building routines.
				4905	*
				4906	* Returns the resulting document tree
				4907	*/
				4908
				4909	htmlDocPtr
				4910	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				4911	void *userData) {
				4912	htmlDocPtr ret;
				4913	htmlParserCtxtPtr ctxt;
				4914	htmlSAXHandlerPtr oldsax = NULL;
				4915
				4916	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				4917	if (ctxt == NULL) return(NULL);
				4918	if (sax != NULL) {
				4919	oldsax = ctxt->sax;
				4920	ctxt->sax = sax;
				4921	ctxt->userData = userData;
				4922	}
				4923
				4924	htmlParseDocument(ctxt);
				4925
				4926	ret = ctxt->myDoc;
				4927	if (sax != NULL) {
				4928	ctxt->sax = oldsax;
				4929	ctxt->userData = NULL;
				4930	}
				4931	htmlFreeParserCtxt(ctxt);
				4932
				4933	return(ret);
				4934	}
				4935
				4936	/**
				4937	* htmlParseFile :
				4938	* @filename: the filename
				4939	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4940	*
				4941	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4942	* compressed document is provided by default if found at compile-time.
				4943	*
				4944	* Returns the resulting document tree
				4945	*/
				4946
				4947	htmlDocPtr
				4948	htmlParseFile(const char filename, const char encoding) {
				4949	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				4950	}
				4951
				4952	/**
				4953	* htmlHandleOmittedElem:
				4954	* @val: int 0 or 1
				4955	*
				4956	* Set and return the previous value for handling HTML omitted tags.
				4957	*
				4958	* Returns the last value for 0 for no handling, 1 for auto insertion.
				4959	*/
				4960
				4961	int
				4962	htmlHandleOmittedElem(int val) {
				4963	int old = htmlOmittedDefaultValue;
				4964
				4965	htmlOmittedDefaultValue = val;
				4966	return(old);
				4967	}
				4968
				4969	#endif /* LIBXML_HTML_ENABLED */