Blame - HTMLparser.c - fp2-dev/platform/external/libxml2

blob: 4b3bac867d6ddb0788e5f9bc8569fa34313763d7 [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
				6	* Daniel.Veillard@w3.org
				7	*/
				8
				9	#ifdef WIN32
				10	#include "win32config.h"
				11	#else
				12	#include "config.h"
				13	#endif
				14
				15	#include <libxml/xmlversion.h>
				16	#ifdef LIBXML_HTML_ENABLED
				17	#include <stdio.h>
				18	#include <string.h>
				19	#ifdef HAVE_CTYPE_H
				20	#include <ctype.h>
				21	#endif
				22	#ifdef HAVE_STDLIB_H
				23	#include <stdlib.h>
				24	#endif
				25	#ifdef HAVE_SYS_STAT_H
				26	#include <sys/stat.h>
				27	#endif
				28	#ifdef HAVE_FCNTL_H
				29	#include <fcntl.h>
				30	#endif
				31	#ifdef HAVE_UNISTD_H
				32	#include <unistd.h>
				33	#endif
				34	#ifdef HAVE_ZLIB_H
				35	#include <zlib.h>
				36	#endif
				37
				38	#include <libxml/xmlmemory.h>
				39	#include <libxml/tree.h>
				40	#include <libxml/parser.h>
				41	#include <libxml/parserInternals.h>
				42	#include <libxml/xmlerror.h>
				43	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	44	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	45	#include <libxml/entities.h>
				46	#include <libxml/encoding.h>
				47	#include <libxml/valid.h>
				48	#include <libxml/xmlIO.h>
				49
				50	#define HTML_MAX_NAMELEN 1000
				51	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				52	#define HTML_PARSER_BUFFER_SIZE 100
				53
				54	/* #define DEBUG */
				55	/* #define DEBUG_PUSH */
				56
				57	int htmlOmittedDefaultValue = 1;
				58
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	59	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				60	xmlChar end, xmlChar end2, xmlChar end3);
				61
				62	/************************************************************************
				63	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	64	* Parser stacks related functions and macros *
				65	* *
				66	************************************************************************/
				67
				68	/*
				69	* Generic function for accessing stacks in the Parser Context
				70	*/
				71
				72	#define PUSH_AND_POP(scope, type, name) \
				73	scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				74	if (ctxt->name##Nr >= ctxt->name##Max) { \
				75	ctxt->name##Max *= 2; \
				76	ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
				77	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				78	if (ctxt->name##Tab == NULL) { \
				79	xmlGenericError(xmlGenericErrorContext, \
				80	"realloc failed !\n"); \
				81	return(0); \
				82	} \
				83	} \
				84	ctxt->name##Tab[ctxt->name##Nr] = value; \
				85	ctxt->name = value; \
				86	return(ctxt->name##Nr++); \
				87	} \
				88	scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				89	type ret; \
				90	if (ctxt->name##Nr < 0) return(0); \
				91	ctxt->name##Nr--; \
				92	if (ctxt->name##Nr < 0) return(0); \
				93	if (ctxt->name##Nr > 0) \
				94	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				95	else \
				96	ctxt->name = NULL; \
				97	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				98	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				99	return(ret); \
				100	} \
				101
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	102	/* PUSH_AND_POP(static, xmlNodePtr, node) */
				103	PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	104
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values, otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substrings.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */
				132
				133	#define UPPER (toupper(*ctxt->input->cur))
				134
				135	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				136
				137	#define NXT(val) ctxt->input->cur[(val)]
				138
				139	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				140
				141	#define CUR_PTR ctxt->input->cur
				142
				143	#define SHRINK xmlParserInputShrink(ctxt->input)
				144
				145	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				146
				147	#define CURRENT ((int) (*ctxt->input->cur))
				148
				149	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				150
				151	/* Inported from XML */
				152
				153	/* #define CUR (ctxt->token ? ctxt->token : (int) (ctxt->input->cur)) /
				154	#define CUR ((int) (*ctxt->input->cur))
				155	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				156
				157	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
				158	#define NXT(val) ctxt->input->cur[(val)]
				159	#define CUR_PTR ctxt->input->cur
				160
				161
				162	#define NEXTL(l) do { \
				163	if (*(ctxt->input->cur) == '\n') { \
				164	ctxt->input->line++; ctxt->input->col = 1; \
				165	} else ctxt->input->col++; \
				166	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				167	} while (0)
				168
				169	/************
				170	\
				171	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				172	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				173	************/
				174
				175	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				176	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				177
				178	#define COPY_BUF(l,b,i,v) \
				179	if (l == 1) b[i++] = (xmlChar) v; \
				180	else i += xmlCopyChar(l,&b[i],v)
				181
				182	/**
				183	* htmlCurrentChar:
				184	* @ctxt: the HTML parser context
				185	* @len: pointer to the length of the char read
				186	*
				187	* The current char value, if using UTF-8 this may actaully span multiple
				188	* bytes in the input buffer. Implement the end of line normalization:
				189	* 2.11 End-of-Line Handling
				190	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				191	* char, then the encoding converter is plugged in automatically.
				192	*
				193	* Returns the current char value and its lenght
				194	*/
				195
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	196	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	197	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				198	if (ctxt->instate == XML_PARSER_EOF)
				199	return(0);
				200
				201	if (ctxt->token != 0) {
				202	*len = 0;
				203	return(ctxt->token);
				204	}
				205	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				206	/*
				207	* We are supposed to handle UTF8, check it's valid
				208	* From rfc2044: encoding of the Unicode values on UTF-8:
				209	*
				210	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				211	* 0000 0000-0000 007F 0xxxxxxx
				212	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				213	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				214	*
				215	* Check for the 0x110000 limit too
				216	*/
				217	const unsigned char *cur = ctxt->input->cur;
				218	unsigned char c;
				219	unsigned int val;
				220
				221	c = *cur;
				222	if (c & 0x80) {
				223	if (cur[1] == 0)
				224	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				225	if ((cur[1] & 0xc0) != 0x80)
				226	goto encoding_error;
				227	if ((c & 0xe0) == 0xe0) {
				228
				229	if (cur[2] == 0)
				230	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				231	if ((cur[2] & 0xc0) != 0x80)
				232	goto encoding_error;
				233	if ((c & 0xf0) == 0xf0) {
				234	if (cur[3] == 0)
				235	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				236	if (((c & 0xf8) != 0xf0) \|\|
				237	((cur[3] & 0xc0) != 0x80))
				238	goto encoding_error;
				239	/* 4-byte code */
				240	*len = 4;
				241	val = (cur[0] & 0x7) << 18;
				242	val \|= (cur[1] & 0x3f) << 12;
				243	val \|= (cur[2] & 0x3f) << 6;
				244	val \|= cur[3] & 0x3f;
				245	} else {
				246	/* 3-byte code */
				247	*len = 3;
				248	val = (cur[0] & 0xf) << 12;
				249	val \|= (cur[1] & 0x3f) << 6;
				250	val \|= cur[2] & 0x3f;
				251	}
				252	} else {
				253	/* 2-byte code */
				254	*len = 2;
				255	val = (cur[0] & 0x1f) << 6;
				256	val \|= cur[1] & 0x3f;
				257	}
				258	if (!IS_CHAR(val)) {
				259	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				260	if ((ctxt->sax != NULL) &&
				261	(ctxt->sax->error != NULL))
				262	ctxt->sax->error(ctxt->userData,
				263	"Char 0x%X out of allowed range\n", val);
				264	ctxt->wellFormed = 0;
				265	ctxt->disableSAX = 1;
				266	}
				267	return(val);
				268	} else {
				269	/* 1-byte code */
				270	*len = 1;
				271	return((int) *ctxt->input->cur);
				272	}
				273	}
				274	/*
				275	* Assume it's a fixed lenght encoding (1) with
				276	* a compatibke encoding for the ASCII set, since
				277	* XML constructs only use < 128 chars
				278	*/
				279	*len = 1;
				280	if ((int) *ctxt->input->cur < 0x80)
				281	return((int) *ctxt->input->cur);
				282
				283	/*
				284	* Humm this is bad, do an automatic flow conversion
				285	*/
				286	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				287	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				288	return(xmlCurrentChar(ctxt, len));
				289
				290	encoding_error:
				291	/*
				292	* If we detect an UTF8 error that probably mean that the
				293	* input encoding didn't get properly advertized in the
				294	* declaration header. Report the error and switch the encoding
				295	* to ISO-Latin-1 (if you don't like this policy, just declare the
				296	* encoding !)
				297	*/
				298	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				299	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				300	ctxt->sax->error(ctxt->userData,
				301	"Input is not proper UTF-8, indicate encoding !\n");
				302	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				303	ctxt->input->cur[0], ctxt->input->cur[1],
				304	ctxt->input->cur[2], ctxt->input->cur[3]);
				305	}
				306
				307	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				308	*len = 1;
				309	return((int) *ctxt->input->cur);
				310	}
				311
				312	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	313	* htmlSkipBlankChars:
				314	* @ctxt: the HTML parser context
				315	*
				316	* skip all blanks character found at that point in the input streams.
				317	*
				318	* Returns the number of space chars skipped
				319	*/
				320
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	321	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	322	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				323	int res = 0;
				324
				325	while (IS_BLANK(*(ctxt->input->cur))) {
				326	if ((*ctxt->input->cur == 0) &&
				327	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				328	xmlPopInput(ctxt);
				329	} else {
				330	if (*(ctxt->input->cur) == '\n') {
				331	ctxt->input->line++; ctxt->input->col = 1;
				332	} else ctxt->input->col++;
				333	ctxt->input->cur++;
				334	ctxt->nbChars++;
				335	if (*ctxt->input->cur == 0)
				336	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				337	}
				338	res++;
				339	}
				340	return(res);
				341	}
				342
				343
				344
				345	/************************************************************************
				346	* *
				347	* The list of HTML elements and their properties *
				348	* *
				349	************************************************************************/
				350
				351	/*
				352	* Start Tag: 1 means the start tag can be ommited
				353	* End Tag: 1 means the end tag can be ommited
				354	* 2 means it's forbidden (empty elements)
				355	* Depr: this element is deprecated
				356	* DTD: 1 means that this element is valid only in the Loose DTD
				357	* 2 means that this element is valid only in the Frameset DTD
				358	*
				359	* Name,Start Tag,End Tag,Save End, Empty, Depr., DTD, Description
				360	*/
				361	htmlElemDesc html40ElementTable[] = {
				362	{ "a", 0, 0, 0, 0, 0, 0, "anchor " },
				363	{ "abbr", 0, 0, 0, 0, 0, 0, "abbreviated form" },
				364	{ "acronym", 0, 0, 0, 0, 0, 0, "" },
				365	{ "address", 0, 0, 0, 0, 0, 0, "information on author " },
				366	{ "applet", 0, 0, 0, 0, 1, 1, "java applet " },
				367	{ "area", 0, 2, 2, 1, 0, 0, "client-side image map area " },
				368	{ "b", 0, 0, 0, 0, 0, 0, "bold text style" },
				369	{ "base", 0, 2, 2, 1, 0, 0, "document base uri " },
				370	{ "basefont", 0, 2, 2, 1, 1, 1, "base font size " },
				371	{ "bdo", 0, 0, 0, 0, 0, 0, "i18n bidi over-ride " },
				372	{ "big", 0, 0, 0, 0, 0, 0, "large text style" },
				373	{ "blockquote", 0, 0, 0, 0, 0, 0, "long quotation " },
				374	{ "body", 1, 1, 0, 0, 0, 0, "document body " },
				375	{ "br", 0, 2, 2, 1, 0, 0, "forced line break " },
				376	{ "button", 0, 0, 0, 0, 0, 0, "push button " },
				377	{ "caption", 0, 0, 0, 0, 0, 0, "table caption " },
				378	{ "center", 0, 0, 0, 0, 1, 1, "shorthand for div align=center " },
				379	{ "cite", 0, 0, 0, 0, 0, 0, "citation" },
				380	{ "code", 0, 0, 0, 0, 0, 0, "computer code fragment" },
				381	{ "col", 0, 2, 2, 1, 0, 0, "table column " },
				382	{ "colgroup", 0, 1, 0, 0, 0, 0, "table column group " },
				383	{ "dd", 0, 1, 0, 0, 0, 0, "definition description " },
				384	{ "del", 0, 0, 0, 0, 0, 0, "deleted text " },
				385	{ "dfn", 0, 0, 0, 0, 0, 0, "instance definition" },
				386	{ "dir", 0, 0, 0, 0, 1, 1, "directory list" },
				387	{ "div", 0, 0, 0, 0, 0, 0, "generic language/style container"},
				388	{ "dl", 0, 0, 0, 0, 0, 0, "definition list " },
				389	{ "dt", 0, 1, 0, 0, 0, 0, "definition term " },
				390	{ "em", 0, 0, 0, 0, 0, 0, "emphasis" },
				391	{ "fieldset", 0, 0, 0, 0, 0, 0, "form control group " },
				392	{ "font", 0, 0, 0, 0, 1, 1, "local change to font " },
				393	{ "form", 0, 0, 0, 0, 0, 0, "interactive form " },
				394	{ "frame", 0, 2, 2, 1, 0, 2, "subwindow " },
				395	{ "frameset", 0, 0, 0, 0, 0, 2, "window subdivision" },
				396	{ "h1", 0, 0, 0, 0, 0, 0, "heading " },
				397	{ "h2", 0, 0, 0, 0, 0, 0, "heading " },
				398	{ "h3", 0, 0, 0, 0, 0, 0, "heading " },
				399	{ "h4", 0, 0, 0, 0, 0, 0, "heading " },
				400	{ "h5", 0, 0, 0, 0, 0, 0, "heading " },
				401	{ "h6", 0, 0, 0, 0, 0, 0, "heading " },
				402	{ "head", 1, 1, 0, 0, 0, 0, "document head " },
				403	{ "hr", 0, 2, 2, 1, 0, 0, "horizontal rule " },
				404	{ "html", 1, 1, 0, 0, 0, 0, "document root element " },
				405	{ "i", 0, 0, 0, 0, 0, 0, "italic text style" },
				406	{ "iframe", 0, 0, 0, 0, 0, 1, "inline subwindow " },
				407	{ "img", 0, 2, 2, 1, 0, 0, "embedded image " },
				408	{ "input", 0, 2, 2, 1, 0, 0, "form control " },
				409	{ "ins", 0, 0, 0, 0, 0, 0, "inserted text" },
				410	{ "isindex", 0, 2, 2, 1, 1, 1, "single line prompt " },
				411	{ "kbd", 0, 0, 0, 0, 0, 0, "text to be entered by the user" },
				412	{ "label", 0, 0, 0, 0, 0, 0, "form field label text " },
				413	{ "legend", 0, 0, 0, 0, 0, 0, "fieldset legend " },
				414	{ "li", 0, 1, 1, 0, 0, 0, "list item " },
				415	{ "link", 0, 2, 2, 1, 0, 0, "a media-independent link " },
				416	{ "map", 0, 0, 0, 0, 0, 0, "client-side image map " },
				417	{ "menu", 0, 0, 0, 0, 1, 1, "menu list " },
				418	{ "meta", 0, 2, 2, 1, 0, 0, "generic metainformation " },
				419	{ "noframes", 0, 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
				420	{ "noscript", 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
				421	{ "object", 0, 0, 0, 0, 0, 0, "generic embedded object " },
				422	{ "ol", 0, 0, 0, 0, 0, 0, "ordered list " },
				423	{ "optgroup", 0, 0, 0, 0, 0, 0, "option group " },
				424	{ "option", 0, 1, 0, 0, 0, 0, "selectable choice " },
				425	{ "p", 0, 1, 1, 0, 0, 0, "paragraph " },
				426	{ "param", 0, 2, 2, 1, 0, 0, "named property value " },
				427	{ "pre", 0, 0, 0, 0, 0, 0, "preformatted text " },
				428	{ "q", 0, 0, 0, 0, 0, 0, "short inline quotation " },
				429	{ "s", 0, 0, 0, 0, 1, 1, "strike-through text style" },
				430	{ "samp", 0, 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
				431	{ "script", 0, 0, 0, 0, 0, 0, "script statements " },
				432	{ "select", 0, 0, 0, 0, 0, 0, "option selector " },
				433	{ "small", 0, 0, 0, 0, 0, 0, "small text style" },
				434	{ "span", 0, 0, 0, 0, 0, 0, "generic language/style container " },
				435	{ "strike", 0, 0, 0, 0, 1, 1, "strike-through text" },
				436	{ "strong", 0, 0, 0, 0, 0, 0, "strong emphasis" },
				437	{ "style", 0, 0, 0, 0, 0, 0, "style info " },
				438	{ "sub", 0, 0, 0, 0, 0, 0, "subscript" },
				439	{ "sup", 0, 0, 0, 0, 0, 0, "superscript " },
				440	{ "table", 0, 0, 0, 0, 0, 0, " " },
				441	{ "tbody", 1, 0, 0, 0, 0, 0, "table body " },
				442	{ "td", 0, 0, 0, 0, 0, 0, "table data cell" },
				443	{ "textarea", 0, 0, 0, 0, 0, 0, "multi-line text field " },
				444	{ "tfoot", 0, 1, 0, 0, 0, 0, "table footer " },
				445	{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
				446	{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
				447	{ "title", 0, 0, 0, 0, 0, 0, "document title " },
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame^]	448	{ "tr", 0, 0, 0, 0, 0, 0, "table row " },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	449	{ "tt", 0, 0, 0, 0, 0, 0, "teletype or monospaced text style" },
				450	{ "u", 0, 0, 0, 0, 1, 1, "underlined text style" },
				451	{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
				452	{ "var", 0, 0, 0, 0, 0, 0, "instance of a variable or program argument" },
				453	};
				454
				455	/*
				456	* start tags that imply the end of a current element
				457	* any tag of each line implies the end of the current element if the type of
				458	* that element is in the same line
				459	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	460	const char *htmlEquEnd[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	461	"dt", "dd", "li", "option", NULL,
				462	"h1", "h2", "h3", "h4", "h5", "h6", NULL,
				463	"ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
				464	NULL
				465	};
				466	/*
				467	* acording the HTML DTD, HR should be added to the 2nd line above, as it
				468	* is not allowed within a H1, H2, H3, etc. But we should tolerate that case
				469	* because many documents contain rules in headings...
				470	*/
				471
				472	/*
				473	* start tags that imply the end of current element
				474	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	475	const char *htmlStartClose[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	476	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				477	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				478	"listing", "xmp", "head", NULL,
				479	"head", "p", NULL,
				480	"title", "p", NULL,
				481	"body", "head", "style", "link", "title", "p", NULL,
				482	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				483	"pre", "listing", "xmp", "head", "li", NULL,
				484	"hr", "p", "head", NULL,
				485	"h1", "p", "head", NULL,
				486	"h2", "p", "head", NULL,
				487	"h3", "p", "head", NULL,
				488	"h4", "p", "head", NULL,
				489	"h5", "p", "head", NULL,
				490	"h6", "p", "head", NULL,
				491	"dir", "p", "head", NULL,
				492	"address", "p", "head", "ul", NULL,
				493	"pre", "p", "head", "ul", NULL,
				494	"listing", "p", "head", NULL,
				495	"xmp", "p", "head", NULL,
				496	"blockquote", "p", "head", NULL,
				497	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				498	"xmp", "head", NULL,
				499	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				500	"head", "dd", NULL,
				501	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				502	"head", "dt", NULL,
				503	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				504	"listing", "xmp", NULL,
				505	"ol", "p", "head", "ul", NULL,
				506	"menu", "p", "head", "ul", NULL,
				507	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
				508	"div", "p", "head", NULL,
				509	"noscript", "p", "head", NULL,
				510	"center", "font", "b", "i", "p", "head", NULL,
				511	"a", "a", NULL,
				512	"caption", "p", NULL,
				513	"colgroup", "caption", "colgroup", "col", "p", NULL,
				514	"col", "caption", "col", "p", NULL,
				515	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				516	"listing", "xmp", "a", NULL,
				517	"th", "th", "td", NULL,
				518	"td", "th", "td", "p", NULL,
				519	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				520	"thead", "caption", "col", "colgroup", NULL,
				521	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				522	"tbody", "p", NULL,
				523	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				524	"tfoot", "tbody", "p", NULL,
				525	"optgroup", "option", NULL,
				526	"option", "option", NULL,
				527	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				528	"pre", "listing", "xmp", "a", NULL,
				529	NULL
				530	};
				531
				532	/*
				533	* The list of HTML elements which are supposed not to have
				534	* CDATA content and where a p element will be implied
				535	*
				536	* TODO: extend that list by reading the HTML SGML DtD on
				537	* implied paragraph
				538	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	539	static const char *htmlNoContentElements[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	540	"html",
				541	"head",
				542	"body",
				543	NULL
				544	};
				545
				546	/*
				547	* The list of HTML attributes which are of content %Script;
				548	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				549	* it assumes the name starts with 'on'
				550	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	551	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	552	"onclick",
				553	"ondblclick",
				554	"onmousedown",
				555	"onmouseup",
				556	"onmouseover",
				557	"onmousemove",
				558	"onmouseout",
				559	"onkeypress",
				560	"onkeydown",
				561	"onkeyup",
				562	"onload",
				563	"onunload",
				564	"onfocus",
				565	"onblur",
				566	"onsubmit",
				567	"onrest",
				568	"onchange",
				569	"onselect"
				570	};
				571
				572
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	573	static const char** htmlStartCloseIndex[100];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	574	static int htmlStartCloseIndexinitialized = 0;
				575
				576	/************************************************************************
				577	* *
				578	* functions to handle HTML specific data *
				579	* *
				580	************************************************************************/
				581
				582	/**
				583	* htmlInitAutoClose:
				584	*
				585	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				586	* This is not reentrant. Call xmlInitParser() once before processing in
				587	* case of use in multithreaded programs.
				588	*/
				589	void
				590	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	591	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	592
				593	if (htmlStartCloseIndexinitialized) return;
				594
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	595	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				596	indx = 0;
				597	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				598	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	599	while (htmlStartClose[i] != NULL) i++;
				600	i++;
				601	}
				602	htmlStartCloseIndexinitialized = 1;
				603	}
				604
				605	/**
				606	* htmlTagLookup:
				607	* @tag: The tag name in lowercase
				608	*
				609	* Lookup the HTML tag in the ElementTable
				610	*
				611	* Returns the related htmlElemDescPtr or NULL if not found.
				612	*/
				613	htmlElemDescPtr
				614	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	615	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	616
				617	for (i = 0; i < (sizeof(html40ElementTable) /
				618	sizeof(html40ElementTable[0]));i++) {
				619	if (xmlStrEqual(tag, BAD_CAST html40ElementTable[i].name))
				620	return(&html40ElementTable[i]);
				621	}
				622	return(NULL);
				623	}
				624
				625	/**
				626	* htmlCheckAutoClose:
				627	* @newtag: The new tag name
				628	* @oldtag: The old tag name
				629	*
				630	* Checks wether the new tag is one of the registered valid tags for closing old.
				631	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				632	*
				633	* Returns 0 if no, 1 if yes.
				634	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	635	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	636	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	637	int i, indx;
				638	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	639
				640	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				641
				642	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	643	for (indx = 0; indx < 100;indx++) {
				644	closed = htmlStartCloseIndex[indx];
				645	if (closed == NULL) return(0);
				646	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	647	}
				648
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	649	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	650	i++;
				651	while (htmlStartClose[i] != NULL) {
				652	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				653	return(1);
				654	}
				655	i++;
				656	}
				657	return(0);
				658	}
				659
				660	/**
				661	* htmlAutoCloseOnClose:
				662	* @ctxt: an HTML parser context
				663	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame^]	664	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	665	*
				666	* The HTmL DtD allows an ending tag to implicitely close other tags.
				667	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	668	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	669	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				670	htmlElemDescPtr info;
				671	xmlChar *oldname;
				672	int i;
				673
				674	#ifdef DEBUG
				675	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				676	for (i = 0;i < ctxt->nameNr;i++)
				677	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				678	#endif
				679
				680	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				681	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
				682	}
				683	if (i < 0) return;
				684
				685	while (!xmlStrEqual(newtag, ctxt->name)) {
				686	info = htmlTagLookup(ctxt->name);
				687	if ((info == NULL) \|\| (info->endTag == 1)) {
				688	#ifdef DEBUG
				689	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				690	#endif
				691	} else {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame^]	692	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	693	}
				694	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				695	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				696	oldname = htmlnamePop(ctxt);
				697	if (oldname != NULL) {
				698	#ifdef DEBUG
				699	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				700	#endif
				701	xmlFree(oldname);
				702	}
				703	}
				704	}
				705
				706	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame^]	707	* htmlAutoCloseOnEnd:
				708	* @ctxt: an HTML parser context
				709	*
				710	* Close all remaining tags at the end of the stream
				711	*/
				712	static void
				713	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				714	xmlChar *oldname;
				715	int i;
				716
				717	if (ctxt->nameNr == 0)
				718	return;
				719	#ifdef DEBUG
				720	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				721	#endif
				722
				723	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				724	#ifdef DEBUG
				725	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				726	#endif
				727	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				728	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				729	oldname = htmlnamePop(ctxt);
				730	if (oldname != NULL) {
				731	#ifdef DEBUG
				732	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				733	#endif
				734	xmlFree(oldname);
				735	}
				736	}
				737	}
				738
				739	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	740	* htmlAutoClose:
				741	* @ctxt: an HTML parser context
				742	* @newtag: The new tag name or NULL
				743	*
				744	* The HTmL DtD allows a tag to implicitely close other tags.
				745	* The list is kept in htmlStartClose array. This function is
				746	* called when a new tag has been detected and generates the
				747	* appropriates closes if possible/needed.
				748	* If newtag is NULL this mean we are at the end of the resource
				749	* and we should check
				750	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	751	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	752	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				753	xmlChar *oldname;
				754	while ((newtag != NULL) && (ctxt->name != NULL) &&
				755	(htmlCheckAutoClose(newtag, ctxt->name))) {
				756	#ifdef DEBUG
				757	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				758	#endif
				759	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				760	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				761	oldname = htmlnamePop(ctxt);
				762	if (oldname != NULL) {
				763	#ifdef DEBUG
				764	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				765	#endif
				766	xmlFree(oldname);
				767	}
				768	}
				769	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame^]	770	htmlAutoCloseOnEnd(ctxt);
				771	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	772	}
				773	while ((newtag == NULL) && (ctxt->name != NULL) &&
				774	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				775	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				776	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				777	#ifdef DEBUG
				778	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				779	#endif
				780	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				781	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				782	oldname = htmlnamePop(ctxt);
				783	if (oldname != NULL) {
				784	#ifdef DEBUG
				785	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				786	#endif
				787	xmlFree(oldname);
				788	}
				789	}
				790
				791	}
				792
				793	/**
				794	* htmlAutoCloseTag:
				795	* @doc: the HTML document
				796	* @name: The tag name
				797	* @elem: the HTML element
				798	*
				799	* The HTmL DtD allows a tag to implicitely close other tags.
				800	* The list is kept in htmlStartClose array. This function checks
				801	* if the element or one of it's children would autoclose the
				802	* given tag.
				803	*
				804	* Returns 1 if autoclose, 0 otherwise
				805	*/
				806	int
				807	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				808	htmlNodePtr child;
				809
				810	if (elem == NULL) return(1);
				811	if (xmlStrEqual(name, elem->name)) return(0);
				812	if (htmlCheckAutoClose(elem->name, name)) return(1);
				813	child = elem->children;
				814	while (child != NULL) {
				815	if (htmlAutoCloseTag(doc, name, child)) return(1);
				816	child = child->next;
				817	}
				818	return(0);
				819	}
				820
				821	/**
				822	* htmlIsAutoClosed:
				823	* @doc: the HTML document
				824	* @elem: the HTML element
				825	*
				826	* The HTmL DtD allows a tag to implicitely close other tags.
				827	* The list is kept in htmlStartClose array. This function checks
				828	* if a tag is autoclosed by one of it's child
				829	*
				830	* Returns 1 if autoclosed, 0 otherwise
				831	*/
				832	int
				833	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				834	htmlNodePtr child;
				835
				836	if (elem == NULL) return(1);
				837	child = elem->children;
				838	while (child != NULL) {
				839	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				840	child = child->next;
				841	}
				842	return(0);
				843	}
				844
				845	/**
				846	* htmlCheckImplied:
				847	* @ctxt: an HTML parser context
				848	* @newtag: The new tag name
				849	*
				850	* The HTML DtD allows a tag to exists only implicitely
				851	* called when a new tag has been detected and generates the
				852	* appropriates implicit tags if missing
				853	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	854	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	855	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				856	if (!htmlOmittedDefaultValue)
				857	return;
				858	if (xmlStrEqual(newtag, BAD_CAST"html"))
				859	return;
				860	if (ctxt->nameNr <= 0) {
				861	#ifdef DEBUG
				862	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				863	#endif
				864	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				865	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				866	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				867	}
				868	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				869	return;
				870	if ((ctxt->nameNr <= 1) &&
				871	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				872	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				873	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				874	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				875	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				876	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				877	/*
				878	* dropped OBJECT ... i you put it first BODY will be
				879	* assumed !
				880	*/
				881	#ifdef DEBUG
				882	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				883	#endif
				884	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				885	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				886	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				887	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				888	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				889	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				890	int i;
				891	for (i = 0;i < ctxt->nameNr;i++) {
				892	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				893	return;
				894	}
				895	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				896	return;
				897	}
				898	}
				899
				900	#ifdef DEBUG
				901	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				902	#endif
				903	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				904	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				905	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				906	}
				907	}
				908
				909	/**
				910	* htmlCheckParagraph
				911	* @ctxt: an HTML parser context
				912	*
				913	* Check whether a p element need to be implied before inserting
				914	* characters in the current element.
				915	*
				916	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				917	* in case of error.
				918	*/
				919
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	920	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	921	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				922	const xmlChar *tag;
				923	int i;
				924
				925	if (ctxt == NULL)
				926	return(-1);
				927	tag = ctxt->name;
				928	if (tag == NULL) {
				929	htmlAutoClose(ctxt, BAD_CAST"p");
				930	htmlCheckImplied(ctxt, BAD_CAST"p");
				931	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				932	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				933	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				934	return(1);
				935	}
				936	if (!htmlOmittedDefaultValue)
				937	return(0);
				938	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				939	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				940	#ifdef DEBUG
				941	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				942	#endif
				943	htmlAutoClose(ctxt, BAD_CAST"p");
				944	htmlCheckImplied(ctxt, BAD_CAST"p");
				945	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				946	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				947	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				948	return(1);
				949	}
				950	}
				951	return(0);
				952	}
				953
				954	/**
				955	* htmlIsScriptAttribute:
				956	* @name: an attribute name
				957	*
				958	* Check if an attribute is of content type Script
				959	*
				960	* Returns 1 is the attribute is a script 0 otherwise
				961	*/
				962	int
				963	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	964	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	965
				966	if (name == NULL)
				967	return(0);
				968	/*
				969	* all script attributes start with 'on'
				970	*/
				971	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				972	return(0);
				973	for (i = 0;
				974	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				975	i++) {
				976	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				977	return(1);
				978	}
				979	return(0);
				980	}
				981
				982	/************************************************************************
				983	* *
				984	* The list of HTML predefined entities *
				985	* *
				986	************************************************************************/
				987
				988
/*
 * html40EntitiesTable:
 *
 * Table of the predefined HTML entities. Each entry is made of the
 * numeric value of the character, the entity name, and a human readable
 * description string.
 *
 * The entries MUST be kept sorted by increasing value:
 * htmlEntityValueLookup() stops scanning at the first entry whose value
 * is greater than the one searched. htmlEntityLookup() scans the whole
 * table by name and does not depend on the order.
 */
				989	htmlEntityDesc html40EntitiesTable[] = {
				990	/*
				991	* the 4 absolute ones, plus apostrophe.
				992	*/
				993	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				994	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				995	{ 39, "apos", "single quote" },
				996	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				997	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				998
				999	/*
				1000	* A bunch still in the 128-255 range
				1001	* Replacing them depend really on the charset used.
				1002	*/
				1003	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1004	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1005	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1006	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1007	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1008	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1009	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1010	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1011	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1012	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1013	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1014	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1015	{ 172, "not", "not sign, U+00AC ISOnum" },
				1016	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1017	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1018	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1019	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1020	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1021	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1022	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1023	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1024	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1025	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1026	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1027	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1028	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1029	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1030	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1031	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1032	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1033	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1034	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1035	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1036	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1037	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1038	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1039	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1040	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1041	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1042	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1043	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1044	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1045	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1046	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1047	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1048	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1049	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1050	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1051	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1052	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1053	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1054	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1055	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1056	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1057	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1058	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1059	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1060	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1061	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1062	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1063	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1064	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1065	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1066	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1067	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1068	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1069	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1070	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1071	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1072	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1073	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1074	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1075	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1076	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1077	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1078	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1079	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1080	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1081	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1082	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1083	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1084	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1085	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1086	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1087	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1088	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1089	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1090	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1091	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1092	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1093	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1094	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1095	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1096	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1097	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1098	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1099
				1100	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1101	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1102	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1103	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1104	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1105
				1106	/*
				1107	* Anything below should really be kept as entities references
				1108	*/
				1109	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1110
				1111	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1112	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1113
				1114	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1115	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1116	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1117	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1118	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1119	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1120	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1121	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1122	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1123	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1124	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1125	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1126	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1127	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1128	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1129	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1130	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1131	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1132	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1133	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1134	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1135	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1136	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1137	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1138
				1139	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1140	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1141	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1142	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1143	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1144	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1145	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1146	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1147	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1148	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1149	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1150	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1151	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1152	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1153	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1154	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1155	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1156	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1157	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1158	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1159	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1160	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1161	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1162	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1163	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1164	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1165	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1166	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1167
				1168	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1169	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1170	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1171	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1172	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1173	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1174	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1175	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1176	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1177	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1178	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1179	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1180	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1181	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1182	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1183	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1184	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1185
				1186	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1187	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1188
				1189	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1190
				1191	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1192	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1193
				1194	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1195	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1196
				1197	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1198	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1199
				1200	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1201
				1202	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1203	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1204	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1205	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1206	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1207	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1208	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1209	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1210	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1211	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1212	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1213	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1214	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1215	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1216	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1217	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1218
				1219	{ 8704, "forall","for all, U+2200 ISOtech" },
				1220	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1221	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1222	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1223	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1224	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1225	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1226	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1227	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1228	{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
				1229	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1230	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1231	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1232	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1233	{ 8734, "infin","infinity, U+221E ISOtech" },
				1234	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1235	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1236	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1237	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1238	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1239	{ 8747, "int", "integral, U+222B ISOtech" },
				1240	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1241	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1242	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1243	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1244	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1245	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1246	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1247	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1248	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1249	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1250	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1251	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1252	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1253	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1254	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1255	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1256	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1257	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1258	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1259	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1260	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1261	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1262	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1263	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1264
				1265	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1266	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1267	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1268	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1269
				1270	};
				1271
				1272	/************************************************************************
				1273	* *
				1274	* Commodity functions to handle entities *
				1275	* *
				1276	************************************************************************/
				1277
				1278	/*
				1279	* Macro used to grow the current buffer.
				1280	*/
				1281	#define growBuffer(buffer) { \
				1282	buffer##_size *= 2; \
				1283	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1284	if (buffer == NULL) { \
				1285	perror("realloc failed"); \
				1286	return(NULL); \
				1287	} \
				1288	}
				1289
				1290	/**
				1291	* htmlEntityLookup:
				1292	* @name: the entity name
				1293	*
				1294	* Lookup the given entity in EntitiesTable
				1295	*
				1296	* TODO: the linear scan is really ugly, an hash table is really needed.
				1297	*
				1298	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1299	*/
				1300	htmlEntityDescPtr
				1301	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1302	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1303
				1304	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1305	sizeof(html40EntitiesTable[0]));i++) {
				1306	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1307	#ifdef DEBUG
				1308	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1309	#endif
				1310	return(&html40EntitiesTable[i]);
				1311	}
				1312	}
				1313	return(NULL);
				1314	}
				1315
				1316	/**
				1317	* htmlEntityValueLookup:
				1318	* @value: the entity's unicode value
				1319	*
				1320	* Lookup the given entity in EntitiesTable
				1321	*
				1322	* TODO: the linear scan is really ugly, an hash table is really needed.
				1323	*
				1324	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1325	*/
				1326	htmlEntityDescPtr
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1327	htmlEntityValueLookup(unsigned int value) {
				1328	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1329	#ifdef DEBUG
				1330	int lv = 0;
				1331	#endif
				1332
				1333	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1334	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1335	if (html40EntitiesTable[i].value >= value) {
				1336	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1337	break;
				1338	#ifdef DEBUG
				1339	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1340	#endif
				1341	return(&html40EntitiesTable[i]);
				1342	}
				1343	#ifdef DEBUG
				1344	if (lv > html40EntitiesTable[i].value) {
				1345	xmlGenericError(xmlGenericErrorContext,
				1346	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1347	lv, html40EntitiesTable[i].value);
				1348	}
				1349	lv = html40EntitiesTable[i].value;
				1350	#endif
				1351	}
				1352	return(NULL);
				1353	}
				1354
				1355	/**
				1356	* UTF8ToHtml:
				1357	* @out: a pointer to an array of bytes to store the result
				1358	* @outlen: the length of @out
				1359	* @in: a pointer to an array of UTF-8 chars
				1360	* @inlen: the length of @in
				1361	*
				1362	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1363	* plus HTML entities block of chars out.
				1364	*
				1365	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1366	* The value of @inlen after return is the number of octets consumed
				1367	* as the return value is positive, else unpredictiable.
				1368	* The value of @outlen after return is the number of octets consumed.
				1369	*/
				1370	int
				1371	UTF8ToHtml(unsigned char* out, int *outlen,
				1372	const unsigned char* in, int *inlen) {
				1373	const unsigned char* processed = in;
				1374	const unsigned char* outend;
				1375	const unsigned char* outstart = out;
				1376	const unsigned char* instart = in;
				1377	const unsigned char* inend;
				1378	unsigned int c, d;
				1379	int trailing;
				1380
				1381	if (in == NULL) {
				1382	/*
				1383	* initialization nothing to do
				1384	*/
				1385	*outlen = 0;
				1386	*inlen = 0;
				1387	return(0);
				1388	}
				1389	inend = in + (*inlen);
				1390	outend = out + (*outlen);
				1391	while (in < inend) {
				1392	d = *in++;
				1393	if (d < 0x80) { c= d; trailing= 0; }
				1394	else if (d < 0xC0) {
				1395	/* trailing byte in leading position */
				1396	*outlen = out - outstart;
				1397	*inlen = processed - instart;
				1398	return(-2);
				1399	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1400	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1401	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1402	else {
				1403	/* no chance for this in Ascii */
				1404	*outlen = out - outstart;
				1405	*inlen = processed - instart;
				1406	return(-2);
				1407	}
				1408
				1409	if (inend - in < trailing) {
				1410	break;
				1411	}
				1412
				1413	for ( ; trailing; trailing--) {
				1414	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1415	break;
				1416	c <<= 6;
				1417	c \|= d & 0x3F;
				1418	}
				1419
				1420	/* assertion: c is a single UTF-4 value */
				1421	if (c < 0x80) {
				1422	if (out + 1 >= outend)
				1423	break;
				1424	*out++ = c;
				1425	} else {
				1426	int len;
				1427	htmlEntityDescPtr ent;
				1428
				1429	/*
				1430	* Try to lookup a predefined HTML entity for it
				1431	*/
				1432
				1433	ent = htmlEntityValueLookup(c);
				1434	if (ent == NULL) {
				1435	/* no chance for this in Ascii */
				1436	*outlen = out - outstart;
				1437	*inlen = processed - instart;
				1438	return(-2);
				1439	}
				1440	len = strlen(ent->name);
				1441	if (out + 2 + len >= outend)
				1442	break;
				1443	*out++ = '&';
				1444	memcpy(out, ent->name, len);
				1445	out += len;
				1446	*out++ = ';';
				1447	}
				1448	processed = in;
				1449	}
				1450	*outlen = out - outstart;
				1451	*inlen = processed - instart;
				1452	return(0);
				1453	}
				1454
				1455	/**
				1456	* htmlEncodeEntities:
				1457	* @out: a pointer to an array of bytes to store the result
				1458	* @outlen: the length of @out
				1459	* @in: a pointer to an array of UTF-8 chars
				1460	* @inlen: the length of @in
				1461	* @quoteChar: the quote character to escape (' or ") or zero.
				1462	*
				1463	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1464	* plus HTML entities block of chars out.
				1465	*
				1466	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1467	* The value of @inlen after return is the number of octets consumed
				1468	* as the return value is positive, else unpredictiable.
				1469	* The value of @outlen after return is the number of octets consumed.
				1470	*/
				1471	int
				1472	htmlEncodeEntities(unsigned char* out, int *outlen,
				1473	const unsigned char* in, int *inlen, int quoteChar) {
				1474	const unsigned char* processed = in;
				1475	const unsigned char* outend = out + (*outlen);
				1476	const unsigned char* outstart = out;
				1477	const unsigned char* instart = in;
				1478	const unsigned char* inend = in + (*inlen);
				1479	unsigned int c, d;
				1480	int trailing;
				1481
				1482	while (in < inend) {
				1483	d = *in++;
				1484	if (d < 0x80) { c= d; trailing= 0; }
				1485	else if (d < 0xC0) {
				1486	/* trailing byte in leading position */
				1487	*outlen = out - outstart;
				1488	*inlen = processed - instart;
				1489	return(-2);
				1490	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1491	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1492	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1493	else {
				1494	/* no chance for this in Ascii */
				1495	*outlen = out - outstart;
				1496	*inlen = processed - instart;
				1497	return(-2);
				1498	}
				1499
				1500	if (inend - in < trailing)
				1501	break;
				1502
				1503	while (trailing--) {
				1504	if (((d= *in++) & 0xC0) != 0x80) {
				1505	*outlen = out - outstart;
				1506	*inlen = processed - instart;
				1507	return(-2);
				1508	}
				1509	c <<= 6;
				1510	c \|= d & 0x3F;
				1511	}
				1512
				1513	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1514	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1515	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1516	if (out >= outend)
				1517	break;
				1518	*out++ = c;
				1519	} else {
				1520	htmlEntityDescPtr ent;
				1521	const char *cp;
				1522	char nbuf[16];
				1523	int len;
				1524
				1525	/*
				1526	* Try to lookup a predefined HTML entity for it
				1527	*/
				1528	ent = htmlEntityValueLookup(c);
				1529	if (ent == NULL) {
				1530	sprintf(nbuf, "#%u", c);
				1531	cp = nbuf;
				1532	}
				1533	else
				1534	cp = ent->name;
				1535	len = strlen(cp);
				1536	if (out + 2 + len > outend)
				1537	break;
				1538	*out++ = '&';
				1539	memcpy(out, cp, len);
				1540	out += len;
				1541	*out++ = ';';
				1542	}
				1543	processed = in;
				1544	}
				1545	*outlen = out - outstart;
				1546	*inlen = processed - instart;
				1547	return(0);
				1548	}
				1549
				1550	/**
				1551	* htmlDecodeEntities:
				1552	* @ctxt: the parser context
				1553	* @len: the len to decode (in bytes !), -1 for no size limit
				1554	* @end: an end marker xmlChar, 0 if none
				1555	* @end2: an end marker xmlChar, 0 if none
				1556	* @end3: an end marker xmlChar, 0 if none
				1557	*
				1558	* Subtitute the HTML entities by their value
				1559	*
				1560	* DEPRECATED !!!!
				1561	*
				1562	* Returns A newly allocated string with the substitution done. The caller
				1563	* must deallocate it !
				1564	*/
				1565	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1566	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1567	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1568	static int deprecated = 0;
				1569	if (!deprecated) {
				1570	xmlGenericError(xmlGenericErrorContext,
				1571	"htmlDecodeEntities() deprecated function reached\n");
				1572	deprecated = 1;
				1573	}
				1574	return(NULL);
				1575	#if 0
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1576	xmlChar *name = NULL;
				1577	xmlChar *buffer = NULL;
				1578	unsigned int buffer_size = 0;
				1579	unsigned int nbchars = 0;
				1580	htmlEntityDescPtr ent;
				1581	unsigned int max = (unsigned int) len;
				1582	int c,l;
				1583
				1584	if (ctxt->depth > 40) {
				1585	ctxt->errNo = XML_ERR_ENTITY_LOOP;
				1586	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1587	ctxt->sax->error(ctxt->userData,
				1588	"Detected entity reference loop\n");
				1589	ctxt->wellFormed = 0;
				1590	ctxt->disableSAX = 1;
				1591	return(NULL);
				1592	}
				1593
				1594	/*
				1595	* allocate a translation buffer.
				1596	*/
				1597	buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
				1598	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1599	if (buffer == NULL) {
				1600	perror("xmlDecodeEntities: malloc failed");
				1601	return(NULL);
				1602	}
				1603
				1604	/*
				1605	* Ok loop until we reach one of the ending char or a size limit.
				1606	*/
				1607	c = CUR_CHAR(l);
				1608	while ((nbchars < max) && (c != end) &&
				1609	(c != end2) && (c != end3)) {
				1610
				1611	if (c == 0) break;
				1612	if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
				1613	int val = htmlParseCharRef(ctxt);
				1614	COPY_BUF(0,buffer,nbchars,val);
				1615	NEXTL(l);
				1616	} else if ((c == '&') && (ctxt->token != '&')) {
				1617	ent = htmlParseEntityRef(ctxt, &name);
				1618	if (name != NULL) {
				1619	if (ent != NULL) {
				1620	int val = ent->value;
				1621	COPY_BUF(0,buffer,nbchars,val);
				1622	NEXTL(l);
				1623	} else {
				1624	const xmlChar *cur = name;
				1625
				1626	buffer[nbchars++] = '&';
				1627	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1628	growBuffer(buffer);
				1629	}
				1630	while (*cur != 0) {
				1631	buffer[nbchars++] = *cur++;
				1632	}
				1633	buffer[nbchars++] = ';';
				1634	}
				1635	}
				1636	} else {
				1637	COPY_BUF(l,buffer,nbchars,c);
				1638	NEXTL(l);
				1639	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1640	growBuffer(buffer);
				1641	}
				1642	}
				1643	c = CUR_CHAR(l);
				1644	}
				1645	buffer[nbchars++] = 0;
				1646	return(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1647	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1648	}
				1649
				1650	/************************************************************************
				1651	* *
				1652	* Commodity functions to handle streams *
				1653	* *
				1654	************************************************************************/
				1655
				1656	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1657	* htmlNewInputStream:
				1658	* @ctxt: an HTML parser context
				1659	*
				1660	* Create a new input stream structure
				1661	* Returns the new input stream or NULL
				1662	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1663	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1664	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1665	htmlParserInputPtr input;
				1666
				1667	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1668	if (input == NULL) {
				1669	ctxt->errNo = XML_ERR_NO_MEMORY;
				1670	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1671	ctxt->sax->error(ctxt->userData,
				1672	"malloc: couldn't allocate a new input stream\n");
				1673	return(NULL);
				1674	}
				1675	memset(input, 0, sizeof(htmlParserInput));
				1676	input->filename = NULL;
				1677	input->directory = NULL;
				1678	input->base = NULL;
				1679	input->cur = NULL;
				1680	input->buf = NULL;
				1681	input->line = 1;
				1682	input->col = 1;
				1683	input->buf = NULL;
				1684	input->free = NULL;
				1685	input->version = NULL;
				1686	input->consumed = 0;
				1687	input->length = 0;
				1688	return(input);
				1689	}
				1690
				1691
				1692	/************************************************************************
				1693	* *
				1694	* Commodity functions, cleanup needed ? *
				1695	* *
				1696	************************************************************************/
				1697
				1698	/**
				1699	* areBlanks:
				1700	* @ctxt: an HTML parser context
				1701	* @str: a xmlChar *
				1702	* @len: the size of @str
				1703	*
				1704	* Is this a sequence of blank chars that one can ignore ?
				1705	*
				1706	* Returns 1 if ignorable 0 otherwise.
				1707	*/
				1708
				1709	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				1710	int i;
				1711	xmlNodePtr lastChild;
				1712
				1713	for (i = 0;i < len;i++)
				1714	if (!(IS_BLANK(str[i]))) return(0);
				1715
				1716	if (CUR == 0) return(1);
				1717	if (CUR != '<') return(0);
				1718	if (ctxt->name == NULL)
				1719	return(1);
				1720	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1721	return(1);
				1722	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1723	return(1);
				1724	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1725	return(1);
				1726	if (ctxt->node == NULL) return(0);
				1727	lastChild = xmlGetLastChild(ctxt->node);
				1728	if (lastChild == NULL) {
				1729	if (ctxt->node->content != NULL) return(0);
				1730	} else if (xmlNodeIsText(lastChild)) {
				1731	return(0);
				1732	} else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
				1733	return(0);
				1734	} else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
				1735	return(0);
				1736	} else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
				1737	return(0);
				1738	}
				1739	return(1);
				1740	}
				1741
				1742	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1743	* htmlNewDocNoDtD:
				1744	* @URI: URI for the dtd, or NULL
				1745	* @ExternalID: the external ID of the DTD, or NULL
				1746	*
				1747	* Returns a new document, do not intialize the DTD if not provided
				1748	*/
				1749	htmlDocPtr
				1750	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1751	xmlDocPtr cur;
				1752
				1753	/*
				1754	* Allocate a new document and fill the fields.
				1755	*/
				1756	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1757	if (cur == NULL) {
				1758	xmlGenericError(xmlGenericErrorContext,
				1759	"xmlNewDoc : malloc failed\n");
				1760	return(NULL);
				1761	}
				1762	memset(cur, 0, sizeof(xmlDoc));
				1763
				1764	cur->type = XML_HTML_DOCUMENT_NODE;
				1765	cur->version = NULL;
				1766	cur->intSubset = NULL;
				1767	if ((ExternalID != NULL) \|\|
				1768	(URI != NULL))
				1769	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
				1770	cur->doc = cur;
				1771	cur->name = NULL;
				1772	cur->children = NULL;
				1773	cur->extSubset = NULL;
				1774	cur->oldNs = NULL;
				1775	cur->encoding = NULL;
				1776	cur->standalone = 1;
				1777	cur->compression = 0;
				1778	cur->ids = NULL;
				1779	cur->refs = NULL;
				1780	#ifndef XML_WITHOUT_CORBA
				1781	cur->_private = NULL;
				1782	#endif
				1783	return(cur);
				1784	}
				1785
				1786	/**
				1787	* htmlNewDoc:
				1788	* @URI: URI for the dtd, or NULL
				1789	* @ExternalID: the external ID of the DTD, or NULL
				1790	*
				1791	* Returns a new document
				1792	*/
				1793	htmlDocPtr
				1794	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1795	if ((URI == NULL) && (ExternalID == NULL))
				1796	return(htmlNewDocNoDtD(
				1797	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				1798	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
				1799
				1800	return(htmlNewDocNoDtD(URI, ExternalID));
				1801	}
				1802
				1803
				1804	/************************************************************************
				1805	* *
				1806	* The parser itself *
				1807	* Relates to http://www.w3.org/TR/html40 *
				1808	* *
				1809	************************************************************************/
				1810
				1811	/************************************************************************
				1812	* *
				1813	* The parser itself *
				1814	* *
				1815	************************************************************************/
				1816
				1817	/**
				1818	* htmlParseHTMLName:
				1819	* @ctxt: an HTML parser context
				1820	*
				1821	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1822	* since HTML names are not case-sensitive.
				1823	*
				1824	* Returns the Tag Name parsed or NULL
				1825	*/
				1826
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1827	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1828	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1829	xmlChar *ret = NULL;
				1830	int i = 0;
				1831	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1832
				1833	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1834	(CUR != ':')) return(NULL);
				1835
				1836	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1837	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1838	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1839	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1840	else loc[i] = CUR;
				1841	i++;
				1842
				1843	NEXT;
				1844	}
				1845
				1846	ret = xmlStrndup(loc, i);
				1847
				1848	return(ret);
				1849	}
				1850
				1851	/**
				1852	* htmlParseName:
				1853	* @ctxt: an HTML parser context
				1854	*
				1855	* parse an HTML name, this routine is case sensistive.
				1856	*
				1857	* Returns the Name parsed or NULL
				1858	*/
				1859
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1860	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1861	htmlParseName(htmlParserCtxtPtr ctxt) {
				1862	xmlChar buf[HTML_MAX_NAMELEN];
				1863	int len = 0;
				1864
				1865	GROW;
				1866	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1867	return(NULL);
				1868	}
				1869
				1870	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1871	(CUR == '.') \|\| (CUR == '-') \|\|
				1872	(CUR == '_') \|\| (CUR == ':') \|\|
				1873	(IS_COMBINING(CUR)) \|\|
				1874	(IS_EXTENDER(CUR))) {
				1875	buf[len++] = CUR;
				1876	NEXT;
				1877	if (len >= HTML_MAX_NAMELEN) {
				1878	xmlGenericError(xmlGenericErrorContext,
				1879	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1880	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1881	(CUR == '.') \|\| (CUR == '-') \|\|
				1882	(CUR == '_') \|\| (CUR == ':') \|\|
				1883	(IS_COMBINING(CUR)) \|\|
				1884	(IS_EXTENDER(CUR)))
				1885	NEXT;
				1886	break;
				1887	}
				1888	}
				1889	return(xmlStrndup(buf, len));
				1890	}
				1891
				1892	/**
				1893	* htmlParseHTMLAttribute:
				1894	* @ctxt: an HTML parser context
				1895	* @stop: a char stop value
				1896	*
				1897	* parse an HTML attribute value till the stop (quote), if
				1898	* stop is 0 then it stops at the first space
				1899	*
				1900	* Returns the attribute parsed or NULL
				1901	*/
				1902
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1903	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1904	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1905	xmlChar *buffer = NULL;
				1906	int buffer_size = 0;
				1907	xmlChar *out = NULL;
				1908	xmlChar *name = NULL;
				1909
				1910	xmlChar *cur = NULL;
				1911	htmlEntityDescPtr ent;
				1912
				1913	/*
				1914	* allocate a translation buffer.
				1915	*/
				1916	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1917	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1918	if (buffer == NULL) {
				1919	perror("htmlParseHTMLAttribute: malloc failed");
				1920	return(NULL);
				1921	}
				1922	out = buffer;
				1923
				1924	/*
				1925	* Ok loop until we reach one of the ending chars
				1926	*/
				1927	while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
				1928	if ((stop == 0) && (IS_BLANK(CUR))) break;
				1929	if (CUR == '&') {
				1930	if (NXT(1) == '#') {
				1931	unsigned int c;
				1932	int bits;
				1933
				1934	c = htmlParseCharRef(ctxt);
				1935	if (c < 0x80)
				1936	{ *out++ = c; bits= -6; }
				1937	else if (c < 0x800)
				1938	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1939	else if (c < 0x10000)
				1940	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1941	else
				1942	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1943
				1944	for ( ; bits >= 0; bits-= 6) {
				1945	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				1946	}
				1947	} else {
				1948	ent = htmlParseEntityRef(ctxt, &name);
				1949	if (name == NULL) {
				1950	*out++ = '&';
				1951	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1952	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1953
				1954	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1955	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1956	}
				1957	} else if (ent == NULL) {
				1958	*out++ = '&';
				1959	cur = name;
				1960	while (*cur != 0) {
				1961	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1962	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1963
				1964	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1965	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1966	}
				1967	out++ = cur++;
				1968	}
				1969	xmlFree(name);
				1970	} else {
				1971	unsigned int c;
				1972	int bits;
				1973
				1974	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1975	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1976
				1977	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1978	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1979	}
				1980	c = (xmlChar)ent->value;
				1981	if (c < 0x80)
				1982	{ *out++ = c; bits= -6; }
				1983	else if (c < 0x800)
				1984	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1985	else if (c < 0x10000)
				1986	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1987	else
				1988	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1989
				1990	for ( ; bits >= 0; bits-= 6) {
				1991	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				1992	}
				1993	xmlFree(name);
				1994	}
				1995	}
				1996	} else {
				1997	unsigned int c;
				1998	int bits, l;
				1999
				2000	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2001	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2002
				2003	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2004	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2005	}
				2006	c = CUR_CHAR(l);
				2007	if (c < 0x80)
				2008	{ *out++ = c; bits= -6; }
				2009	else if (c < 0x800)
				2010	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2011	else if (c < 0x10000)
				2012	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2013	else
				2014	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2015
				2016	for ( ; bits >= 0; bits-= 6) {
				2017	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2018	}
				2019	NEXT;
				2020	}
				2021	}
				2022	*out++ = 0;
				2023	return(buffer);
				2024	}
				2025
				2026	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2027	* htmlParseEntityRef:
				2028	* @ctxt: an HTML parser context
				2029	* @str: location to store the entity name
				2030	*
				2031	* parse an HTML ENTITY references
				2032	*
				2033	* [68] EntityRef ::= '&' Name ';'
				2034	*
				2035	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2036	* if non-NULL *str will have to be freed by the caller.
				2037	*/
				2038	htmlEntityDescPtr
				2039	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2040	xmlChar *name;
				2041	htmlEntityDescPtr ent = NULL;
				2042	*str = NULL;
				2043
				2044	if (CUR == '&') {
				2045	NEXT;
				2046	name = htmlParseName(ctxt);
				2047	if (name == NULL) {
				2048	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2049	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2050	ctxt->wellFormed = 0;
				2051	} else {
				2052	GROW;
				2053	if (CUR == ';') {
				2054	*str = name;
				2055
				2056	/*
				2057	* Lookup the entity in the table.
				2058	*/
				2059	ent = htmlEntityLookup(name);
				2060	if (ent != NULL) /* OK that's ugly !!! */
				2061	NEXT;
				2062	} else {
				2063	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2064	ctxt->sax->error(ctxt->userData,
				2065	"htmlParseEntityRef: expecting ';'\n");
				2066	*str = name;
				2067	}
				2068	}
				2069	}
				2070	return(ent);
				2071	}
				2072
				2073	/**
				2074	* htmlParseAttValue:
				2075	* @ctxt: an HTML parser context
				2076	*
				2077	* parse a value for an attribute
				2078	* Note: the parser won't do substitution of entities here, this
				2079	* will be handled later in xmlStringGetNodeList, unless it was
				2080	* asked for ctxt->replaceEntities != 0
				2081	*
				2082	* Returns the AttValue parsed or NULL.
				2083	*/
				2084
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2085	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2086	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2087	xmlChar *ret = NULL;
				2088
				2089	if (CUR == '"') {
				2090	NEXT;
				2091	ret = htmlParseHTMLAttribute(ctxt, '"');
				2092	if (CUR != '"') {
				2093	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2094	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2095	ctxt->wellFormed = 0;
				2096	} else
				2097	NEXT;
				2098	} else if (CUR == '\'') {
				2099	NEXT;
				2100	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2101	if (CUR != '\'') {
				2102	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2103	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2104	ctxt->wellFormed = 0;
				2105	} else
				2106	NEXT;
				2107	} else {
				2108	/*
				2109	* That's an HTMLism, the attribute value may not be quoted
				2110	*/
				2111	ret = htmlParseHTMLAttribute(ctxt, 0);
				2112	if (ret == NULL) {
				2113	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2114	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2115	ctxt->wellFormed = 0;
				2116	}
				2117	}
				2118	return(ret);
				2119	}
				2120
				2121	/**
				2122	* htmlParseSystemLiteral:
				2123	* @ctxt: an HTML parser context
				2124	*
				2125	* parse an HTML Literal
				2126	*
				2127	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2128	*
				2129	* Returns the SystemLiteral parsed or NULL
				2130	*/
				2131
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2132	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2133	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2134	const xmlChar *q;
				2135	xmlChar *ret = NULL;
				2136
				2137	if (CUR == '"') {
				2138	NEXT;
				2139	q = CUR_PTR;
				2140	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2141	NEXT;
				2142	if (!IS_CHAR(CUR)) {
				2143	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2144	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2145	ctxt->wellFormed = 0;
				2146	} else {
				2147	ret = xmlStrndup(q, CUR_PTR - q);
				2148	NEXT;
				2149	}
				2150	} else if (CUR == '\'') {
				2151	NEXT;
				2152	q = CUR_PTR;
				2153	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2154	NEXT;
				2155	if (!IS_CHAR(CUR)) {
				2156	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2157	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2158	ctxt->wellFormed = 0;
				2159	} else {
				2160	ret = xmlStrndup(q, CUR_PTR - q);
				2161	NEXT;
				2162	}
				2163	} else {
				2164	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2165	ctxt->sax->error(ctxt->userData,
				2166	"SystemLiteral \" or ' expected\n");
				2167	ctxt->wellFormed = 0;
				2168	}
				2169
				2170	return(ret);
				2171	}
				2172
				2173	/**
				2174	* htmlParsePubidLiteral:
				2175	* @ctxt: an HTML parser context
				2176	*
				2177	* parse an HTML public literal
				2178	*
				2179	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2180	*
				2181	* Returns the PubidLiteral parsed or NULL.
				2182	*/
				2183
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2184	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2185	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2186	const xmlChar *q;
				2187	xmlChar *ret = NULL;
				2188	/*
				2189	* Name ::= (Letter \| '_') (NameChar)*
				2190	*/
				2191	if (CUR == '"') {
				2192	NEXT;
				2193	q = CUR_PTR;
				2194	while (IS_PUBIDCHAR(CUR)) NEXT;
				2195	if (CUR != '"') {
				2196	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2197	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2198	ctxt->wellFormed = 0;
				2199	} else {
				2200	ret = xmlStrndup(q, CUR_PTR - q);
				2201	NEXT;
				2202	}
				2203	} else if (CUR == '\'') {
				2204	NEXT;
				2205	q = CUR_PTR;
				2206	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2207	NEXT;
				2208	if (!IS_LETTER(CUR)) {
				2209	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2210	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2211	ctxt->wellFormed = 0;
				2212	} else {
				2213	ret = xmlStrndup(q, CUR_PTR - q);
				2214	NEXT;
				2215	}
				2216	} else {
				2217	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2218	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2219	ctxt->wellFormed = 0;
				2220	}
				2221
				2222	return(ret);
				2223	}
				2224
				2225	/**
				2226	* htmlParseScript:
				2227	* @ctxt: an HTML parser context
				2228	*
				2229	* parse the content of an HTML SCRIPT or STYLE element
				2230	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2231	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2232	* http://www.w3.org/TR/html4/types.html#type-script
				2233	* http://www.w3.org/TR/html4/types.html#h-6.15
				2234	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2235	*
				2236	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2237	* element and the value of intrinsic event attributes. User agents must
				2238	* not evaluate script data as HTML markup but instead must pass it on as
				2239	* data to a script engine.
				2240	* NOTES:
				2241	* - The content is passed like CDATA
				2242	* - the attributes for style and scripting "onXXX" are also described
				2243	* as CDATA but SGML allows entities references in attributes so their
				2244	* processing is identical as other attributes
				2245	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2246	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2247	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2248	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2249	int nbchar = 0;
				2250	xmlChar cur;
				2251
				2252	SHRINK;
				2253	cur = CUR;
				2254	while (IS_CHAR(cur)) {
				2255	if ((cur == '<') && (NXT(1) == '/')) {
				2256	/*
				2257	* One should break here, the specification is clear:
				2258	* Authors should therefore escape "</" within the content.
				2259	* Escape mechanisms are specific to each scripting or
				2260	* style sheet language.
				2261	*/
				2262	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2263	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2264	break; /* while */
				2265	}
				2266	buf[nbchar++] = cur;
				2267	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2268	if (ctxt->sax->cdataBlock!= NULL) {
				2269	/*
				2270	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2271	*/
				2272	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2273	}
				2274	nbchar = 0;
				2275	}
				2276	NEXT;
				2277	cur = CUR;
				2278	}
				2279	if (!(IS_CHAR(cur))) {
				2280	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2281	ctxt->sax->error(ctxt->userData,
				2282	"Invalid char in CDATA 0x%X\n", cur);
				2283	ctxt->wellFormed = 0;
				2284	NEXT;
				2285	}
				2286
				2287	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2288	if (ctxt->sax->cdataBlock!= NULL) {
				2289	/*
				2290	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2291	*/
				2292	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2293	}
				2294	}
				2295	}
				2296
				2297
				2298	/**
				2299	* htmlParseCharData:
				2300	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2301	*
				2302	* parse a CharData section.
				2303	* if we are within a CDATA section ']]>' marks an end of section.
				2304	*
				2305	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2306	*/
				2307
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2308	static void
				2309	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2310	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2311	int nbchar = 0;
				2312	int cur, l;
				2313
				2314	SHRINK;
				2315	cur = CUR_CHAR(l);
				2316	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2317	((cur != '&') \|\| (ctxt->token == '&')) &&
				2318	(IS_CHAR(cur))) {
				2319	COPY_BUF(l,buf,nbchar,cur);
				2320	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2321	/*
				2322	* Ok the segment is to be consumed as chars.
				2323	*/
				2324	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2325	if (areBlanks(ctxt, buf, nbchar)) {
				2326	if (ctxt->sax->ignorableWhitespace != NULL)
				2327	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2328	buf, nbchar);
				2329	} else {
				2330	htmlCheckParagraph(ctxt);
				2331	if (ctxt->sax->characters != NULL)
				2332	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2333	}
				2334	}
				2335	nbchar = 0;
				2336	}
				2337	NEXTL(l);
				2338	cur = CUR_CHAR(l);
				2339	}
				2340	if (nbchar != 0) {
				2341	/*
				2342	* Ok the segment is to be consumed as chars.
				2343	*/
				2344	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2345	if (areBlanks(ctxt, buf, nbchar)) {
				2346	if (ctxt->sax->ignorableWhitespace != NULL)
				2347	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2348	} else {
				2349	htmlCheckParagraph(ctxt);
				2350	if (ctxt->sax->characters != NULL)
				2351	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2352	}
				2353	}
				2354	}
				2355	}
				2356
				2357	/**
				2358	* htmlParseExternalID:
				2359	* @ctxt: an HTML parser context
				2360	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2361	*
				2362	* Parse an External ID or a Public ID
				2363	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2364	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2365	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2366	*
				2367	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2368	*
				2369	* Returns the function returns SystemLiteral and in the second
				2370	* case publicID receives PubidLiteral, is strict is off
				2371	* it is possible to return NULL and have publicID set.
				2372	*/
				2373
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2374	static xmlChar *
				2375	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2376	xmlChar *URI = NULL;
				2377
				2378	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2379	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2380	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2381	SKIP(6);
				2382	if (!IS_BLANK(CUR)) {
				2383	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2384	ctxt->sax->error(ctxt->userData,
				2385	"Space required after 'SYSTEM'\n");
				2386	ctxt->wellFormed = 0;
				2387	}
				2388	SKIP_BLANKS;
				2389	URI = htmlParseSystemLiteral(ctxt);
				2390	if (URI == NULL) {
				2391	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2392	ctxt->sax->error(ctxt->userData,
				2393	"htmlParseExternalID: SYSTEM, no URI\n");
				2394	ctxt->wellFormed = 0;
				2395	}
				2396	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2397	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2398	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2399	SKIP(6);
				2400	if (!IS_BLANK(CUR)) {
				2401	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2402	ctxt->sax->error(ctxt->userData,
				2403	"Space required after 'PUBLIC'\n");
				2404	ctxt->wellFormed = 0;
				2405	}
				2406	SKIP_BLANKS;
				2407	*publicID = htmlParsePubidLiteral(ctxt);
				2408	if (*publicID == NULL) {
				2409	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2410	ctxt->sax->error(ctxt->userData,
				2411	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2412	ctxt->wellFormed = 0;
				2413	}
				2414	SKIP_BLANKS;
				2415	if ((CUR == '"') \|\| (CUR == '\'')) {
				2416	URI = htmlParseSystemLiteral(ctxt);
				2417	}
				2418	}
				2419	return(URI);
				2420	}
				2421
				2422	/**
				2423	* htmlParseComment:
				2424	* @ctxt: an HTML parser context
				2425	*
				2426	* Parse an XML (SGML) comment <!-- .... -->
				2427	*
				2428	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2429	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2430	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2431	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2432	xmlChar *buf = NULL;
				2433	int len;
				2434	int size = HTML_PARSER_BUFFER_SIZE;
				2435	int q, ql;
				2436	int r, rl;
				2437	int cur, l;
				2438	xmlParserInputState state;
				2439
				2440	/*
				2441	* Check that there is a comment right here.
				2442	*/
				2443	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2444	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2445
				2446	state = ctxt->instate;
				2447	ctxt->instate = XML_PARSER_COMMENT;
				2448	SHRINK;
				2449	SKIP(4);
				2450	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2451	if (buf == NULL) {
				2452	xmlGenericError(xmlGenericErrorContext,
				2453	"malloc of %d byte failed\n", size);
				2454	ctxt->instate = state;
				2455	return;
				2456	}
				2457	q = CUR_CHAR(ql);
				2458	NEXTL(ql);
				2459	r = CUR_CHAR(rl);
				2460	NEXTL(rl);
				2461	cur = CUR_CHAR(l);
				2462	len = 0;
				2463	while (IS_CHAR(cur) &&
				2464	((cur != '>') \|\|
				2465	(r != '-') \|\| (q != '-'))) {
				2466	if (len + 5 >= size) {
				2467	size *= 2;
				2468	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2469	if (buf == NULL) {
				2470	xmlGenericError(xmlGenericErrorContext,
				2471	"realloc of %d byte failed\n", size);
				2472	ctxt->instate = state;
				2473	return;
				2474	}
				2475	}
				2476	COPY_BUF(ql,buf,len,q);
				2477	q = r;
				2478	ql = rl;
				2479	r = cur;
				2480	rl = l;
				2481	NEXTL(l);
				2482	cur = CUR_CHAR(l);
				2483	if (cur == 0) {
				2484	SHRINK;
				2485	GROW;
				2486	cur = CUR_CHAR(l);
				2487	}
				2488	}
				2489	buf[len] = 0;
				2490	if (!IS_CHAR(cur)) {
				2491	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2492	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2493	ctxt->sax->error(ctxt->userData,
				2494	"Comment not terminated \n<!--%.50s\n", buf);
				2495	ctxt->wellFormed = 0;
				2496	xmlFree(buf);
				2497	} else {
				2498	NEXT;
				2499	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2500	(!ctxt->disableSAX))
				2501	ctxt->sax->comment(ctxt->userData, buf);
				2502	xmlFree(buf);
				2503	}
				2504	ctxt->instate = state;
				2505	}
				2506
				2507	/**
				2508	* htmlParseCharRef:
				2509	* @ctxt: an HTML parser context
				2510	*
				2511	* parse Reference declarations
				2512	*
				2513	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2514	* '&#x' [0-9a-fA-F]+ ';'
				2515	*
				2516	* Returns the value parsed (as an int)
				2517	*/
				2518	int
				2519	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2520	int val = 0;
				2521
				2522	if ((CUR == '&') && (NXT(1) == '#') &&
				2523	(NXT(2) == 'x')) {
				2524	SKIP(3);
				2525	while (CUR != ';') {
				2526	if ((CUR >= '0') && (CUR <= '9'))
				2527	val = val * 16 + (CUR - '0');
				2528	else if ((CUR >= 'a') && (CUR <= 'f'))
				2529	val = val * 16 + (CUR - 'a') + 10;
				2530	else if ((CUR >= 'A') && (CUR <= 'F'))
				2531	val = val * 16 + (CUR - 'A') + 10;
				2532	else {
				2533	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2534	ctxt->sax->error(ctxt->userData,
				2535	"htmlParseCharRef: invalid hexadecimal value\n");
				2536	ctxt->wellFormed = 0;
				2537	return(0);
				2538	}
				2539	NEXT;
				2540	}
				2541	if (CUR == ';')
				2542	NEXT;
				2543	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2544	SKIP(2);
				2545	while (CUR != ';') {
				2546	if ((CUR >= '0') && (CUR <= '9'))
				2547	val = val * 10 + (CUR - '0');
				2548	else {
				2549	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2550	ctxt->sax->error(ctxt->userData,
				2551	"htmlParseCharRef: invalid decimal value\n");
				2552	ctxt->wellFormed = 0;
				2553	return(0);
				2554	}
				2555	NEXT;
				2556	}
				2557	if (CUR == ';')
				2558	NEXT;
				2559	} else {
				2560	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2561	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2562	ctxt->wellFormed = 0;
				2563	}
				2564	/*
				2565	* Check the value IS_CHAR ...
				2566	*/
				2567	if (IS_CHAR(val)) {
				2568	return(val);
				2569	} else {
				2570	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2571	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2572	val);
				2573	ctxt->wellFormed = 0;
				2574	}
				2575	return(0);
				2576	}
				2577
				2578
				2579	/**
				2580	* htmlParseDocTypeDecl :
				2581	* @ctxt: an HTML parser context
				2582	*
				2583	* parse a DOCTYPE declaration
				2584	*
				2585	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2586	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2587	*/
				2588
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2589	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2590	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2591	xmlChar *name;
				2592	xmlChar *ExternalID = NULL;
				2593	xmlChar *URI = NULL;
				2594
				2595	/*
				2596	* We know that '<!DOCTYPE' has been detected.
				2597	*/
				2598	SKIP(9);
				2599
				2600	SKIP_BLANKS;
				2601
				2602	/*
				2603	* Parse the DOCTYPE name.
				2604	*/
				2605	name = htmlParseName(ctxt);
				2606	if (name == NULL) {
				2607	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2608	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2609	ctxt->wellFormed = 0;
				2610	}
				2611	/*
				2612	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2613	*/
				2614
				2615	SKIP_BLANKS;
				2616
				2617	/*
				2618	* Check for SystemID and ExternalID
				2619	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2620	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2621	SKIP_BLANKS;
				2622
				2623	/*
				2624	* We should be at the end of the DOCTYPE declaration.
				2625	*/
				2626	if (CUR != '>') {
				2627	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2628	ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
				2629	ctxt->wellFormed = 0;
				2630	/* We shouldn't try to resynchronize ... */
				2631	}
				2632	NEXT;
				2633
				2634	/*
				2635	* Create or update the document accordingly to the DOCTYPE
				2636	*/
				2637	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2638	(!ctxt->disableSAX))
				2639	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2640
				2641	/*
				2642	* Cleanup, since we don't use all those identifiers
				2643	*/
				2644	if (URI != NULL) xmlFree(URI);
				2645	if (ExternalID != NULL) xmlFree(ExternalID);
				2646	if (name != NULL) xmlFree(name);
				2647	}
				2648
				2649	/**
				2650	* htmlParseAttribute:
				2651	* @ctxt: an HTML parser context
				2652	* @value: a xmlChar ** used to store the value of the attribute
				2653	*
				2654	* parse an attribute
				2655	*
				2656	* [41] Attribute ::= Name Eq AttValue
				2657	*
				2658	* [25] Eq ::= S? '=' S?
				2659	*
				2660	* With namespace:
				2661	*
				2662	* [NS 11] Attribute ::= QName Eq AttValue
				2663	*
				2664	* Also the case QName == xmlns:??? is handled independently as a namespace
				2665	* definition.
				2666	*
				2667	* Returns the attribute name, and the value in *value.
				2668	*/
				2669
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2670	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2671	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2672	xmlChar name, val = NULL;
				2673
				2674	*value = NULL;
				2675	name = htmlParseHTMLName(ctxt);
				2676	if (name == NULL) {
				2677	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2678	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2679	ctxt->wellFormed = 0;
				2680	return(NULL);
				2681	}
				2682
				2683	/*
				2684	* read the value
				2685	*/
				2686	SKIP_BLANKS;
				2687	if (CUR == '=') {
				2688	NEXT;
				2689	SKIP_BLANKS;
				2690	val = htmlParseAttValue(ctxt);
				2691	/******
				2692	} else {
				2693	* TODO : some attribute must have values, some may not
				2694	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2695	ctxt->sax->warning(ctxt->userData,
				2696	"No value for attribute %s\n", name); */
				2697	}
				2698
				2699	*value = val;
				2700	return(name);
				2701	}
				2702
				2703	/**
				2704	* htmlCheckEncoding:
				2705	* @ctxt: an HTML parser context
				2706	* @attvalue: the attribute value
				2707	*
				2708	* Checks an http-equiv attribute from a Meta tag to detect
				2709	* the encoding
				2710	* If a new encoding is detected the parser is switched to decode
				2711	* it and pass UTF8
				2712	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2713	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2714	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2715	const xmlChar *encoding;
				2716
				2717	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2718	return;
				2719
				2720	/* do not change encoding */
				2721	if (ctxt->input->encoding != NULL)
				2722	return;
				2723
				2724	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2725	if (encoding != NULL) {
				2726	encoding += 8;
				2727	} else {
				2728	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2729	if (encoding != NULL)
				2730	encoding += 9;
				2731	}
				2732	if (encoding != NULL) {
				2733	xmlCharEncoding enc;
				2734	xmlCharEncodingHandlerPtr handler;
				2735
				2736	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2737
				2738	if (ctxt->input->encoding != NULL)
				2739	xmlFree((xmlChar *) ctxt->input->encoding);
				2740	ctxt->input->encoding = xmlStrdup(encoding);
				2741
				2742	enc = xmlParseCharEncoding((const char *) encoding);
				2743	/*
				2744	* registered set of known encodings
				2745	*/
				2746	if (enc != XML_CHAR_ENCODING_ERROR) {
				2747	xmlSwitchEncoding(ctxt, enc);
				2748	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2749	} else {
				2750	/*
				2751	* fallback for unknown encodings
				2752	*/
				2753	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2754	if (handler != NULL) {
				2755	xmlSwitchToEncoding(ctxt, handler);
				2756	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2757	} else {
				2758	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2759	}
				2760	}
				2761
				2762	if ((ctxt->input->buf != NULL) &&
				2763	(ctxt->input->buf->encoder != NULL) &&
				2764	(ctxt->input->buf->raw != NULL) &&
				2765	(ctxt->input->buf->buffer != NULL)) {
				2766	int nbchars;
				2767	int processed;
				2768
				2769	/*
				2770	* convert as much as possible to the parser reading buffer.
				2771	*/
				2772	processed = ctxt->input->cur - ctxt->input->base;
				2773	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2774	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2775	ctxt->input->buf->buffer,
				2776	ctxt->input->buf->raw);
				2777	if (nbchars < 0) {
				2778	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2779	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2780	ctxt->sax->error(ctxt->userData,
				2781	"htmlCheckEncoding: encoder error\n");
				2782	}
				2783	ctxt->input->base =
				2784	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2785	}
				2786	}
				2787	}
				2788
				2789	/**
				2790	* htmlCheckMeta:
				2791	* @ctxt: an HTML parser context
				2792	* @atts: the attributes values
				2793	*
				2794	* Checks an attributes from a Meta tag
				2795	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2796	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2797	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2798	int i;
				2799	const xmlChar att, value;
				2800	int http = 0;
				2801	const xmlChar *content = NULL;
				2802
				2803	if ((ctxt == NULL) \|\| (atts == NULL))
				2804	return;
				2805
				2806	i = 0;
				2807	att = atts[i++];
				2808	while (att != NULL) {
				2809	value = atts[i++];
				2810	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2811	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2812	http = 1;
				2813	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2814	content = value;
				2815	att = atts[i++];
				2816	}
				2817	if ((http) && (content != NULL))
				2818	htmlCheckEncoding(ctxt, content);
				2819
				2820	}
				2821
				2822	/**
				2823	* htmlParseStartTag:
				2824	* @ctxt: an HTML parser context
				2825	*
				2826	* parse a start of tag either for rule element or
				2827	* EmptyElement. In both case we don't parse the tag closing chars.
				2828	*
				2829	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2830	*
				2831	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2832	*
				2833	* With namespace:
				2834	*
				2835	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2836	*
				2837	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2838	*
				2839	*/
				2840
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2841	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2842	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2843	xmlChar *name;
				2844	xmlChar *attname;
				2845	xmlChar *attvalue;
				2846	const xmlChar **atts = NULL;
				2847	int nbatts = 0;
				2848	int maxatts = 0;
				2849	int meta = 0;
				2850	int i;
				2851
				2852	if (CUR != '<') return;
				2853	NEXT;
				2854
				2855	GROW;
				2856	name = htmlParseHTMLName(ctxt);
				2857	if (name == NULL) {
				2858	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2859	ctxt->sax->error(ctxt->userData,
				2860	"htmlParseStartTag: invalid element name\n");
				2861	ctxt->wellFormed = 0;
				2862	/* Dump the bogus tag like browsers do */
				2863	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2864	NEXT;
				2865	return;
				2866	}
				2867	if (xmlStrEqual(name, BAD_CAST"meta"))
				2868	meta = 1;
				2869
				2870	/*
				2871	* Check for auto-closure of HTML elements.
				2872	*/
				2873	htmlAutoClose(ctxt, name);
				2874
				2875	/*
				2876	* Check for implied HTML elements.
				2877	*/
				2878	htmlCheckImplied(ctxt, name);
				2879
				2880	/*
				2881	* Avoid html at any level > 0, head at any level != 1
				2882	* or any attempt to recurse body
				2883	*/
				2884	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2885	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2886	ctxt->sax->error(ctxt->userData,
				2887	"htmlParseStartTag: misplaced <html> tag\n");
				2888	ctxt->wellFormed = 0;
				2889	xmlFree(name);
				2890	return;
				2891	}
				2892	if ((ctxt->nameNr != 1) &&
				2893	(xmlStrEqual(name, BAD_CAST"head"))) {
				2894	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2895	ctxt->sax->error(ctxt->userData,
				2896	"htmlParseStartTag: misplaced <head> tag\n");
				2897	ctxt->wellFormed = 0;
				2898	xmlFree(name);
				2899	return;
				2900	}
				2901	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2902	int indx;
				2903	for (indx = 0;indx < ctxt->nameNr;indx++) {
				2904	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2905	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2906	ctxt->sax->error(ctxt->userData,
				2907	"htmlParseStartTag: misplaced <body> tag\n");
				2908	ctxt->wellFormed = 0;
				2909	xmlFree(name);
				2910	return;
				2911	}
				2912	}
				2913	}
				2914
				2915	/*
				2916	* Now parse the attributes, it ends up with the ending
				2917	*
				2918	* (S Attribute)* S?
				2919	*/
				2920	SKIP_BLANKS;
				2921	while ((IS_CHAR(CUR)) &&
				2922	(CUR != '>') &&
				2923	((CUR != '/') \|\| (NXT(1) != '>'))) {
				2924	long cons = ctxt->nbChars;
				2925
				2926	GROW;
				2927	attname = htmlParseAttribute(ctxt, &attvalue);
				2928	if (attname != NULL) {
				2929
				2930	/*
				2931	* Well formedness requires at most one declaration of an attribute
				2932	*/
				2933	for (i = 0; i < nbatts;i += 2) {
				2934	if (xmlStrEqual(atts[i], attname)) {
				2935	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2936	ctxt->sax->error(ctxt->userData,
				2937	"Attribute %s redefined\n",
				2938	attname);
				2939	ctxt->wellFormed = 0;
				2940	xmlFree(attname);
				2941	if (attvalue != NULL)
				2942	xmlFree(attvalue);
				2943	goto failed;
				2944	}
				2945	}
				2946
				2947	/*
				2948	* Add the pair to atts
				2949	*/
				2950	if (atts == NULL) {
				2951	maxatts = 10;
				2952	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				2953	if (atts == NULL) {
				2954	xmlGenericError(xmlGenericErrorContext,
				2955	"malloc of %ld byte failed\n",
				2956	maxatts * (long)sizeof(xmlChar *));
				2957	if (name != NULL) xmlFree(name);
				2958	return;
				2959	}
				2960	} else if (nbatts + 4 > maxatts) {
				2961	maxatts *= 2;
				2962	atts = (const xmlChar *) xmlRealloc((void ) atts,
				2963	maxatts * sizeof(xmlChar *));
				2964	if (atts == NULL) {
				2965	xmlGenericError(xmlGenericErrorContext,
				2966	"realloc of %ld byte failed\n",
				2967	maxatts * (long)sizeof(xmlChar *));
				2968	if (name != NULL) xmlFree(name);
				2969	return;
				2970	}
				2971	}
				2972	atts[nbatts++] = attname;
				2973	atts[nbatts++] = attvalue;
				2974	atts[nbatts] = NULL;
				2975	atts[nbatts + 1] = NULL;
				2976	}
				2977	else {
				2978	/* Dump the bogus attribute string up to the next blank or
				2979	* the end of the tag. */
				2980	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				2981	&& ((CUR != '/') \|\| (NXT(1) != '>')))
				2982	NEXT;
				2983	}
				2984
				2985	failed:
				2986	SKIP_BLANKS;
				2987	if (cons == ctxt->nbChars) {
				2988	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2989	ctxt->sax->error(ctxt->userData,
				2990	"htmlParseStartTag: problem parsing attributes\n");
				2991	ctxt->wellFormed = 0;
				2992	break;
				2993	}
				2994	}
				2995
				2996	/*
				2997	* Handle specific association to the META tag
				2998	*/
				2999	if (meta)
				3000	htmlCheckMeta(ctxt, atts);
				3001
				3002	/*
				3003	* SAX: Start of Element !
				3004	*/
				3005	htmlnamePush(ctxt, xmlStrdup(name));
				3006	#ifdef DEBUG
				3007	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3008	#endif
				3009	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3010	ctxt->sax->startElement(ctxt->userData, name, atts);
				3011
				3012	if (atts != NULL) {
				3013	for (i = 0;i < nbatts;i++) {
				3014	if (atts[i] != NULL)
				3015	xmlFree((xmlChar *) atts[i]);
				3016	}
				3017	xmlFree((void *) atts);
				3018	}
				3019	if (name != NULL) xmlFree(name);
				3020	}
				3021
				3022	/**
				3023	* htmlParseEndTag:
				3024	* @ctxt: an HTML parser context
				3025	*
				3026	* parse an end of tag
				3027	*
				3028	* [42] ETag ::= '</' Name S? '>'
				3029	*
				3030	* With namespace
				3031	*
				3032	* [NS 9] ETag ::= '</' QName S? '>'
				3033	*/
				3034
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3035	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3036	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3037	xmlChar *name;
				3038	xmlChar *oldname;
				3039	int i;
				3040
				3041	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3042	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3043	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3044	ctxt->wellFormed = 0;
				3045	return;
				3046	}
				3047	SKIP(2);
				3048
				3049	name = htmlParseHTMLName(ctxt);
				3050	if (name == NULL) return;
				3051
				3052	/*
				3053	* We should definitely be at the ending "S? '>'" part
				3054	*/
				3055	SKIP_BLANKS;
				3056	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3057	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3058	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3059	ctxt->wellFormed = 0;
				3060	} else
				3061	NEXT;
				3062
				3063	/*
				3064	* If the name read is not one of the element in the parsing stack
				3065	* then return, it's just an error.
				3066	*/
				3067	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3068	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3069	}
				3070	if (i < 0) {
				3071	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3072	ctxt->sax->error(ctxt->userData,
				3073	"Unexpected end tag : %s\n", name);
				3074	xmlFree(name);
				3075	ctxt->wellFormed = 0;
				3076	return;
				3077	}
				3078
				3079
				3080	/*
				3081	* Check for auto-closure of HTML elements.
				3082	*/
				3083
				3084	htmlAutoCloseOnClose(ctxt, name);
				3085
				3086	/*
				3087	* Well formedness constraints, opening and closing must match.
				3088	* With the exception that the autoclose may have popped stuff out
				3089	* of the stack.
				3090	*/
				3091	if (!xmlStrEqual(name, ctxt->name)) {
				3092	#ifdef DEBUG
				3093	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3094	#endif
				3095	if ((ctxt->name != NULL) &&
				3096	(!xmlStrEqual(ctxt->name, name))) {
				3097	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3098	ctxt->sax->error(ctxt->userData,
				3099	"Opening and ending tag mismatch: %s and %s\n",
				3100	name, ctxt->name);
				3101	ctxt->wellFormed = 0;
				3102	}
				3103	}
				3104
				3105	/*
				3106	* SAX: End of Tag
				3107	*/
				3108	oldname = ctxt->name;
				3109	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3110	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3111	ctxt->sax->endElement(ctxt->userData, name);
				3112	oldname = htmlnamePop(ctxt);
				3113	if (oldname != NULL) {
				3114	#ifdef DEBUG
				3115	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3116	#endif
				3117	xmlFree(oldname);
				3118	#ifdef DEBUG
				3119	} else {
				3120	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3121	#endif
				3122	}
				3123	}
				3124
				3125	if (name != NULL)
				3126	xmlFree(name);
				3127
				3128	return;
				3129	}
				3130
				3131
				3132	/**
				3133	* htmlParseReference:
				3134	* @ctxt: an HTML parser context
				3135	*
				3136	* parse and handle entity references in content,
				3137	* this will end-up in a call to character() since this is either a
				3138	* CharRef, or a predefined entity.
				3139	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3140	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3141	htmlParseReference(htmlParserCtxtPtr ctxt) {
				3142	htmlEntityDescPtr ent;
				3143	xmlChar out[6];
				3144	xmlChar *name;
				3145	if (CUR != '&') return;
				3146
				3147	if (NXT(1) == '#') {
				3148	unsigned int c;
				3149	int bits, i = 0;
				3150
				3151	c = htmlParseCharRef(ctxt);
				3152	if (c == 0)
				3153	return;
				3154
				3155	if (c < 0x80) { out[i++]= c; bits= -6; }
				3156	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3157	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3158	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3159
				3160	for ( ; bits >= 0; bits-= 6) {
				3161	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3162	}
				3163	out[i] = 0;
				3164
				3165	htmlCheckParagraph(ctxt);
				3166	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3167	ctxt->sax->characters(ctxt->userData, out, i);
				3168	} else {
				3169	ent = htmlParseEntityRef(ctxt, &name);
				3170	if (name == NULL) {
				3171	htmlCheckParagraph(ctxt);
				3172	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3173	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3174	return;
				3175	}
				3176	if ((ent == NULL) \|\| (ent->value <= 0)) {
				3177	htmlCheckParagraph(ctxt);
				3178	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3179	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3180	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3181	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3182	}
				3183	} else {
				3184	unsigned int c;
				3185	int bits, i = 0;
				3186
				3187	c = ent->value;
				3188	if (c < 0x80)
				3189	{ out[i++]= c; bits= -6; }
				3190	else if (c < 0x800)
				3191	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3192	else if (c < 0x10000)
				3193	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3194	else
				3195	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3196
				3197	for ( ; bits >= 0; bits-= 6) {
				3198	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3199	}
				3200	out[i] = 0;
				3201
				3202	htmlCheckParagraph(ctxt);
				3203	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3204	ctxt->sax->characters(ctxt->userData, out, i);
				3205	}
				3206	xmlFree(name);
				3207	}
				3208	}
				3209
				3210	/**
				3211	* htmlParseContent:
				3212	* @ctxt: an HTML parser context
				3213	* @name: the node name
				3214	*
				3215	* Parse a content: comment, sub-element, reference or text.
				3216	*
				3217	*/
				3218
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3219	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3220	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3221	xmlChar *currentNode;
				3222	int depth;
				3223
				3224	currentNode = xmlStrdup(ctxt->name);
				3225	depth = ctxt->nameNr;
				3226	while (1) {
				3227	long cons = ctxt->nbChars;
				3228
				3229	GROW;
				3230	/*
				3231	* Our tag or one of it's parent or children is ending.
				3232	*/
				3233	if ((CUR == '<') && (NXT(1) == '/')) {
				3234	htmlParseEndTag(ctxt);
				3235	if (currentNode != NULL) xmlFree(currentNode);
				3236	return;
				3237	}
				3238
				3239	/*
				3240	* Has this node been popped out during parsing of
				3241	* the next element
				3242	*/
				3243	if ((!xmlStrEqual(currentNode, ctxt->name)) &&
				3244	(depth >= ctxt->nameNr)) {
				3245	if (currentNode != NULL) xmlFree(currentNode);
				3246	return;
				3247	}
				3248
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3249	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3250	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3251	/*
				3252	* Handle SCRIPT/STYLE separately
				3253	*/
				3254	htmlParseScript(ctxt);
				3255	} else {
				3256	/*
				3257	* Sometimes DOCTYPE arrives in the middle of the document
				3258	*/
				3259	if ((CUR == '<') && (NXT(1) == '!') &&
				3260	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3261	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3262	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3263	(UPP(8) == 'E')) {
				3264	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3265	ctxt->sax->error(ctxt->userData,
				3266	"Misplaced DOCTYPE declaration\n");
				3267	ctxt->wellFormed = 0;
				3268	htmlParseDocTypeDecl(ctxt);
				3269	}
				3270
				3271	/*
				3272	* First case : a comment
				3273	*/
				3274	if ((CUR == '<') && (NXT(1) == '!') &&
				3275	(NXT(2) == '-') && (NXT(3) == '-')) {
				3276	htmlParseComment(ctxt);
				3277	}
				3278
				3279	/*
				3280	* Second case : a sub-element.
				3281	*/
				3282	else if (CUR == '<') {
				3283	htmlParseElement(ctxt);
				3284	}
				3285
				3286	/*
				3287	* Third case : a reference. If if has not been resolved,
				3288	* parsing returns it's Name, create the node
				3289	*/
				3290	else if (CUR == '&') {
				3291	htmlParseReference(ctxt);
				3292	}
				3293
				3294	/*
				3295	* Fourth : end of the resource
				3296	*/
				3297	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame^]	3298	htmlAutoCloseOnEnd(ctxt);
				3299	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3300	}
				3301
				3302	/*
				3303	* Last case, text. Note that References are handled directly.
				3304	*/
				3305	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3306	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3307	}
				3308
				3309	if (cons == ctxt->nbChars) {
				3310	if (ctxt->node != NULL) {
				3311	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3312	ctxt->sax->error(ctxt->userData,
				3313	"detected an error in element content\n");
				3314	ctxt->wellFormed = 0;
				3315	}
				3316	break;
				3317	}
				3318	}
				3319	GROW;
				3320	}
				3321	if (currentNode != NULL) xmlFree(currentNode);
				3322	}
				3323
				3324	/**
				3325	* htmlParseElement:
				3326	* @ctxt: an HTML parser context
				3327	*
				3328	* parse an HTML element, this is highly recursive
				3329	*
				3330	* [39] element ::= EmptyElemTag \| STag content ETag
				3331	*
				3332	* [41] Attribute ::= Name Eq AttValue
				3333	*/
				3334
				3335	void
				3336	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3337	xmlChar *name;
				3338	xmlChar *currentNode = NULL;
				3339	htmlElemDescPtr info;
				3340	htmlParserNodeInfo node_info;
				3341	xmlChar *oldname;
				3342	int depth = ctxt->nameNr;
				3343
				3344	/* Capture start position */
				3345	if (ctxt->record_info) {
				3346	node_info.begin_pos = ctxt->input->consumed +
				3347	(CUR_PTR - ctxt->input->base);
				3348	node_info.begin_line = ctxt->input->line;
				3349	}
				3350
				3351	oldname = xmlStrdup(ctxt->name);
				3352	htmlParseStartTag(ctxt);
				3353	name = ctxt->name;
				3354	#ifdef DEBUG
				3355	if (oldname == NULL)
				3356	xmlGenericError(xmlGenericErrorContext,
				3357	"Start of element %s\n", name);
				3358	else if (name == NULL)
				3359	xmlGenericError(xmlGenericErrorContext,
				3360	"Start of element failed, was %s\n", oldname);
				3361	else
				3362	xmlGenericError(xmlGenericErrorContext,
				3363	"Start of element %s, was %s\n", name, oldname);
				3364	#endif
				3365	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3366	(name == NULL)) {
				3367	if (CUR == '>')
				3368	NEXT;
				3369	if (oldname != NULL)
				3370	xmlFree(oldname);
				3371	return;
				3372	}
				3373	if (oldname != NULL)
				3374	xmlFree(oldname);
				3375
				3376	/*
				3377	* Lookup the info for that element.
				3378	*/
				3379	info = htmlTagLookup(name);
				3380	if (info == NULL) {
				3381	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3382	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3383	name);
				3384	ctxt->wellFormed = 0;
				3385	} else if (info->depr) {
				3386	/***************************
				3387	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3388	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3389	name);
				3390	***************************/
				3391	}
				3392
				3393	/*
				3394	* Check for an Empty Element labelled the XML/SGML way
				3395	*/
				3396	if ((CUR == '/') && (NXT(1) == '>')) {
				3397	SKIP(2);
				3398	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3399	ctxt->sax->endElement(ctxt->userData, name);
				3400	oldname = htmlnamePop(ctxt);
				3401	#ifdef DEBUG
				3402	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3403	#endif
				3404	if (oldname != NULL)
				3405	xmlFree(oldname);
				3406	return;
				3407	}
				3408
				3409	if (CUR == '>') {
				3410	NEXT;
				3411	} else {
				3412	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3413	ctxt->sax->error(ctxt->userData,
				3414	"Couldn't find end of Start Tag %s\n",
				3415	name);
				3416	ctxt->wellFormed = 0;
				3417
				3418	/*
				3419	* end of parsing of this node.
				3420	*/
				3421	if (xmlStrEqual(name, ctxt->name)) {
				3422	nodePop(ctxt);
				3423	oldname = htmlnamePop(ctxt);
				3424	#ifdef DEBUG
				3425	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3426	#endif
				3427	if (oldname != NULL)
				3428	xmlFree(oldname);
				3429	}
				3430
				3431	/*
				3432	* Capture end position and add node
				3433	*/
				3434	if ( currentNode != NULL && ctxt->record_info ) {
				3435	node_info.end_pos = ctxt->input->consumed +
				3436	(CUR_PTR - ctxt->input->base);
				3437	node_info.end_line = ctxt->input->line;
				3438	node_info.node = ctxt->node;
				3439	xmlParserAddNodeInfo(ctxt, &node_info);
				3440	}
				3441	return;
				3442	}
				3443
				3444	/*
				3445	* Check for an Empty Element from DTD definition
				3446	*/
				3447	if ((info != NULL) && (info->empty)) {
				3448	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3449	ctxt->sax->endElement(ctxt->userData, name);
				3450	oldname = htmlnamePop(ctxt);
				3451	#ifdef DEBUG
				3452	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3453	#endif
				3454	if (oldname != NULL)
				3455	xmlFree(oldname);
				3456	return;
				3457	}
				3458
				3459	/*
				3460	* Parse the content of the element:
				3461	*/
				3462	currentNode = xmlStrdup(ctxt->name);
				3463	depth = ctxt->nameNr;
				3464	while (IS_CHAR(CUR)) {
				3465	htmlParseContent(ctxt);
				3466	if (ctxt->nameNr < depth) break;
				3467	}
				3468
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3469	/*
				3470	* Capture end position and add node
				3471	*/
				3472	if ( currentNode != NULL && ctxt->record_info ) {
				3473	node_info.end_pos = ctxt->input->consumed +
				3474	(CUR_PTR - ctxt->input->base);
				3475	node_info.end_line = ctxt->input->line;
				3476	node_info.node = ctxt->node;
				3477	xmlParserAddNodeInfo(ctxt, &node_info);
				3478	}
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame^]	3479	if (!IS_CHAR(CUR)) {
				3480	htmlAutoCloseOnEnd(ctxt);
				3481	}
				3482
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3483	if (currentNode != NULL)
				3484	xmlFree(currentNode);
				3485	}
				3486
				3487	/**
				3488	* htmlParseDocument :
				3489	* @ctxt: an HTML parser context
				3490	*
				3491	* parse an HTML document (and build a tree if using the standard SAX
				3492	* interface).
				3493	*
				3494	* Returns 0, -1 in case of error. the parser context is augmented
				3495	* as a result of the parsing.
				3496	*/
				3497
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3498	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3499	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3500	xmlDtdPtr dtd;
				3501
				3502	htmlDefaultSAXHandlerInit();
				3503	ctxt->html = 1;
				3504
				3505	GROW;
				3506	/*
				3507	* SAX: beginning of the document processing.
				3508	*/
				3509	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3510	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3511
				3512	/*
				3513	* Wipe out everything which is before the first '<'
				3514	*/
				3515	SKIP_BLANKS;
				3516	if (CUR == 0) {
				3517	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3518	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3519	ctxt->wellFormed = 0;
				3520	}
				3521
				3522	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3523	ctxt->sax->startDocument(ctxt->userData);
				3524
				3525
				3526	/*
				3527	* Parse possible comments before any content
				3528	*/
				3529	while ((CUR == '<') && (NXT(1) == '!') &&
				3530	(NXT(2) == '-') && (NXT(3) == '-')) {
				3531	htmlParseComment(ctxt);
				3532	SKIP_BLANKS;
				3533	}
				3534
				3535
				3536	/*
				3537	* Then possibly doc type declaration(s) and more Misc
				3538	* (doctypedecl Misc*)?
				3539	*/
				3540	if ((CUR == '<') && (NXT(1) == '!') &&
				3541	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3542	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3543	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3544	(UPP(8) == 'E')) {
				3545	htmlParseDocTypeDecl(ctxt);
				3546	}
				3547	SKIP_BLANKS;
				3548
				3549	/*
				3550	* Parse possible comments before any content
				3551	*/
				3552	while ((CUR == '<') && (NXT(1) == '!') &&
				3553	(NXT(2) == '-') && (NXT(3) == '-')) {
				3554	htmlParseComment(ctxt);
				3555	SKIP_BLANKS;
				3556	}
				3557
				3558	/*
				3559	* Time to start parsing the tree itself
				3560	*/
				3561	htmlParseContent(ctxt);
				3562
				3563	/*
				3564	* autoclose
				3565	*/
				3566	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame^]	3567	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3568
				3569
				3570	/*
				3571	* SAX: end of the document processing.
				3572	*/
				3573	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3574	ctxt->sax->endDocument(ctxt->userData);
				3575
				3576	if (ctxt->myDoc != NULL) {
				3577	dtd = xmlGetIntSubset(ctxt->myDoc);
				3578	if (dtd == NULL)
				3579	ctxt->myDoc->intSubset =
				3580	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3581	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3582	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3583	}
				3584	if (! ctxt->wellFormed) return(-1);
				3585	return(0);
				3586	}
				3587
				3588
				3589	/************************************************************************
				3590	* *
				3591	* Parser contexts handling *
				3592	* *
				3593	************************************************************************/
				3594
				3595	/**
				3596	* xmlInitParserCtxt:
				3597	* @ctxt: an HTML parser context
				3598	*
				3599	* Initialize a parser context
				3600	*/
				3601
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3602	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3603	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3604	{
				3605	htmlSAXHandler *sax;
				3606
				3607	if (ctxt == NULL) return;
				3608	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3609
				3610	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3611	if (sax == NULL) {
				3612	xmlGenericError(xmlGenericErrorContext,
				3613	"htmlInitParserCtxt: out of memory\n");
				3614	}
				3615	else
				3616	memset(sax, 0, sizeof(htmlSAXHandler));
				3617
				3618	/* Allocate the Input stack */
				3619	ctxt->inputTab = (htmlParserInputPtr *)
				3620	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3621	if (ctxt->inputTab == NULL) {
				3622	xmlGenericError(xmlGenericErrorContext,
				3623	"htmlInitParserCtxt: out of memory\n");
				3624	ctxt->inputNr = 0;
				3625	ctxt->inputMax = 0;
				3626	ctxt->input = NULL;
				3627	return;
				3628	}
				3629	ctxt->inputNr = 0;
				3630	ctxt->inputMax = 5;
				3631	ctxt->input = NULL;
				3632	ctxt->version = NULL;
				3633	ctxt->encoding = NULL;
				3634	ctxt->standalone = -1;
				3635	ctxt->instate = XML_PARSER_START;
				3636
				3637	/* Allocate the Node stack */
				3638	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3639	if (ctxt->nodeTab == NULL) {
				3640	xmlGenericError(xmlGenericErrorContext,
				3641	"htmlInitParserCtxt: out of memory\n");
				3642	ctxt->nodeNr = 0;
				3643	ctxt->nodeMax = 0;
				3644	ctxt->node = NULL;
				3645	ctxt->inputNr = 0;
				3646	ctxt->inputMax = 0;
				3647	ctxt->input = NULL;
				3648	return;
				3649	}
				3650	ctxt->nodeNr = 0;
				3651	ctxt->nodeMax = 10;
				3652	ctxt->node = NULL;
				3653
				3654	/* Allocate the Name stack */
				3655	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3656	if (ctxt->nameTab == NULL) {
				3657	xmlGenericError(xmlGenericErrorContext,
				3658	"htmlInitParserCtxt: out of memory\n");
				3659	ctxt->nameNr = 0;
				3660	ctxt->nameMax = 10;
				3661	ctxt->name = NULL;
				3662	ctxt->nodeNr = 0;
				3663	ctxt->nodeMax = 0;
				3664	ctxt->node = NULL;
				3665	ctxt->inputNr = 0;
				3666	ctxt->inputMax = 0;
				3667	ctxt->input = NULL;
				3668	return;
				3669	}
				3670	ctxt->nameNr = 0;
				3671	ctxt->nameMax = 10;
				3672	ctxt->name = NULL;
				3673
				3674	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3675	else {
				3676	ctxt->sax = sax;
				3677	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3678	}
				3679	ctxt->userData = ctxt;
				3680	ctxt->myDoc = NULL;
				3681	ctxt->wellFormed = 1;
				3682	ctxt->replaceEntities = 0;
				3683	ctxt->html = 1;
				3684	ctxt->record_info = 0;
				3685	ctxt->validate = 0;
				3686	ctxt->nbChars = 0;
				3687	ctxt->checkIndex = 0;
				3688	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3689	}
				3690
				3691	/**
				3692	* htmlFreeParserCtxt:
				3693	* @ctxt: an HTML parser context
				3694	*
				3695	* Free all the memory used by a parser context. However the parsed
				3696	* document in ctxt->myDoc is not freed.
				3697	*/
				3698
				3699	void
				3700	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3701	{
				3702	xmlFreeParserCtxt(ctxt);
				3703	}
				3704
				3705	/**
				3706	* htmlCreateDocParserCtxt :
				3707	* @cur: a pointer to an array of xmlChar
				3708	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3709	*
				3710	* Create a parser context for an HTML document.
				3711	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3712	* TODO: check the need to add encoding handling there
				3713	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3714	* Returns the new parser context or NULL
				3715	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3716	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3717	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3718	htmlParserCtxtPtr ctxt;
				3719	htmlParserInputPtr input;
				3720	/* htmlCharEncoding enc; */
				3721
				3722	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				3723	if (ctxt == NULL) {
				3724	perror("malloc");
				3725	return(NULL);
				3726	}
				3727	htmlInitParserCtxt(ctxt);
				3728	input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				3729	if (input == NULL) {
				3730	perror("malloc");
				3731	xmlFree(ctxt);
				3732	return(NULL);
				3733	}
				3734	memset(input, 0, sizeof(htmlParserInput));
				3735
				3736	input->line = 1;
				3737	input->col = 1;
				3738	input->base = cur;
				3739	input->cur = cur;
				3740
				3741	inputPush(ctxt, input);
				3742	return(ctxt);
				3743	}
				3744
				3745	/************************************************************************
				3746	* *
				3747	* Progressive parsing interfaces *
				3748	* *
				3749	************************************************************************/
				3750
				3751	/**
				3752	* htmlParseLookupSequence:
				3753	* @ctxt: an HTML parser context
				3754	* @first: the first char to lookup
				3755	* @next: the next char to lookup or zero
				3756	* @third: the next char to lookup or zero
				3757	*
				3758	* Try to find if a sequence (first, next, third) or just (first next) or
				3759	* (first) is available in the input stream.
				3760	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3761	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3762	* parser, do not use liberally.
				3763	* This is basically similar to xmlParseLookupSequence()
				3764	*
				3765	* Returns the index to the current parsing point if the full sequence
				3766	* is available, -1 otherwise.
				3767	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3768	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3769	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3770	xmlChar next, xmlChar third) {
				3771	int base, len;
				3772	htmlParserInputPtr in;
				3773	const xmlChar *buf;
				3774
				3775	in = ctxt->input;
				3776	if (in == NULL) return(-1);
				3777	base = in->cur - in->base;
				3778	if (base < 0) return(-1);
				3779	if (ctxt->checkIndex > base)
				3780	base = ctxt->checkIndex;
				3781	if (in->buf == NULL) {
				3782	buf = in->base;
				3783	len = in->length;
				3784	} else {
				3785	buf = in->buf->buffer->content;
				3786	len = in->buf->buffer->use;
				3787	}
				3788	/* take into account the sequence length */
				3789	if (third) len -= 2;
				3790	else if (next) len --;
				3791	for (;base < len;base++) {
				3792	if (buf[base] == first) {
				3793	if (third != 0) {
				3794	if ((buf[base + 1] != next) \|\|
				3795	(buf[base + 2] != third)) continue;
				3796	} else if (next != 0) {
				3797	if (buf[base + 1] != next) continue;
				3798	}
				3799	ctxt->checkIndex = 0;
				3800	#ifdef DEBUG_PUSH
				3801	if (next == 0)
				3802	xmlGenericError(xmlGenericErrorContext,
				3803	"HPP: lookup '%c' found at %d\n",
				3804	first, base);
				3805	else if (third == 0)
				3806	xmlGenericError(xmlGenericErrorContext,
				3807	"HPP: lookup '%c%c' found at %d\n",
				3808	first, next, base);
				3809	else
				3810	xmlGenericError(xmlGenericErrorContext,
				3811	"HPP: lookup '%c%c%c' found at %d\n",
				3812	first, next, third, base);
				3813	#endif
				3814	return(base - (in->cur - in->base));
				3815	}
				3816	}
				3817	ctxt->checkIndex = base;
				3818	#ifdef DEBUG_PUSH
				3819	if (next == 0)
				3820	xmlGenericError(xmlGenericErrorContext,
				3821	"HPP: lookup '%c' failed\n", first);
				3822	else if (third == 0)
				3823	xmlGenericError(xmlGenericErrorContext,
				3824	"HPP: lookup '%c%c' failed\n", first, next);
				3825	else
				3826	xmlGenericError(xmlGenericErrorContext,
				3827	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				3828	#endif
				3829	return(-1);
				3830	}
				3831
				3832	/**
				3833	* htmlParseTryOrFinish:
				3834	* @ctxt: an HTML parser context
				3835	* @terminate: last chunk indicator
				3836	*
				3837	* Try to progress on parsing
				3838	*
				3839	* Returns zero if no parsing was possible
				3840	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3841	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3842	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				3843	int ret = 0;
				3844	htmlParserInputPtr in;
				3845	int avail = 0;
				3846	xmlChar cur, next;
				3847
				3848	#ifdef DEBUG_PUSH
				3849	switch (ctxt->instate) {
				3850	case XML_PARSER_EOF:
				3851	xmlGenericError(xmlGenericErrorContext,
				3852	"HPP: try EOF\n"); break;
				3853	case XML_PARSER_START:
				3854	xmlGenericError(xmlGenericErrorContext,
				3855	"HPP: try START\n"); break;
				3856	case XML_PARSER_MISC:
				3857	xmlGenericError(xmlGenericErrorContext,
				3858	"HPP: try MISC\n");break;
				3859	case XML_PARSER_COMMENT:
				3860	xmlGenericError(xmlGenericErrorContext,
				3861	"HPP: try COMMENT\n");break;
				3862	case XML_PARSER_PROLOG:
				3863	xmlGenericError(xmlGenericErrorContext,
				3864	"HPP: try PROLOG\n");break;
				3865	case XML_PARSER_START_TAG:
				3866	xmlGenericError(xmlGenericErrorContext,
				3867	"HPP: try START_TAG\n");break;
				3868	case XML_PARSER_CONTENT:
				3869	xmlGenericError(xmlGenericErrorContext,
				3870	"HPP: try CONTENT\n");break;
				3871	case XML_PARSER_CDATA_SECTION:
				3872	xmlGenericError(xmlGenericErrorContext,
				3873	"HPP: try CDATA_SECTION\n");break;
				3874	case XML_PARSER_END_TAG:
				3875	xmlGenericError(xmlGenericErrorContext,
				3876	"HPP: try END_TAG\n");break;
				3877	case XML_PARSER_ENTITY_DECL:
				3878	xmlGenericError(xmlGenericErrorContext,
				3879	"HPP: try ENTITY_DECL\n");break;
				3880	case XML_PARSER_ENTITY_VALUE:
				3881	xmlGenericError(xmlGenericErrorContext,
				3882	"HPP: try ENTITY_VALUE\n");break;
				3883	case XML_PARSER_ATTRIBUTE_VALUE:
				3884	xmlGenericError(xmlGenericErrorContext,
				3885	"HPP: try ATTRIBUTE_VALUE\n");break;
				3886	case XML_PARSER_DTD:
				3887	xmlGenericError(xmlGenericErrorContext,
				3888	"HPP: try DTD\n");break;
				3889	case XML_PARSER_EPILOG:
				3890	xmlGenericError(xmlGenericErrorContext,
				3891	"HPP: try EPILOG\n");break;
				3892	case XML_PARSER_PI:
				3893	xmlGenericError(xmlGenericErrorContext,
				3894	"HPP: try PI\n");break;
				3895	case XML_PARSER_SYSTEM_LITERAL:
				3896	xmlGenericError(xmlGenericErrorContext,
				3897	"HPP: try SYSTEM_LITERAL\n");break;
				3898	}
				3899	#endif
				3900
				3901	while (1) {
				3902
				3903	in = ctxt->input;
				3904	if (in == NULL) break;
				3905	if (in->buf == NULL)
				3906	avail = in->length - (in->cur - in->base);
				3907	else
				3908	avail = in->buf->buffer->use - (in->cur - in->base);
				3909	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame^]	3910	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3911	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				3912	/*
				3913	* SAX: end of the document processing.
				3914	*/
				3915	ctxt->instate = XML_PARSER_EOF;
				3916	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3917	ctxt->sax->endDocument(ctxt->userData);
				3918	}
				3919	}
				3920	if (avail < 1)
				3921	goto done;
				3922	switch (ctxt->instate) {
				3923	case XML_PARSER_EOF:
				3924	/*
				3925	* Document parsing is done !
				3926	*/
				3927	goto done;
				3928	case XML_PARSER_START:
				3929	/*
				3930	* Very first chars read from the document flow.
				3931	*/
				3932	cur = in->cur[0];
				3933	if (IS_BLANK(cur)) {
				3934	SKIP_BLANKS;
				3935	if (in->buf == NULL)
				3936	avail = in->length - (in->cur - in->base);
				3937	else
				3938	avail = in->buf->buffer->use - (in->cur - in->base);
				3939	}
				3940	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3941	ctxt->sax->setDocumentLocator(ctxt->userData,
				3942	&xmlDefaultSAXLocator);
				3943	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				3944	(!ctxt->disableSAX))
				3945	ctxt->sax->startDocument(ctxt->userData);
				3946
				3947	cur = in->cur[0];
				3948	next = in->cur[1];
				3949	if ((cur == '<') && (next == '!') &&
				3950	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3951	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3952	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3953	(UPP(8) == 'E')) {
				3954	if ((!terminate) &&
				3955	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				3956	goto done;
				3957	#ifdef DEBUG_PUSH
				3958	xmlGenericError(xmlGenericErrorContext,
				3959	"HPP: Parsing internal subset\n");
				3960	#endif
				3961	htmlParseDocTypeDecl(ctxt);
				3962	ctxt->instate = XML_PARSER_PROLOG;
				3963	#ifdef DEBUG_PUSH
				3964	xmlGenericError(xmlGenericErrorContext,
				3965	"HPP: entering PROLOG\n");
				3966	#endif
				3967	} else {
				3968	ctxt->instate = XML_PARSER_MISC;
				3969	}
				3970	#ifdef DEBUG_PUSH
				3971	xmlGenericError(xmlGenericErrorContext,
				3972	"HPP: entering MISC\n");
				3973	#endif
				3974	break;
				3975	case XML_PARSER_MISC:
				3976	SKIP_BLANKS;
				3977	if (in->buf == NULL)
				3978	avail = in->length - (in->cur - in->base);
				3979	else
				3980	avail = in->buf->buffer->use - (in->cur - in->base);
				3981	if (avail < 2)
				3982	goto done;
				3983	cur = in->cur[0];
				3984	next = in->cur[1];
				3985	if ((cur == '<') && (next == '!') &&
				3986	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				3987	if ((!terminate) &&
				3988	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				3989	goto done;
				3990	#ifdef DEBUG_PUSH
				3991	xmlGenericError(xmlGenericErrorContext,
				3992	"HPP: Parsing Comment\n");
				3993	#endif
				3994	htmlParseComment(ctxt);
				3995	ctxt->instate = XML_PARSER_MISC;
				3996	} else if ((cur == '<') && (next == '!') &&
				3997	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3998	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3999	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4000	(UPP(8) == 'E')) {
				4001	if ((!terminate) &&
				4002	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4003	goto done;
				4004	#ifdef DEBUG_PUSH
				4005	xmlGenericError(xmlGenericErrorContext,
				4006	"HPP: Parsing internal subset\n");
				4007	#endif
				4008	htmlParseDocTypeDecl(ctxt);
				4009	ctxt->instate = XML_PARSER_PROLOG;
				4010	#ifdef DEBUG_PUSH
				4011	xmlGenericError(xmlGenericErrorContext,
				4012	"HPP: entering PROLOG\n");
				4013	#endif
				4014	} else if ((cur == '<') && (next == '!') &&
				4015	(avail < 9)) {
				4016	goto done;
				4017	} else {
				4018	ctxt->instate = XML_PARSER_START_TAG;
				4019	#ifdef DEBUG_PUSH
				4020	xmlGenericError(xmlGenericErrorContext,
				4021	"HPP: entering START_TAG\n");
				4022	#endif
				4023	}
				4024	break;
				4025	case XML_PARSER_PROLOG:
				4026	SKIP_BLANKS;
				4027	if (in->buf == NULL)
				4028	avail = in->length - (in->cur - in->base);
				4029	else
				4030	avail = in->buf->buffer->use - (in->cur - in->base);
				4031	if (avail < 2)
				4032	goto done;
				4033	cur = in->cur[0];
				4034	next = in->cur[1];
				4035	if ((cur == '<') && (next == '!') &&
				4036	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4037	if ((!terminate) &&
				4038	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4039	goto done;
				4040	#ifdef DEBUG_PUSH
				4041	xmlGenericError(xmlGenericErrorContext,
				4042	"HPP: Parsing Comment\n");
				4043	#endif
				4044	htmlParseComment(ctxt);
				4045	ctxt->instate = XML_PARSER_PROLOG;
				4046	} else if ((cur == '<') && (next == '!') &&
				4047	(avail < 4)) {
				4048	goto done;
				4049	} else {
				4050	ctxt->instate = XML_PARSER_START_TAG;
				4051	#ifdef DEBUG_PUSH
				4052	xmlGenericError(xmlGenericErrorContext,
				4053	"HPP: entering START_TAG\n");
				4054	#endif
				4055	}
				4056	break;
				4057	case XML_PARSER_EPILOG:
				4058	if (in->buf == NULL)
				4059	avail = in->length - (in->cur - in->base);
				4060	else
				4061	avail = in->buf->buffer->use - (in->cur - in->base);
				4062	if (avail < 1)
				4063	goto done;
				4064	cur = in->cur[0];
				4065	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4066	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4067	goto done;
				4068	}
				4069	if (avail < 2)
				4070	goto done;
				4071	next = in->cur[1];
				4072	if ((cur == '<') && (next == '!') &&
				4073	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4074	if ((!terminate) &&
				4075	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4076	goto done;
				4077	#ifdef DEBUG_PUSH
				4078	xmlGenericError(xmlGenericErrorContext,
				4079	"HPP: Parsing Comment\n");
				4080	#endif
				4081	htmlParseComment(ctxt);
				4082	ctxt->instate = XML_PARSER_EPILOG;
				4083	} else if ((cur == '<') && (next == '!') &&
				4084	(avail < 4)) {
				4085	goto done;
				4086	} else {
				4087	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4088	ctxt->wellFormed = 0;
				4089	ctxt->instate = XML_PARSER_EOF;
				4090	#ifdef DEBUG_PUSH
				4091	xmlGenericError(xmlGenericErrorContext,
				4092	"HPP: entering EOF\n");
				4093	#endif
				4094	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4095	ctxt->sax->endDocument(ctxt->userData);
				4096	goto done;
				4097	}
				4098	break;
				4099	case XML_PARSER_START_TAG: {
				4100	xmlChar name, oldname;
				4101	int depth = ctxt->nameNr;
				4102	htmlElemDescPtr info;
				4103
				4104	if (avail < 2)
				4105	goto done;
				4106	cur = in->cur[0];
				4107	if (cur != '<') {
				4108	ctxt->instate = XML_PARSER_CONTENT;
				4109	#ifdef DEBUG_PUSH
				4110	xmlGenericError(xmlGenericErrorContext,
				4111	"HPP: entering CONTENT\n");
				4112	#endif
				4113	break;
				4114	}
				4115	if ((!terminate) &&
				4116	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4117	goto done;
				4118
				4119	oldname = xmlStrdup(ctxt->name);
				4120	htmlParseStartTag(ctxt);
				4121	name = ctxt->name;
				4122	#ifdef DEBUG
				4123	if (oldname == NULL)
				4124	xmlGenericError(xmlGenericErrorContext,
				4125	"Start of element %s\n", name);
				4126	else if (name == NULL)
				4127	xmlGenericError(xmlGenericErrorContext,
				4128	"Start of element failed, was %s\n",
				4129	oldname);
				4130	else
				4131	xmlGenericError(xmlGenericErrorContext,
				4132	"Start of element %s, was %s\n",
				4133	name, oldname);
				4134	#endif
				4135	if (((depth == ctxt->nameNr) &&
				4136	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4137	(name == NULL)) {
				4138	if (CUR == '>')
				4139	NEXT;
				4140	if (oldname != NULL)
				4141	xmlFree(oldname);
				4142	break;
				4143	}
				4144	if (oldname != NULL)
				4145	xmlFree(oldname);
				4146
				4147	/*
				4148	* Lookup the info for that element.
				4149	*/
				4150	info = htmlTagLookup(name);
				4151	if (info == NULL) {
				4152	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4153	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4154	name);
				4155	ctxt->wellFormed = 0;
				4156	} else if (info->depr) {
				4157	/***************************
				4158	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4159	ctxt->sax->warning(ctxt->userData,
				4160	"Tag %s is deprecated\n",
				4161	name);
				4162	***************************/
				4163	}
				4164
				4165	/*
				4166	* Check for an Empty Element labelled the XML/SGML way
				4167	*/
				4168	if ((CUR == '/') && (NXT(1) == '>')) {
				4169	SKIP(2);
				4170	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4171	ctxt->sax->endElement(ctxt->userData, name);
				4172	oldname = htmlnamePop(ctxt);
				4173	#ifdef DEBUG
				4174	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4175	oldname);
				4176	#endif
				4177	if (oldname != NULL)
				4178	xmlFree(oldname);
				4179	ctxt->instate = XML_PARSER_CONTENT;
				4180	#ifdef DEBUG_PUSH
				4181	xmlGenericError(xmlGenericErrorContext,
				4182	"HPP: entering CONTENT\n");
				4183	#endif
				4184	break;
				4185	}
				4186
				4187	if (CUR == '>') {
				4188	NEXT;
				4189	} else {
				4190	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4191	ctxt->sax->error(ctxt->userData,
				4192	"Couldn't find end of Start Tag %s\n",
				4193	name);
				4194	ctxt->wellFormed = 0;
				4195
				4196	/*
				4197	* end of parsing of this node.
				4198	*/
				4199	if (xmlStrEqual(name, ctxt->name)) {
				4200	nodePop(ctxt);
				4201	oldname = htmlnamePop(ctxt);
				4202	#ifdef DEBUG
				4203	xmlGenericError(xmlGenericErrorContext,
				4204	"End of start tag problem: popping out %s\n", oldname);
				4205	#endif
				4206	if (oldname != NULL)
				4207	xmlFree(oldname);
				4208	}
				4209
				4210	ctxt->instate = XML_PARSER_CONTENT;
				4211	#ifdef DEBUG_PUSH
				4212	xmlGenericError(xmlGenericErrorContext,
				4213	"HPP: entering CONTENT\n");
				4214	#endif
				4215	break;
				4216	}
				4217
				4218	/*
				4219	* Check for an Empty Element from DTD definition
				4220	*/
				4221	if ((info != NULL) && (info->empty)) {
				4222	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4223	ctxt->sax->endElement(ctxt->userData, name);
				4224	oldname = htmlnamePop(ctxt);
				4225	#ifdef DEBUG
				4226	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4227	#endif
				4228	if (oldname != NULL)
				4229	xmlFree(oldname);
				4230	}
				4231	ctxt->instate = XML_PARSER_CONTENT;
				4232	#ifdef DEBUG_PUSH
				4233	xmlGenericError(xmlGenericErrorContext,
				4234	"HPP: entering CONTENT\n");
				4235	#endif
				4236	break;
				4237	}
				4238	case XML_PARSER_CONTENT: {
				4239	long cons;
				4240	/*
				4241	* Handle preparsed entities and charRef
				4242	*/
				4243	if (ctxt->token != 0) {
				4244	xmlChar chr[2] = { 0 , 0 } ;
				4245
				4246	chr[0] = (xmlChar) ctxt->token;
				4247	htmlCheckParagraph(ctxt);
				4248	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4249	ctxt->sax->characters(ctxt->userData, chr, 1);
				4250	ctxt->token = 0;
				4251	ctxt->checkIndex = 0;
				4252	}
				4253	if ((avail == 1) && (terminate)) {
				4254	cur = in->cur[0];
				4255	if ((cur != '<') && (cur != '&')) {
				4256	if (ctxt->sax != NULL) {
				4257	if (IS_BLANK(cur)) {
				4258	if (ctxt->sax->ignorableWhitespace != NULL)
				4259	ctxt->sax->ignorableWhitespace(
				4260	ctxt->userData, &cur, 1);
				4261	} else {
				4262	htmlCheckParagraph(ctxt);
				4263	if (ctxt->sax->characters != NULL)
				4264	ctxt->sax->characters(
				4265	ctxt->userData, &cur, 1);
				4266	}
				4267	}
				4268	ctxt->token = 0;
				4269	ctxt->checkIndex = 0;
				4270	NEXT;
				4271	}
				4272	break;
				4273	}
				4274	if (avail < 2)
				4275	goto done;
				4276	cur = in->cur[0];
				4277	next = in->cur[1];
				4278	cons = ctxt->nbChars;
				4279	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4280	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4281	/*
				4282	* Handle SCRIPT/STYLE separately
				4283	*/
				4284	if ((!terminate) &&
				4285	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4286	goto done;
				4287	htmlParseScript(ctxt);
				4288	if ((cur == '<') && (next == '/')) {
				4289	ctxt->instate = XML_PARSER_END_TAG;
				4290	ctxt->checkIndex = 0;
				4291	#ifdef DEBUG_PUSH
				4292	xmlGenericError(xmlGenericErrorContext,
				4293	"HPP: entering END_TAG\n");
				4294	#endif
				4295	break;
				4296	}
				4297	} else {
				4298	/*
				4299	* Sometimes DOCTYPE arrives in the middle of the document
				4300	*/
				4301	if ((cur == '<') && (next == '!') &&
				4302	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4303	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4304	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4305	(UPP(8) == 'E')) {
				4306	if ((!terminate) &&
				4307	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4308	goto done;
				4309	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4310	ctxt->sax->error(ctxt->userData,
				4311	"Misplaced DOCTYPE declaration\n");
				4312	ctxt->wellFormed = 0;
				4313	htmlParseDocTypeDecl(ctxt);
				4314	} else if ((cur == '<') && (next == '!') &&
				4315	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4316	if ((!terminate) &&
				4317	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4318	goto done;
				4319	#ifdef DEBUG_PUSH
				4320	xmlGenericError(xmlGenericErrorContext,
				4321	"HPP: Parsing Comment\n");
				4322	#endif
				4323	htmlParseComment(ctxt);
				4324	ctxt->instate = XML_PARSER_CONTENT;
				4325	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4326	goto done;
				4327	} else if ((cur == '<') && (next == '/')) {
				4328	ctxt->instate = XML_PARSER_END_TAG;
				4329	ctxt->checkIndex = 0;
				4330	#ifdef DEBUG_PUSH
				4331	xmlGenericError(xmlGenericErrorContext,
				4332	"HPP: entering END_TAG\n");
				4333	#endif
				4334	break;
				4335	} else if (cur == '<') {
				4336	ctxt->instate = XML_PARSER_START_TAG;
				4337	ctxt->checkIndex = 0;
				4338	#ifdef DEBUG_PUSH
				4339	xmlGenericError(xmlGenericErrorContext,
				4340	"HPP: entering START_TAG\n");
				4341	#endif
				4342	break;
				4343	} else if (cur == '&') {
				4344	if ((!terminate) &&
				4345	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4346	goto done;
				4347	#ifdef DEBUG_PUSH
				4348	xmlGenericError(xmlGenericErrorContext,
				4349	"HPP: Parsing Reference\n");
				4350	#endif
				4351	/* TODO: check generation of subtrees if noent !!! */
				4352	htmlParseReference(ctxt);
				4353	} else {
				4354	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4355	/*
				4356	* Goal of the following test is :
				4357	* - minimize calls to the SAX 'character' callback
				4358	* when they are mergeable
				4359	*/
				4360	if ((ctxt->inputNr == 1) &&
				4361	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4362	if ((!terminate) &&
				4363	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4364	goto done;
				4365	}
				4366	ctxt->checkIndex = 0;
				4367	#ifdef DEBUG_PUSH
				4368	xmlGenericError(xmlGenericErrorContext,
				4369	"HPP: Parsing char data\n");
				4370	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4371	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4372	}
				4373	}
				4374	if (cons == ctxt->nbChars) {
				4375	if (ctxt->node != NULL) {
				4376	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4377	ctxt->sax->error(ctxt->userData,
				4378	"detected an error in element content\n");
				4379	ctxt->wellFormed = 0;
				4380	}
				4381	NEXT;
				4382	break;
				4383	}
				4384
				4385	break;
				4386	}
				4387	case XML_PARSER_END_TAG:
				4388	if (avail < 2)
				4389	goto done;
				4390	if ((!terminate) &&
				4391	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4392	goto done;
				4393	htmlParseEndTag(ctxt);
				4394	if (ctxt->nameNr == 0) {
				4395	ctxt->instate = XML_PARSER_EPILOG;
				4396	} else {
				4397	ctxt->instate = XML_PARSER_CONTENT;
				4398	}
				4399	ctxt->checkIndex = 0;
				4400	#ifdef DEBUG_PUSH
				4401	xmlGenericError(xmlGenericErrorContext,
				4402	"HPP: entering CONTENT\n");
				4403	#endif
				4404	break;
				4405	case XML_PARSER_CDATA_SECTION:
				4406	xmlGenericError(xmlGenericErrorContext,
				4407	"HPP: internal error, state == CDATA\n");
				4408	ctxt->instate = XML_PARSER_CONTENT;
				4409	ctxt->checkIndex = 0;
				4410	#ifdef DEBUG_PUSH
				4411	xmlGenericError(xmlGenericErrorContext,
				4412	"HPP: entering CONTENT\n");
				4413	#endif
				4414	break;
				4415	case XML_PARSER_DTD:
				4416	xmlGenericError(xmlGenericErrorContext,
				4417	"HPP: internal error, state == DTD\n");
				4418	ctxt->instate = XML_PARSER_CONTENT;
				4419	ctxt->checkIndex = 0;
				4420	#ifdef DEBUG_PUSH
				4421	xmlGenericError(xmlGenericErrorContext,
				4422	"HPP: entering CONTENT\n");
				4423	#endif
				4424	break;
				4425	case XML_PARSER_COMMENT:
				4426	xmlGenericError(xmlGenericErrorContext,
				4427	"HPP: internal error, state == COMMENT\n");
				4428	ctxt->instate = XML_PARSER_CONTENT;
				4429	ctxt->checkIndex = 0;
				4430	#ifdef DEBUG_PUSH
				4431	xmlGenericError(xmlGenericErrorContext,
				4432	"HPP: entering CONTENT\n");
				4433	#endif
				4434	break;
				4435	case XML_PARSER_PI:
				4436	xmlGenericError(xmlGenericErrorContext,
				4437	"HPP: internal error, state == PI\n");
				4438	ctxt->instate = XML_PARSER_CONTENT;
				4439	ctxt->checkIndex = 0;
				4440	#ifdef DEBUG_PUSH
				4441	xmlGenericError(xmlGenericErrorContext,
				4442	"HPP: entering CONTENT\n");
				4443	#endif
				4444	break;
				4445	case XML_PARSER_ENTITY_DECL:
				4446	xmlGenericError(xmlGenericErrorContext,
				4447	"HPP: internal error, state == ENTITY_DECL\n");
				4448	ctxt->instate = XML_PARSER_CONTENT;
				4449	ctxt->checkIndex = 0;
				4450	#ifdef DEBUG_PUSH
				4451	xmlGenericError(xmlGenericErrorContext,
				4452	"HPP: entering CONTENT\n");
				4453	#endif
				4454	break;
				4455	case XML_PARSER_ENTITY_VALUE:
				4456	xmlGenericError(xmlGenericErrorContext,
				4457	"HPP: internal error, state == ENTITY_VALUE\n");
				4458	ctxt->instate = XML_PARSER_CONTENT;
				4459	ctxt->checkIndex = 0;
				4460	#ifdef DEBUG_PUSH
				4461	xmlGenericError(xmlGenericErrorContext,
				4462	"HPP: entering DTD\n");
				4463	#endif
				4464	break;
				4465	case XML_PARSER_ATTRIBUTE_VALUE:
				4466	xmlGenericError(xmlGenericErrorContext,
				4467	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4468	ctxt->instate = XML_PARSER_START_TAG;
				4469	ctxt->checkIndex = 0;
				4470	#ifdef DEBUG_PUSH
				4471	xmlGenericError(xmlGenericErrorContext,
				4472	"HPP: entering START_TAG\n");
				4473	#endif
				4474	break;
				4475	case XML_PARSER_SYSTEM_LITERAL:
				4476	xmlGenericError(xmlGenericErrorContext,
				4477	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4478	ctxt->instate = XML_PARSER_CONTENT;
				4479	ctxt->checkIndex = 0;
				4480	#ifdef DEBUG_PUSH
				4481	xmlGenericError(xmlGenericErrorContext,
				4482	"HPP: entering CONTENT\n");
				4483	#endif
				4484	break;
				4485	case XML_PARSER_IGNORE:
				4486	xmlGenericError(xmlGenericErrorContext,
				4487	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4488	ctxt->instate = XML_PARSER_CONTENT;
				4489	ctxt->checkIndex = 0;
				4490	#ifdef DEBUG_PUSH
				4491	xmlGenericError(xmlGenericErrorContext,
				4492	"HPP: entering CONTENT\n");
				4493	#endif
				4494	break;
				4495	}
				4496	}
				4497	done:
				4498	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame^]	4499	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4500	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4501	/*
				4502	* SAX: end of the document processing.
				4503	*/
				4504	ctxt->instate = XML_PARSER_EOF;
				4505	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4506	ctxt->sax->endDocument(ctxt->userData);
				4507	}
				4508	}
				4509	if ((ctxt->myDoc != NULL) &&
				4510	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4511	(ctxt->instate == XML_PARSER_EPILOG))) {
				4512	xmlDtdPtr dtd;
				4513	dtd = xmlGetIntSubset(ctxt->myDoc);
				4514	if (dtd == NULL)
				4515	ctxt->myDoc->intSubset =
				4516	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4517	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4518	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4519	}
				4520	#ifdef DEBUG_PUSH
				4521	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4522	#endif
				4523	return(ret);
				4524	}
				4525
				4526	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4527	* htmlParseChunk:
				4528	* @ctxt: an XML parser context
				4529	* @chunk: an char array
				4530	* @size: the size in byte of the chunk
				4531	* @terminate: last chunk indicator
				4532	*
				4533	* Parse a Chunk of memory
				4534	*
				4535	* Returns zero if no error, the xmlParserErrors otherwise.
				4536	*/
				4537	int
				4538	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4539	int terminate) {
				4540	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4541	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4542	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4543	int cur = ctxt->input->cur - ctxt->input->base;
				4544
				4545	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4546	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4547	ctxt->input->cur = ctxt->input->base + cur;
				4548	#ifdef DEBUG_PUSH
				4549	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4550	#endif
				4551
				4552	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4553	htmlParseTryOrFinish(ctxt, terminate);
				4554	} else if (ctxt->instate != XML_PARSER_EOF) {
				4555	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4556	htmlParseTryOrFinish(ctxt, terminate);
				4557	}
				4558	if (terminate) {
				4559	if ((ctxt->instate != XML_PARSER_EOF) &&
				4560	(ctxt->instate != XML_PARSER_EPILOG) &&
				4561	(ctxt->instate != XML_PARSER_MISC)) {
				4562	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4563	ctxt->wellFormed = 0;
				4564	}
				4565	if (ctxt->instate != XML_PARSER_EOF) {
				4566	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4567	ctxt->sax->endDocument(ctxt->userData);
				4568	}
				4569	ctxt->instate = XML_PARSER_EOF;
				4570	}
				4571	return((xmlParserErrors) ctxt->errNo);
				4572	}
				4573
				4574	/************************************************************************
				4575	* *
				4576	* User entry points *
				4577	* *
				4578	************************************************************************/
				4579
				4580	/**
				4581	* htmlCreatePushParserCtxt :
				4582	* @sax: a SAX handler
				4583	* @user_data: The user data returned on SAX callbacks
				4584	* @chunk: a pointer to an array of chars
				4585	* @size: number of chars in the array
				4586	* @filename: an optional file name or URI
				4587	* @enc: an optional encoding
				4588	*
				4589	* Create a parser context for using the HTML parser in push mode
				4590	* To allow content encoding detection, @size should be >= 4
				4591	* The value of @filename is used for fetching external entities
				4592	* and error/warning reports.
				4593	*
				4594	* Returns the new parser context or NULL
				4595	*/
				4596	htmlParserCtxtPtr
				4597	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4598	const char chunk, int size, const char filename,
				4599	xmlCharEncoding enc) {
				4600	htmlParserCtxtPtr ctxt;
				4601	htmlParserInputPtr inputStream;
				4602	xmlParserInputBufferPtr buf;
				4603
				4604	buf = xmlAllocParserInputBuffer(enc);
				4605	if (buf == NULL) return(NULL);
				4606
				4607	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4608	if (ctxt == NULL) {
				4609	xmlFree(buf);
				4610	return(NULL);
				4611	}
				4612	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4613	htmlInitParserCtxt(ctxt);
				4614	if (sax != NULL) {
				4615	if (ctxt->sax != &htmlDefaultSAXHandler)
				4616	xmlFree(ctxt->sax);
				4617	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4618	if (ctxt->sax == NULL) {
				4619	xmlFree(buf);
				4620	xmlFree(ctxt);
				4621	return(NULL);
				4622	}
				4623	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4624	if (user_data != NULL)
				4625	ctxt->userData = user_data;
				4626	}
				4627	if (filename == NULL) {
				4628	ctxt->directory = NULL;
				4629	} else {
				4630	ctxt->directory = xmlParserGetDirectory(filename);
				4631	}
				4632
				4633	inputStream = htmlNewInputStream(ctxt);
				4634	if (inputStream == NULL) {
				4635	xmlFreeParserCtxt(ctxt);
				4636	return(NULL);
				4637	}
				4638
				4639	if (filename == NULL)
				4640	inputStream->filename = NULL;
				4641	else
				4642	inputStream->filename = xmlMemStrdup(filename);
				4643	inputStream->buf = buf;
				4644	inputStream->base = inputStream->buf->buffer->content;
				4645	inputStream->cur = inputStream->buf->buffer->content;
				4646
				4647	inputPush(ctxt, inputStream);
				4648
				4649	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4650	(ctxt->input->buf != NULL)) {
				4651	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4652	#ifdef DEBUG_PUSH
				4653	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4654	#endif
				4655	}
				4656
				4657	return(ctxt);
				4658	}
				4659
				4660	/**
				4661	* htmlSAXParseDoc :
				4662	* @cur: a pointer to an array of xmlChar
				4663	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4664	* @sax: the SAX handler block
				4665	* @userData: if using SAX, this pointer will be provided on callbacks.
				4666	*
				4667	* parse an HTML in-memory document and build a tree.
				4668	* It use the given SAX function block to handle the parsing callback.
				4669	* If sax is NULL, fallback to the default DOM tree building routines.
				4670	*
				4671	* Returns the resulting document tree
				4672	*/
				4673
				4674	htmlDocPtr
				4675	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4676	htmlDocPtr ret;
				4677	htmlParserCtxtPtr ctxt;
				4678
				4679	if (cur == NULL) return(NULL);
				4680
				4681
				4682	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4683	if (ctxt == NULL) return(NULL);
				4684	if (sax != NULL) {
				4685	ctxt->sax = sax;
				4686	ctxt->userData = userData;
				4687	}
				4688
				4689	htmlParseDocument(ctxt);
				4690	ret = ctxt->myDoc;
				4691	if (sax != NULL) {
				4692	ctxt->sax = NULL;
				4693	ctxt->userData = NULL;
				4694	}
				4695	htmlFreeParserCtxt(ctxt);
				4696
				4697	return(ret);
				4698	}
				4699
				4700	/**
				4701	* htmlParseDoc :
				4702	* @cur: a pointer to an array of xmlChar
				4703	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4704	*
				4705	* parse an HTML in-memory document and build a tree.
				4706	*
				4707	* Returns the resulting document tree
				4708	*/
				4709
				4710	htmlDocPtr
				4711	htmlParseDoc(xmlChar cur, const char encoding) {
				4712	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4713	}
				4714
				4715
				4716	/**
				4717	* htmlCreateFileParserCtxt :
				4718	* @filename: the filename
				4719	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4720	*
				4721	* Create a parser context for a file content.
				4722	* Automatic support for ZLIB/Compress compressed document is provided
				4723	* by default if found at compile-time.
				4724	*
				4725	* Returns the new parser context or NULL
				4726	*/
				4727	htmlParserCtxtPtr
				4728	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4729	{
				4730	htmlParserCtxtPtr ctxt;
				4731	htmlParserInputPtr inputStream;
				4732	xmlParserInputBufferPtr buf;
				4733	/* htmlCharEncoding enc; */
				4734	xmlChar content, content_line = (xmlChar *) "charset=";
				4735
				4736	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4737	if (buf == NULL) return(NULL);
				4738
				4739	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4740	if (ctxt == NULL) {
				4741	perror("malloc");
				4742	return(NULL);
				4743	}
				4744	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4745	htmlInitParserCtxt(ctxt);
				4746	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4747	if (inputStream == NULL) {
				4748	perror("malloc");
				4749	xmlFree(ctxt);
				4750	return(NULL);
				4751	}
				4752	memset(inputStream, 0, sizeof(htmlParserInput));
				4753
				4754	inputStream->filename = xmlMemStrdup(filename);
				4755	inputStream->line = 1;
				4756	inputStream->col = 1;
				4757	inputStream->buf = buf;
				4758	inputStream->directory = NULL;
				4759
				4760	inputStream->base = inputStream->buf->buffer->content;
				4761	inputStream->cur = inputStream->buf->buffer->content;
				4762	inputStream->free = NULL;
				4763
				4764	inputPush(ctxt, inputStream);
				4765
				4766	/* set encoding */
				4767	if (encoding) {
				4768	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4769	if (content) {
				4770	strcpy ((char )content, (char )content_line);
				4771	strcat ((char )content, (char )encoding);
				4772	htmlCheckEncoding (ctxt, content);
				4773	xmlFree (content);
				4774	}
				4775	}
				4776
				4777	return(ctxt);
				4778	}
				4779
				4780	/**
				4781	* htmlSAXParseFile :
				4782	* @filename: the filename
				4783	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4784	* @sax: the SAX handler block
				4785	* @userData: if using SAX, this pointer will be provided on callbacks.
				4786	*
				4787	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4788	* compressed document is provided by default if found at compile-time.
				4789	* It use the given SAX function block to handle the parsing callback.
				4790	* If sax is NULL, fallback to the default DOM tree building routines.
				4791	*
				4792	* Returns the resulting document tree
				4793	*/
				4794
				4795	htmlDocPtr
				4796	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				4797	void *userData) {
				4798	htmlDocPtr ret;
				4799	htmlParserCtxtPtr ctxt;
				4800	htmlSAXHandlerPtr oldsax = NULL;
				4801
				4802	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				4803	if (ctxt == NULL) return(NULL);
				4804	if (sax != NULL) {
				4805	oldsax = ctxt->sax;
				4806	ctxt->sax = sax;
				4807	ctxt->userData = userData;
				4808	}
				4809
				4810	htmlParseDocument(ctxt);
				4811
				4812	ret = ctxt->myDoc;
				4813	if (sax != NULL) {
				4814	ctxt->sax = oldsax;
				4815	ctxt->userData = NULL;
				4816	}
				4817	htmlFreeParserCtxt(ctxt);
				4818
				4819	return(ret);
				4820	}
				4821
				4822	/**
				4823	* htmlParseFile :
				4824	* @filename: the filename
				4825	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4826	*
				4827	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4828	* compressed document is provided by default if found at compile-time.
				4829	*
				4830	* Returns the resulting document tree
				4831	*/
				4832
				4833	htmlDocPtr
				4834	htmlParseFile(const char filename, const char encoding) {
				4835	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				4836	}
				4837
				4838	/**
				4839	* htmlHandleOmittedElem:
				4840	* @val: int 0 or 1
				4841	*
				4842	* Set and return the previous value for handling HTML omitted tags.
				4843	*
				4844	* Returns the last value for 0 for no handling, 1 for auto insertion.
				4845	*/
				4846
				4847	int
				4848	htmlHandleOmittedElem(int val) {
				4849	int old = htmlOmittedDefaultValue;
				4850
				4851	htmlOmittedDefaultValue = val;
				4852	return(old);
				4853	}
				4854
				4855	#endif /* LIBXML_HTML_ENABLED */