Blame - HTMLparser.c - platform/external/libxml2

blob: ace49d9fcb952d202291092f6ec3badb7e28d3e0 [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
				6	* Daniel.Veillard@w3.org
				7	*/
				8
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	9	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	10	#ifdef LIBXML_HTML_ENABLED
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	11
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	12	#include <string.h>
				13	#ifdef HAVE_CTYPE_H
				14	#include <ctype.h>
				15	#endif
				16	#ifdef HAVE_STDLIB_H
				17	#include <stdlib.h>
				18	#endif
				19	#ifdef HAVE_SYS_STAT_H
				20	#include <sys/stat.h>
				21	#endif
				22	#ifdef HAVE_FCNTL_H
				23	#include <fcntl.h>
				24	#endif
				25	#ifdef HAVE_UNISTD_H
				26	#include <unistd.h>
				27	#endif
				28	#ifdef HAVE_ZLIB_H
				29	#include <zlib.h>
				30	#endif
				31
				32	#include <libxml/xmlmemory.h>
				33	#include <libxml/tree.h>
				34	#include <libxml/parser.h>
				35	#include <libxml/parserInternals.h>
				36	#include <libxml/xmlerror.h>
				37	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	38	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	39	#include <libxml/entities.h>
				40	#include <libxml/encoding.h>
				41	#include <libxml/valid.h>
				42	#include <libxml/xmlIO.h>
				43
				44	#define HTML_MAX_NAMELEN 1000
				45	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				46	#define HTML_PARSER_BUFFER_SIZE 100
				47
				48	/* #define DEBUG */
				49	/* #define DEBUG_PUSH */
				50
				51	int htmlOmittedDefaultValue = 1;
				52
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	53	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				54	xmlChar end, xmlChar end2, xmlChar end3);
				55
				56	/************************************************************************
				57	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	58	* Parser stacks related functions and macros *
				59	* *
				60	************************************************************************/
				61
				62	/*
				63	* Generic function for accessing stacks in the Parser Context
				64	*/
				65
				66	#define PUSH_AND_POP(scope, type, name) \
				67	scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				68	if (ctxt->name##Nr >= ctxt->name##Max) { \
				69	ctxt->name##Max *= 2; \
				70	ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
				71	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				72	if (ctxt->name##Tab == NULL) { \
				73	xmlGenericError(xmlGenericErrorContext, \
				74	"realloc failed !\n"); \
				75	return(0); \
				76	} \
				77	} \
				78	ctxt->name##Tab[ctxt->name##Nr] = value; \
				79	ctxt->name = value; \
				80	return(ctxt->name##Nr++); \
				81	} \
				82	scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				83	type ret; \
				84	if (ctxt->name##Nr < 0) return(0); \
				85	ctxt->name##Nr--; \
				86	if (ctxt->name##Nr < 0) return(0); \
				87	if (ctxt->name##Nr > 0) \
				88	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				89	else \
				90	ctxt->name = NULL; \
				91	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				92	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				93	return(ret); \
				94	} \
				95
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	96	/* PUSH_AND_POP(static, xmlNodePtr, node) */
				97	PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	98
				99	/*
				100	* Macros for accessing the content. Those should be used only by the parser,
				101	* and not exported.
				102	*
				103	* Dirty macros, i.e. one need to make assumption on the context to use them
				104	*
				105	* CUR_PTR return the current pointer to the xmlChar to be parsed.
				106	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
				107	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				108	* in UNICODE mode. This should be used internally by the parser
				109	* only to compare to ASCII values otherwise it would break when
				110	* running with UTF-8 encoding.
				111	* NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
				112	* to compare on ASCII based substring.
				113	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
				114	* it should be used only to compare on ASCII based substring.
				115	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				116	* strings within the parser.
				117	*
				118	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				119	*
				120	* CURRENT Returns the current char value, with the full decoding of
				121	* UTF-8 if we are using this mode. It returns an int.
				122	* NEXT Skip to the next character, this does the proper decoding
				123	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
				124	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				125	*/
				126
				127	#define UPPER (toupper(*ctxt->input->cur))
				128
				129	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				130
				131	#define NXT(val) ctxt->input->cur[(val)]
				132
				133	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				134
				135	#define CUR_PTR ctxt->input->cur
				136
				137	#define SHRINK xmlParserInputShrink(ctxt->input)
				138
				139	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				140
				141	#define CURRENT ((int) (*ctxt->input->cur))
				142
				143	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				144
				145	/* Inported from XML */
				146
				147	/* #define CUR (ctxt->token ? ctxt->token : (int) (ctxt->input->cur)) /
				148	#define CUR ((int) (*ctxt->input->cur))
				149	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				150
				151	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
				152	#define NXT(val) ctxt->input->cur[(val)]
				153	#define CUR_PTR ctxt->input->cur
				154
				155
				156	#define NEXTL(l) do { \
				157	if (*(ctxt->input->cur) == '\n') { \
				158	ctxt->input->line++; ctxt->input->col = 1; \
				159	} else ctxt->input->col++; \
				160	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				161	} while (0)
				162
				163	/************
				164	\
				165	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				166	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				167	************/
				168
				169	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				170	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				171
				172	#define COPY_BUF(l,b,i,v) \
				173	if (l == 1) b[i++] = (xmlChar) v; \
				174	else i += xmlCopyChar(l,&b[i],v)
				175
				176	/**
				177	* htmlCurrentChar:
				178	* @ctxt: the HTML parser context
				179	* @len: pointer to the length of the char read
				180	*
				181	* The current char value, if using UTF-8 this may actaully span multiple
				182	* bytes in the input buffer. Implement the end of line normalization:
				183	* 2.11 End-of-Line Handling
				184	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				185	* char, then the encoding converter is plugged in automatically.
				186	*
				187	* Returns the current char value and its lenght
				188	*/
				189
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	190	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	191	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				192	if (ctxt->instate == XML_PARSER_EOF)
				193	return(0);
				194
				195	if (ctxt->token != 0) {
				196	*len = 0;
				197	return(ctxt->token);
				198	}
				199	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				200	/*
				201	* We are supposed to handle UTF8, check it's valid
				202	* From rfc2044: encoding of the Unicode values on UTF-8:
				203	*
				204	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				205	* 0000 0000-0000 007F 0xxxxxxx
				206	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				207	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				208	*
				209	* Check for the 0x110000 limit too
				210	*/
				211	const unsigned char *cur = ctxt->input->cur;
				212	unsigned char c;
				213	unsigned int val;
				214
				215	c = *cur;
				216	if (c & 0x80) {
				217	if (cur[1] == 0)
				218	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				219	if ((cur[1] & 0xc0) != 0x80)
				220	goto encoding_error;
				221	if ((c & 0xe0) == 0xe0) {
				222
				223	if (cur[2] == 0)
				224	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				225	if ((cur[2] & 0xc0) != 0x80)
				226	goto encoding_error;
				227	if ((c & 0xf0) == 0xf0) {
				228	if (cur[3] == 0)
				229	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				230	if (((c & 0xf8) != 0xf0) \|\|
				231	((cur[3] & 0xc0) != 0x80))
				232	goto encoding_error;
				233	/* 4-byte code */
				234	*len = 4;
				235	val = (cur[0] & 0x7) << 18;
				236	val \|= (cur[1] & 0x3f) << 12;
				237	val \|= (cur[2] & 0x3f) << 6;
				238	val \|= cur[3] & 0x3f;
				239	} else {
				240	/* 3-byte code */
				241	*len = 3;
				242	val = (cur[0] & 0xf) << 12;
				243	val \|= (cur[1] & 0x3f) << 6;
				244	val \|= cur[2] & 0x3f;
				245	}
				246	} else {
				247	/* 2-byte code */
				248	*len = 2;
				249	val = (cur[0] & 0x1f) << 6;
				250	val \|= cur[1] & 0x3f;
				251	}
				252	if (!IS_CHAR(val)) {
				253	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				254	if ((ctxt->sax != NULL) &&
				255	(ctxt->sax->error != NULL))
				256	ctxt->sax->error(ctxt->userData,
				257	"Char 0x%X out of allowed range\n", val);
				258	ctxt->wellFormed = 0;
				259	ctxt->disableSAX = 1;
				260	}
				261	return(val);
				262	} else {
				263	/* 1-byte code */
				264	*len = 1;
				265	return((int) *ctxt->input->cur);
				266	}
				267	}
				268	/*
				269	* Assume it's a fixed lenght encoding (1) with
				270	* a compatibke encoding for the ASCII set, since
				271	* XML constructs only use < 128 chars
				272	*/
				273	*len = 1;
				274	if ((int) *ctxt->input->cur < 0x80)
				275	return((int) *ctxt->input->cur);
				276
				277	/*
				278	* Humm this is bad, do an automatic flow conversion
				279	*/
				280	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				281	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				282	return(xmlCurrentChar(ctxt, len));
				283
				284	encoding_error:
				285	/*
				286	* If we detect an UTF8 error that probably mean that the
				287	* input encoding didn't get properly advertized in the
				288	* declaration header. Report the error and switch the encoding
				289	* to ISO-Latin-1 (if you don't like this policy, just declare the
				290	* encoding !)
				291	*/
				292	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				293	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				294	ctxt->sax->error(ctxt->userData,
				295	"Input is not proper UTF-8, indicate encoding !\n");
				296	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				297	ctxt->input->cur[0], ctxt->input->cur[1],
				298	ctxt->input->cur[2], ctxt->input->cur[3]);
				299	}
				300
				301	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				302	*len = 1;
				303	return((int) *ctxt->input->cur);
				304	}
				305
				306	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	307	* htmlSkipBlankChars:
				308	* @ctxt: the HTML parser context
				309	*
				310	* skip all blanks character found at that point in the input streams.
				311	*
				312	* Returns the number of space chars skipped
				313	*/
				314
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	315	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	316	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				317	int res = 0;
				318
				319	while (IS_BLANK(*(ctxt->input->cur))) {
				320	if ((*ctxt->input->cur == 0) &&
				321	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				322	xmlPopInput(ctxt);
				323	} else {
				324	if (*(ctxt->input->cur) == '\n') {
				325	ctxt->input->line++; ctxt->input->col = 1;
				326	} else ctxt->input->col++;
				327	ctxt->input->cur++;
				328	ctxt->nbChars++;
				329	if (*ctxt->input->cur == 0)
				330	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				331	}
				332	res++;
				333	}
				334	return(res);
				335	}
				336
				337
				338
				339	/************************************************************************
				340	* *
				341	* The list of HTML elements and their properties *
				342	* *
				343	************************************************************************/
				344
				345	/*
				346	* Start Tag: 1 means the start tag can be ommited
				347	* End Tag: 1 means the end tag can be ommited
				348	* 2 means it's forbidden (empty elements)
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	349	* 3 means the tag is stylistic and should be closed easilly
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	350	* Depr: this element is deprecated
				351	* DTD: 1 means that this element is valid only in the Loose DTD
				352	* 2 means that this element is valid only in the Frameset DTD
				353	*
				354	* Name,Start Tag,End Tag,Save End, Empty, Depr., DTD, Description
				355	*/
				356	htmlElemDesc html40ElementTable[] = {
				357	{ "a", 0, 0, 0, 0, 0, 0, "anchor " },
				358	{ "abbr", 0, 0, 0, 0, 0, 0, "abbreviated form" },
				359	{ "acronym", 0, 0, 0, 0, 0, 0, "" },
				360	{ "address", 0, 0, 0, 0, 0, 0, "information on author " },
				361	{ "applet", 0, 0, 0, 0, 1, 1, "java applet " },
				362	{ "area", 0, 2, 2, 1, 0, 0, "client-side image map area " },
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	363	{ "b", 0, 3, 0, 0, 0, 0, "bold text style" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	364	{ "base", 0, 2, 2, 1, 0, 0, "document base uri " },
				365	{ "basefont", 0, 2, 2, 1, 1, 1, "base font size " },
				366	{ "bdo", 0, 0, 0, 0, 0, 0, "i18n bidi over-ride " },
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	367	{ "big", 0, 3, 0, 0, 0, 0, "large text style" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	368	{ "blockquote", 0, 0, 0, 0, 0, 0, "long quotation " },
				369	{ "body", 1, 1, 0, 0, 0, 0, "document body " },
				370	{ "br", 0, 2, 2, 1, 0, 0, "forced line break " },
				371	{ "button", 0, 0, 0, 0, 0, 0, "push button " },
				372	{ "caption", 0, 0, 0, 0, 0, 0, "table caption " },
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	373	{ "center", 0, 3, 0, 0, 1, 1, "shorthand for div align=center " },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	374	{ "cite", 0, 0, 0, 0, 0, 0, "citation" },
				375	{ "code", 0, 0, 0, 0, 0, 0, "computer code fragment" },
				376	{ "col", 0, 2, 2, 1, 0, 0, "table column " },
				377	{ "colgroup", 0, 1, 0, 0, 0, 0, "table column group " },
				378	{ "dd", 0, 1, 0, 0, 0, 0, "definition description " },
				379	{ "del", 0, 0, 0, 0, 0, 0, "deleted text " },
				380	{ "dfn", 0, 0, 0, 0, 0, 0, "instance definition" },
				381	{ "dir", 0, 0, 0, 0, 1, 1, "directory list" },
				382	{ "div", 0, 0, 0, 0, 0, 0, "generic language/style container"},
				383	{ "dl", 0, 0, 0, 0, 0, 0, "definition list " },
				384	{ "dt", 0, 1, 0, 0, 0, 0, "definition term " },
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	385	{ "em", 0, 3, 0, 0, 0, 0, "emphasis" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	386	{ "fieldset", 0, 0, 0, 0, 0, 0, "form control group " },
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	387	{ "font", 0, 3, 0, 0, 1, 1, "local change to font " },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	388	{ "form", 0, 0, 0, 0, 0, 0, "interactive form " },
				389	{ "frame", 0, 2, 2, 1, 0, 2, "subwindow " },
				390	{ "frameset", 0, 0, 0, 0, 0, 2, "window subdivision" },
				391	{ "h1", 0, 0, 0, 0, 0, 0, "heading " },
				392	{ "h2", 0, 0, 0, 0, 0, 0, "heading " },
				393	{ "h3", 0, 0, 0, 0, 0, 0, "heading " },
				394	{ "h4", 0, 0, 0, 0, 0, 0, "heading " },
				395	{ "h5", 0, 0, 0, 0, 0, 0, "heading " },
				396	{ "h6", 0, 0, 0, 0, 0, 0, "heading " },
				397	{ "head", 1, 1, 0, 0, 0, 0, "document head " },
				398	{ "hr", 0, 2, 2, 1, 0, 0, "horizontal rule " },
				399	{ "html", 1, 1, 0, 0, 0, 0, "document root element " },
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	400	{ "i", 0, 3, 0, 0, 0, 0, "italic text style" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	401	{ "iframe", 0, 0, 0, 0, 0, 1, "inline subwindow " },
				402	{ "img", 0, 2, 2, 1, 0, 0, "embedded image " },
				403	{ "input", 0, 2, 2, 1, 0, 0, "form control " },
				404	{ "ins", 0, 0, 0, 0, 0, 0, "inserted text" },
				405	{ "isindex", 0, 2, 2, 1, 1, 1, "single line prompt " },
				406	{ "kbd", 0, 0, 0, 0, 0, 0, "text to be entered by the user" },
				407	{ "label", 0, 0, 0, 0, 0, 0, "form field label text " },
				408	{ "legend", 0, 0, 0, 0, 0, 0, "fieldset legend " },
				409	{ "li", 0, 1, 1, 0, 0, 0, "list item " },
				410	{ "link", 0, 2, 2, 1, 0, 0, "a media-independent link " },
				411	{ "map", 0, 0, 0, 0, 0, 0, "client-side image map " },
				412	{ "menu", 0, 0, 0, 0, 1, 1, "menu list " },
				413	{ "meta", 0, 2, 2, 1, 0, 0, "generic metainformation " },
				414	{ "noframes", 0, 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
				415	{ "noscript", 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
				416	{ "object", 0, 0, 0, 0, 0, 0, "generic embedded object " },
				417	{ "ol", 0, 0, 0, 0, 0, 0, "ordered list " },
				418	{ "optgroup", 0, 0, 0, 0, 0, 0, "option group " },
				419	{ "option", 0, 1, 0, 0, 0, 0, "selectable choice " },
				420	{ "p", 0, 1, 1, 0, 0, 0, "paragraph " },
				421	{ "param", 0, 2, 2, 1, 0, 0, "named property value " },
				422	{ "pre", 0, 0, 0, 0, 0, 0, "preformatted text " },
				423	{ "q", 0, 0, 0, 0, 0, 0, "short inline quotation " },
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	424	{ "s", 0, 3, 0, 0, 1, 1, "strike-through text style" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	425	{ "samp", 0, 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
				426	{ "script", 0, 0, 0, 0, 0, 0, "script statements " },
				427	{ "select", 0, 0, 0, 0, 0, 0, "option selector " },
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	428	{ "small", 0, 3, 0, 0, 0, 0, "small text style" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	429	{ "span", 0, 0, 0, 0, 0, 0, "generic language/style container " },
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	430	{ "strike", 0, 3, 0, 0, 1, 1, "strike-through text" },
				431	{ "strong", 0, 3, 0, 0, 0, 0, "strong emphasis" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	432	{ "style", 0, 0, 0, 0, 0, 0, "style info " },
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	433	{ "sub", 0, 3, 0, 0, 0, 0, "subscript" },
				434	{ "sup", 0, 3, 0, 0, 0, 0, "superscript " },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	435	{ "table", 0, 0, 0, 0, 0, 0, " " },
				436	{ "tbody", 1, 0, 0, 0, 0, 0, "table body " },
				437	{ "td", 0, 0, 0, 0, 0, 0, "table data cell" },
				438	{ "textarea", 0, 0, 0, 0, 0, 0, "multi-line text field " },
				439	{ "tfoot", 0, 1, 0, 0, 0, 0, "table footer " },
				440	{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
				441	{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
				442	{ "title", 0, 0, 0, 0, 0, 0, "document title " },
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	443	{ "tr", 0, 0, 0, 0, 0, 0, "table row " },
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	444	{ "tt", 0, 3, 0, 0, 0, 0, "teletype or monospaced text style" },
				445	{ "u", 0, 3, 0, 0, 1, 1, "underlined text style" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	446	{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
				447	{ "var", 0, 0, 0, 0, 0, 0, "instance of a variable or program argument" },
				448	};
				449
				450	/*
				451	* start tags that imply the end of a current element
				452	* any tag of each line implies the end of the current element if the type of
				453	* that element is in the same line
				454	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	455	const char *htmlEquEnd[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	456	"dt", "dd", "li", "option", NULL,
				457	"h1", "h2", "h3", "h4", "h5", "h6", NULL,
				458	"ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
				459	NULL
				460	};
				461	/*
				462	* acording the HTML DTD, HR should be added to the 2nd line above, as it
				463	* is not allowed within a H1, H2, H3, etc. But we should tolerate that case
				464	* because many documents contain rules in headings...
				465	*/
				466
				467	/*
				468	* start tags that imply the end of current element
				469	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	470	const char *htmlStartClose[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	471	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				472	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				473	"listing", "xmp", "head", NULL,
				474	"head", "p", NULL,
				475	"title", "p", NULL,
				476	"body", "head", "style", "link", "title", "p", NULL,
				477	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				478	"pre", "listing", "xmp", "head", "li", NULL,
				479	"hr", "p", "head", NULL,
				480	"h1", "p", "head", NULL,
				481	"h2", "p", "head", NULL,
				482	"h3", "p", "head", NULL,
				483	"h4", "p", "head", NULL,
				484	"h5", "p", "head", NULL,
				485	"h6", "p", "head", NULL,
				486	"dir", "p", "head", NULL,
				487	"address", "p", "head", "ul", NULL,
				488	"pre", "p", "head", "ul", NULL,
				489	"listing", "p", "head", NULL,
				490	"xmp", "p", "head", NULL,
				491	"blockquote", "p", "head", NULL,
				492	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				493	"xmp", "head", NULL,
				494	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				495	"head", "dd", NULL,
				496	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				497	"head", "dt", NULL,
				498	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				499	"listing", "xmp", NULL,
				500	"ol", "p", "head", "ul", NULL,
				501	"menu", "p", "head", "ul", NULL,
				502	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
				503	"div", "p", "head", NULL,
				504	"noscript", "p", "head", NULL,
				505	"center", "font", "b", "i", "p", "head", NULL,
				506	"a", "a", NULL,
				507	"caption", "p", NULL,
				508	"colgroup", "caption", "colgroup", "col", "p", NULL,
				509	"col", "caption", "col", "p", NULL,
				510	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				511	"listing", "xmp", "a", NULL,
Daniel Veillard	43dadeb	2001-04-24 11:23:35 +0000	[diff] [blame]	512	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				513	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	514	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				515	"thead", "caption", "col", "colgroup", NULL,
				516	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				517	"tbody", "p", NULL,
				518	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				519	"tfoot", "tbody", "p", NULL,
				520	"optgroup", "option", NULL,
				521	"option", "option", NULL,
				522	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				523	"pre", "listing", "xmp", "a", NULL,
				524	NULL
				525	};
				526
				527	/*
				528	* The list of HTML elements which are supposed not to have
				529	* CDATA content and where a p element will be implied
				530	*
				531	* TODO: extend that list by reading the HTML SGML DtD on
				532	* implied paragraph
				533	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	534	static const char *htmlNoContentElements[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	535	"html",
				536	"head",
				537	"body",
				538	NULL
				539	};
				540
				541	/*
				542	* The list of HTML attributes which are of content %Script;
				543	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				544	* it assumes the name starts with 'on'
				545	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	546	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	547	"onclick",
				548	"ondblclick",
				549	"onmousedown",
				550	"onmouseup",
				551	"onmouseover",
				552	"onmousemove",
				553	"onmouseout",
				554	"onkeypress",
				555	"onkeydown",
				556	"onkeyup",
				557	"onload",
				558	"onunload",
				559	"onfocus",
				560	"onblur",
				561	"onsubmit",
				562	"onrest",
				563	"onchange",
				564	"onselect"
				565	};
				566
				567
/*
 * htmlStartCloseIndex[n] points at the first entry of the n'th group of
 * htmlStartClose; it is filled in by htmlInitAutoClose(), which also
 * raises the flag below so that the work is done only once.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
				570
				571	/************************************************************************
				572	* *
				573	* functions to handle HTML specific data *
				574	* *
				575	************************************************************************/
				576
				577	/**
				578	* htmlInitAutoClose:
				579	*
				580	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				581	* This is not reentrant. Call xmlInitParser() once before processing in
				582	* case of use in multithreaded programs.
				583	*/
				584	void
				585	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	586	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	587
				588	if (htmlStartCloseIndexinitialized) return;
				589
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	590	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				591	indx = 0;
				592	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				593	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	594	while (htmlStartClose[i] != NULL) i++;
				595	i++;
				596	}
				597	htmlStartCloseIndexinitialized = 1;
				598	}
				599
				600	/**
				601	* htmlTagLookup:
				602	* @tag: The tag name in lowercase
				603	*
				604	* Lookup the HTML tag in the ElementTable
				605	*
				606	* Returns the related htmlElemDescPtr or NULL if not found.
				607	*/
				608	htmlElemDescPtr
				609	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	610	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	611
				612	for (i = 0; i < (sizeof(html40ElementTable) /
				613	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	1ed3f88	2001-04-18 09:45:35 +0000	[diff] [blame]	614	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	615	return(&html40ElementTable[i]);
				616	}
				617	return(NULL);
				618	}
				619
				620	/**
				621	* htmlCheckAutoClose:
				622	* @newtag: The new tag name
				623	* @oldtag: The old tag name
				624	*
				625	* Checks wether the new tag is one of the registered valid tags for closing old.
				626	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				627	*
				628	* Returns 0 if no, 1 if yes.
				629	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	630	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	631	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	632	int i, indx;
				633	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	634
				635	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				636
				637	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	638	for (indx = 0; indx < 100;indx++) {
				639	closed = htmlStartCloseIndex[indx];
				640	if (closed == NULL) return(0);
				641	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	642	}
				643
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	644	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	645	i++;
				646	while (htmlStartClose[i] != NULL) {
				647	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				648	return(1);
				649	}
				650	i++;
				651	}
				652	return(0);
				653	}
				654
				655	/**
				656	* htmlAutoCloseOnClose:
				657	* @ctxt: an HTML parser context
				658	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	659	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	660	*
				661	* The HTmL DtD allows an ending tag to implicitely close other tags.
				662	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	663	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	664	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				665	htmlElemDescPtr info;
				666	xmlChar *oldname;
				667	int i;
				668
				669	#ifdef DEBUG
				670	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				671	for (i = 0;i < ctxt->nameNr;i++)
				672	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				673	#endif
				674
				675	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				676	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
				677	}
				678	if (i < 0) return;
				679
				680	while (!xmlStrEqual(newtag, ctxt->name)) {
				681	info = htmlTagLookup(ctxt->name);
				682	if ((info == NULL) \|\| (info->endTag == 1)) {
				683	#ifdef DEBUG
				684	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				685	#endif
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame^]	686	} else if (info->endTag == 3) {
				687	#ifdef DEBUG
				688	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				689	#endif
				690	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				691	ctxt->sax->error(ctxt->userData,
				692	"Opening and ending tag mismatch: %s and %s\n",
				693	newtag, ctxt->name);
				694	ctxt->wellFormed = 0;
				695	} else {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	696	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	697	}
				698	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				699	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				700	oldname = htmlnamePop(ctxt);
				701	if (oldname != NULL) {
				702	#ifdef DEBUG
				703	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				704	#endif
				705	xmlFree(oldname);
				706	}
				707	}
				708	}
				709
				710	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	711	* htmlAutoCloseOnEnd:
				712	* @ctxt: an HTML parser context
				713	*
				714	* Close all remaining tags at the end of the stream
				715	*/
				716	static void
				717	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				718	xmlChar *oldname;
				719	int i;
				720
				721	if (ctxt->nameNr == 0)
				722	return;
				723	#ifdef DEBUG
				724	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				725	#endif
				726
				727	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				728	#ifdef DEBUG
				729	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				730	#endif
				731	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				732	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				733	oldname = htmlnamePop(ctxt);
				734	if (oldname != NULL) {
				735	#ifdef DEBUG
				736	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				737	#endif
				738	xmlFree(oldname);
				739	}
				740	}
				741	}
				742
				743	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	744	* htmlAutoClose:
				745	* @ctxt: an HTML parser context
				746	* @newtag: The new tag name or NULL
				747	*
				748	* The HTmL DtD allows a tag to implicitely close other tags.
				749	* The list is kept in htmlStartClose array. This function is
				750	* called when a new tag has been detected and generates the
				751	* appropriates closes if possible/needed.
				752	* If newtag is NULL this mean we are at the end of the resource
				753	* and we should check
				754	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	755	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	756	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				757	xmlChar *oldname;
				758	while ((newtag != NULL) && (ctxt->name != NULL) &&
				759	(htmlCheckAutoClose(newtag, ctxt->name))) {
				760	#ifdef DEBUG
				761	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				762	#endif
				763	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				764	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				765	oldname = htmlnamePop(ctxt);
				766	if (oldname != NULL) {
				767	#ifdef DEBUG
				768	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				769	#endif
				770	xmlFree(oldname);
				771	}
				772	}
				773	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	774	htmlAutoCloseOnEnd(ctxt);
				775	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	776	}
				777	while ((newtag == NULL) && (ctxt->name != NULL) &&
				778	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				779	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				780	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				781	#ifdef DEBUG
				782	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				783	#endif
				784	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				785	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				786	oldname = htmlnamePop(ctxt);
				787	if (oldname != NULL) {
				788	#ifdef DEBUG
				789	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				790	#endif
				791	xmlFree(oldname);
				792	}
				793	}
				794
				795	}
				796
				797	/**
				798	* htmlAutoCloseTag:
				799	* @doc: the HTML document
				800	* @name: The tag name
				801	* @elem: the HTML element
				802	*
				803	* The HTmL DtD allows a tag to implicitely close other tags.
				804	* The list is kept in htmlStartClose array. This function checks
				805	* if the element or one of it's children would autoclose the
				806	* given tag.
				807	*
				808	* Returns 1 if autoclose, 0 otherwise
				809	*/
				810	int
				811	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				812	htmlNodePtr child;
				813
				814	if (elem == NULL) return(1);
				815	if (xmlStrEqual(name, elem->name)) return(0);
				816	if (htmlCheckAutoClose(elem->name, name)) return(1);
				817	child = elem->children;
				818	while (child != NULL) {
				819	if (htmlAutoCloseTag(doc, name, child)) return(1);
				820	child = child->next;
				821	}
				822	return(0);
				823	}
				824
				825	/**
				826	* htmlIsAutoClosed:
				827	* @doc: the HTML document
				828	* @elem: the HTML element
				829	*
				830	* The HTmL DtD allows a tag to implicitely close other tags.
				831	* The list is kept in htmlStartClose array. This function checks
				832	* if a tag is autoclosed by one of it's child
				833	*
				834	* Returns 1 if autoclosed, 0 otherwise
				835	*/
				836	int
				837	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				838	htmlNodePtr child;
				839
				840	if (elem == NULL) return(1);
				841	child = elem->children;
				842	while (child != NULL) {
				843	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				844	child = child->next;
				845	}
				846	return(0);
				847	}
				848
				849	/**
				850	* htmlCheckImplied:
				851	* @ctxt: an HTML parser context
				852	* @newtag: The new tag name
				853	*
				854	* The HTML DtD allows a tag to exists only implicitely
				855	* called when a new tag has been detected and generates the
				856	* appropriates implicit tags if missing
				857	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	858	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	859	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				860	if (!htmlOmittedDefaultValue)
				861	return;
				862	if (xmlStrEqual(newtag, BAD_CAST"html"))
				863	return;
				864	if (ctxt->nameNr <= 0) {
				865	#ifdef DEBUG
				866	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				867	#endif
				868	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				869	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				870	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				871	}
				872	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				873	return;
				874	if ((ctxt->nameNr <= 1) &&
				875	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				876	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				877	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				878	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				879	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				880	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				881	/*
				882	* dropped OBJECT ... i you put it first BODY will be
				883	* assumed !
				884	*/
				885	#ifdef DEBUG
				886	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				887	#endif
				888	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				889	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				890	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				891	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				892	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				893	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				894	int i;
				895	for (i = 0;i < ctxt->nameNr;i++) {
				896	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				897	return;
				898	}
				899	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				900	return;
				901	}
				902	}
				903
				904	#ifdef DEBUG
				905	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				906	#endif
				907	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				908	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				909	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				910	}
				911	}
				912
				913	/**
				914	* htmlCheckParagraph
				915	* @ctxt: an HTML parser context
				916	*
				917	* Check whether a p element need to be implied before inserting
				918	* characters in the current element.
				919	*
				920	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				921	* in case of error.
				922	*/
				923
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	924	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	925	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				926	const xmlChar *tag;
				927	int i;
				928
				929	if (ctxt == NULL)
				930	return(-1);
				931	tag = ctxt->name;
				932	if (tag == NULL) {
				933	htmlAutoClose(ctxt, BAD_CAST"p");
				934	htmlCheckImplied(ctxt, BAD_CAST"p");
				935	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				936	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				937	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				938	return(1);
				939	}
				940	if (!htmlOmittedDefaultValue)
				941	return(0);
				942	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				943	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				944	#ifdef DEBUG
				945	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				946	#endif
				947	htmlAutoClose(ctxt, BAD_CAST"p");
				948	htmlCheckImplied(ctxt, BAD_CAST"p");
				949	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				950	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				951	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				952	return(1);
				953	}
				954	}
				955	return(0);
				956	}
				957
				958	/**
				959	* htmlIsScriptAttribute:
				960	* @name: an attribute name
				961	*
				962	* Check if an attribute is of content type Script
				963	*
				964	* Returns 1 is the attribute is a script 0 otherwise
				965	*/
				966	int
				967	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	968	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	969
				970	if (name == NULL)
				971	return(0);
				972	/*
				973	* all script attributes start with 'on'
				974	*/
				975	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				976	return(0);
				977	for (i = 0;
				978	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				979	i++) {
				980	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				981	return(1);
				982	}
				983	return(0);
				984	}
				985
				986	/************************************************************************
				987	* *
				988	* The list of HTML predefined entities *
				989	* *
				990	************************************************************************/
				991
				992
				993	htmlEntityDesc html40EntitiesTable[] = {
				994	/*
				995	* the 4 absolute ones, plus apostrophe.
				996	*/
				997	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				998	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				999	{ 39, "apos", "single quote" },
				1000	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1001	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1002
				1003	/*
				1004	* A bunch still in the 128-255 range
				1005	* Replacing them depend really on the charset used.
				1006	*/
				1007	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1008	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1009	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1010	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1011	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1012	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1013	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1014	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1015	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1016	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1017	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1018	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1019	{ 172, "not", "not sign, U+00AC ISOnum" },
				1020	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1021	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1022	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1023	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1024	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1025	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1026	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1027	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1028	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1029	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1030	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1031	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1032	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1033	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1034	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1035	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1036	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1037	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1038	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1039	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1040	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1041	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1042	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1043	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1044	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1045	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1046	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1047	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1048	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1049	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1050	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1051	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1052	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1053	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1054	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1055	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1056	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1057	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1058	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1059	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1060	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1061	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1062	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1063	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1064	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1065	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1066	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1067	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1068	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1069	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1070	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1071	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1072	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1073	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1074	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1075	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1076	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1077	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1078	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1079	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1080	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1081	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1082	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1083	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1084	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1085	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1086	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1087	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1088	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1089	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1090	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1091	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1092	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1093	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1094	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1095	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1096	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1097	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1098	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1099	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1100	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1101	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1102	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1103
				1104	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1105	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1106	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1107	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1108	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1109
				1110	/*
				1111	* Anything below should really be kept as entities references
				1112	*/
				1113	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1114
				1115	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1116	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1117
				1118	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1119	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1120	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1121	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1122	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1123	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1124	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1125	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1126	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1127	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1128	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1129	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1130	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1131	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1132	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1133	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1134	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1135	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1136	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1137	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1138	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1139	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1140	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1141	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1142
				1143	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1144	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1145	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1146	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1147	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1148	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1149	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1150	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1151	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1152	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1153	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1154	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1155	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1156	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1157	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1158	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1159	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1160	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1161	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1162	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1163	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1164	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1165	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1166	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1167	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1168	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1169	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1170	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1171
				1172	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1173	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1174	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1175	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1176	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1177	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1178	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1179	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1180	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1181	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1182	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1183	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1184	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1185	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1186	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1187	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1188	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1189
				1190	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1191	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1192
				1193	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1194
				1195	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1196	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1197
				1198	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1199	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1200
				1201	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1202	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1203
				1204	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1205
				1206	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1207	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1208	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1209	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1210	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1211	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1212	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1213	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1214	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1215	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1216	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1217	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1218	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1219	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1220	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1221	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1222
				1223	{ 8704, "forall","for all, U+2200 ISOtech" },
				1224	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1225	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1226	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1227	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1228	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1229	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1230	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1231	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1232	{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
				1233	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1234	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1235	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1236	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1237	{ 8734, "infin","infinity, U+221E ISOtech" },
				1238	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1239	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1240	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1241	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1242	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1243	{ 8747, "int", "integral, U+222B ISOtech" },
				1244	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1245	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1246	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1247	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1248	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1249	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1250	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1251	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1252	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1253	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1254	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1255	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1256	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1257	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1258	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1259	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1260	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1261	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1262	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1263	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1264	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1265	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1266	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1267	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1268
				1269	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1270	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1271	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1272	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1273
				1274	};
				1275
				1276	/************************************************************************
				1277	* *
				1278	* Commodity functions to handle entities *
				1279	* *
				1280	************************************************************************/
				1281
				1282	/*
				1283	* Macro used to grow the current buffer.
				1284	*/
				1285	#define growBuffer(buffer) { \
				1286	buffer##_size *= 2; \
				1287	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1288	if (buffer == NULL) { \
				1289	perror("realloc failed"); \
				1290	return(NULL); \
				1291	} \
				1292	}
				1293
				1294	/**
				1295	* htmlEntityLookup:
				1296	* @name: the entity name
				1297	*
				1298	* Lookup the given entity in EntitiesTable
				1299	*
				1300	* TODO: the linear scan is really ugly, an hash table is really needed.
				1301	*
				1302	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1303	*/
				1304	htmlEntityDescPtr
				1305	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1306	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1307
				1308	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1309	sizeof(html40EntitiesTable[0]));i++) {
				1310	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1311	#ifdef DEBUG
				1312	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1313	#endif
				1314	return(&html40EntitiesTable[i]);
				1315	}
				1316	}
				1317	return(NULL);
				1318	}
				1319
				1320	/**
				1321	* htmlEntityValueLookup:
				1322	* @value: the entity's unicode value
				1323	*
				1324	* Lookup the given entity in EntitiesTable
				1325	*
				1326	* TODO: the linear scan is really ugly, an hash table is really needed.
				1327	*
				1328	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1329	*/
				1330	htmlEntityDescPtr
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1331	htmlEntityValueLookup(unsigned int value) {
				1332	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1333	#ifdef DEBUG
				1334	int lv = 0;
				1335	#endif
				1336
				1337	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1338	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1339	if (html40EntitiesTable[i].value >= value) {
				1340	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1341	break;
				1342	#ifdef DEBUG
				1343	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1344	#endif
				1345	return(&html40EntitiesTable[i]);
				1346	}
				1347	#ifdef DEBUG
				1348	if (lv > html40EntitiesTable[i].value) {
				1349	xmlGenericError(xmlGenericErrorContext,
				1350	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1351	lv, html40EntitiesTable[i].value);
				1352	}
				1353	lv = html40EntitiesTable[i].value;
				1354	#endif
				1355	}
				1356	return(NULL);
				1357	}
				1358
				1359	/**
				1360	* UTF8ToHtml:
				1361	* @out: a pointer to an array of bytes to store the result
				1362	* @outlen: the length of @out
				1363	* @in: a pointer to an array of UTF-8 chars
				1364	* @inlen: the length of @in
				1365	*
				1366	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1367	* plus HTML entities block of chars out.
				1368	*
				1369	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1370	* The value of @inlen after return is the number of octets consumed
				1371	* as the return value is positive, else unpredictiable.
				1372	* The value of @outlen after return is the number of octets consumed.
				1373	*/
				1374	int
				1375	UTF8ToHtml(unsigned char* out, int *outlen,
				1376	const unsigned char* in, int *inlen) {
				1377	const unsigned char* processed = in;
				1378	const unsigned char* outend;
				1379	const unsigned char* outstart = out;
				1380	const unsigned char* instart = in;
				1381	const unsigned char* inend;
				1382	unsigned int c, d;
				1383	int trailing;
				1384
				1385	if (in == NULL) {
				1386	/*
				1387	* initialization nothing to do
				1388	*/
				1389	*outlen = 0;
				1390	*inlen = 0;
				1391	return(0);
				1392	}
				1393	inend = in + (*inlen);
				1394	outend = out + (*outlen);
				1395	while (in < inend) {
				1396	d = *in++;
				1397	if (d < 0x80) { c= d; trailing= 0; }
				1398	else if (d < 0xC0) {
				1399	/* trailing byte in leading position */
				1400	*outlen = out - outstart;
				1401	*inlen = processed - instart;
				1402	return(-2);
				1403	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1404	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1405	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1406	else {
				1407	/* no chance for this in Ascii */
				1408	*outlen = out - outstart;
				1409	*inlen = processed - instart;
				1410	return(-2);
				1411	}
				1412
				1413	if (inend - in < trailing) {
				1414	break;
				1415	}
				1416
				1417	for ( ; trailing; trailing--) {
				1418	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1419	break;
				1420	c <<= 6;
				1421	c \|= d & 0x3F;
				1422	}
				1423
				1424	/* assertion: c is a single UTF-4 value */
				1425	if (c < 0x80) {
				1426	if (out + 1 >= outend)
				1427	break;
				1428	*out++ = c;
				1429	} else {
				1430	int len;
				1431	htmlEntityDescPtr ent;
				1432
				1433	/*
				1434	* Try to lookup a predefined HTML entity for it
				1435	*/
				1436
				1437	ent = htmlEntityValueLookup(c);
				1438	if (ent == NULL) {
				1439	/* no chance for this in Ascii */
				1440	*outlen = out - outstart;
				1441	*inlen = processed - instart;
				1442	return(-2);
				1443	}
				1444	len = strlen(ent->name);
				1445	if (out + 2 + len >= outend)
				1446	break;
				1447	*out++ = '&';
				1448	memcpy(out, ent->name, len);
				1449	out += len;
				1450	*out++ = ';';
				1451	}
				1452	processed = in;
				1453	}
				1454	*outlen = out - outstart;
				1455	*inlen = processed - instart;
				1456	return(0);
				1457	}
				1458
				1459	/**
				1460	* htmlEncodeEntities:
				1461	* @out: a pointer to an array of bytes to store the result
				1462	* @outlen: the length of @out
				1463	* @in: a pointer to an array of UTF-8 chars
				1464	* @inlen: the length of @in
				1465	* @quoteChar: the quote character to escape (' or ") or zero.
				1466	*
				1467	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1468	* plus HTML entities block of chars out.
				1469	*
				1470	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1471	* The value of @inlen after return is the number of octets consumed
				1472	* as the return value is positive, else unpredictiable.
				1473	* The value of @outlen after return is the number of octets consumed.
				1474	*/
				1475	int
				1476	htmlEncodeEntities(unsigned char* out, int *outlen,
				1477	const unsigned char* in, int *inlen, int quoteChar) {
				1478	const unsigned char* processed = in;
				1479	const unsigned char* outend = out + (*outlen);
				1480	const unsigned char* outstart = out;
				1481	const unsigned char* instart = in;
				1482	const unsigned char* inend = in + (*inlen);
				1483	unsigned int c, d;
				1484	int trailing;
				1485
				1486	while (in < inend) {
				1487	d = *in++;
				1488	if (d < 0x80) { c= d; trailing= 0; }
				1489	else if (d < 0xC0) {
				1490	/* trailing byte in leading position */
				1491	*outlen = out - outstart;
				1492	*inlen = processed - instart;
				1493	return(-2);
				1494	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1495	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1496	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1497	else {
				1498	/* no chance for this in Ascii */
				1499	*outlen = out - outstart;
				1500	*inlen = processed - instart;
				1501	return(-2);
				1502	}
				1503
				1504	if (inend - in < trailing)
				1505	break;
				1506
				1507	while (trailing--) {
				1508	if (((d= *in++) & 0xC0) != 0x80) {
				1509	*outlen = out - outstart;
				1510	*inlen = processed - instart;
				1511	return(-2);
				1512	}
				1513	c <<= 6;
				1514	c \|= d & 0x3F;
				1515	}
				1516
				1517	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1518	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1519	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1520	if (out >= outend)
				1521	break;
				1522	*out++ = c;
				1523	} else {
				1524	htmlEntityDescPtr ent;
				1525	const char *cp;
				1526	char nbuf[16];
				1527	int len;
				1528
				1529	/*
				1530	* Try to lookup a predefined HTML entity for it
				1531	*/
				1532	ent = htmlEntityValueLookup(c);
				1533	if (ent == NULL) {
				1534	sprintf(nbuf, "#%u", c);
				1535	cp = nbuf;
				1536	}
				1537	else
				1538	cp = ent->name;
				1539	len = strlen(cp);
				1540	if (out + 2 + len > outend)
				1541	break;
				1542	*out++ = '&';
				1543	memcpy(out, cp, len);
				1544	out += len;
				1545	*out++ = ';';
				1546	}
				1547	processed = in;
				1548	}
				1549	*outlen = out - outstart;
				1550	*inlen = processed - instart;
				1551	return(0);
				1552	}
				1553
				1554	/**
				1555	* htmlDecodeEntities:
				1556	* @ctxt: the parser context
				1557	* @len: the len to decode (in bytes !), -1 for no size limit
				1558	* @end: an end marker xmlChar, 0 if none
				1559	* @end2: an end marker xmlChar, 0 if none
				1560	* @end3: an end marker xmlChar, 0 if none
				1561	*
				1562	* Subtitute the HTML entities by their value
				1563	*
				1564	* DEPRECATED !!!!
				1565	*
				1566	* Returns A newly allocated string with the substitution done. The caller
				1567	* must deallocate it !
				1568	*/
				1569	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1570	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1571	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1572	static int deprecated = 0;
				1573	if (!deprecated) {
				1574	xmlGenericError(xmlGenericErrorContext,
				1575	"htmlDecodeEntities() deprecated function reached\n");
				1576	deprecated = 1;
				1577	}
				1578	return(NULL);
				1579	#if 0
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1580	xmlChar *name = NULL;
				1581	xmlChar *buffer = NULL;
				1582	unsigned int buffer_size = 0;
				1583	unsigned int nbchars = 0;
				1584	htmlEntityDescPtr ent;
				1585	unsigned int max = (unsigned int) len;
				1586	int c,l;
				1587
				1588	if (ctxt->depth > 40) {
				1589	ctxt->errNo = XML_ERR_ENTITY_LOOP;
				1590	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1591	ctxt->sax->error(ctxt->userData,
				1592	"Detected entity reference loop\n");
				1593	ctxt->wellFormed = 0;
				1594	ctxt->disableSAX = 1;
				1595	return(NULL);
				1596	}
				1597
				1598	/*
				1599	* allocate a translation buffer.
				1600	*/
				1601	buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
				1602	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1603	if (buffer == NULL) {
				1604	perror("xmlDecodeEntities: malloc failed");
				1605	return(NULL);
				1606	}
				1607
				1608	/*
				1609	* Ok loop until we reach one of the ending char or a size limit.
				1610	*/
				1611	c = CUR_CHAR(l);
				1612	while ((nbchars < max) && (c != end) &&
				1613	(c != end2) && (c != end3)) {
				1614
				1615	if (c == 0) break;
				1616	if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
				1617	int val = htmlParseCharRef(ctxt);
				1618	COPY_BUF(0,buffer,nbchars,val);
				1619	NEXTL(l);
				1620	} else if ((c == '&') && (ctxt->token != '&')) {
				1621	ent = htmlParseEntityRef(ctxt, &name);
				1622	if (name != NULL) {
				1623	if (ent != NULL) {
				1624	int val = ent->value;
				1625	COPY_BUF(0,buffer,nbchars,val);
				1626	NEXTL(l);
				1627	} else {
				1628	const xmlChar *cur = name;
				1629
				1630	buffer[nbchars++] = '&';
				1631	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1632	growBuffer(buffer);
				1633	}
				1634	while (*cur != 0) {
				1635	buffer[nbchars++] = *cur++;
				1636	}
				1637	buffer[nbchars++] = ';';
				1638	}
				1639	}
				1640	} else {
				1641	COPY_BUF(l,buffer,nbchars,c);
				1642	NEXTL(l);
				1643	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1644	growBuffer(buffer);
				1645	}
				1646	}
				1647	c = CUR_CHAR(l);
				1648	}
				1649	buffer[nbchars++] = 0;
				1650	return(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1651	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1652	}
				1653
				1654	/************************************************************************
				1655	* *
				1656	* Commodity functions to handle streams *
				1657	* *
				1658	************************************************************************/
				1659
				1660	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1661	* htmlNewInputStream:
				1662	* @ctxt: an HTML parser context
				1663	*
				1664	* Create a new input stream structure
				1665	* Returns the new input stream or NULL
				1666	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1667	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1668	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1669	htmlParserInputPtr input;
				1670
				1671	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1672	if (input == NULL) {
				1673	ctxt->errNo = XML_ERR_NO_MEMORY;
				1674	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1675	ctxt->sax->error(ctxt->userData,
				1676	"malloc: couldn't allocate a new input stream\n");
				1677	return(NULL);
				1678	}
				1679	memset(input, 0, sizeof(htmlParserInput));
				1680	input->filename = NULL;
				1681	input->directory = NULL;
				1682	input->base = NULL;
				1683	input->cur = NULL;
				1684	input->buf = NULL;
				1685	input->line = 1;
				1686	input->col = 1;
				1687	input->buf = NULL;
				1688	input->free = NULL;
				1689	input->version = NULL;
				1690	input->consumed = 0;
				1691	input->length = 0;
				1692	return(input);
				1693	}
				1694
				1695
				1696	/************************************************************************
				1697	* *
				1698	* Commodity functions, cleanup needed ? *
				1699	* *
				1700	************************************************************************/
				1701
				1702	/**
				1703	* areBlanks:
				1704	* @ctxt: an HTML parser context
				1705	* @str: a xmlChar *
				1706	* @len: the size of @str
				1707	*
				1708	* Is this a sequence of blank chars that one can ignore ?
				1709	*
				1710	* Returns 1 if ignorable 0 otherwise.
				1711	*/
				1712
				1713	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				1714	int i;
				1715	xmlNodePtr lastChild;
				1716
				1717	for (i = 0;i < len;i++)
				1718	if (!(IS_BLANK(str[i]))) return(0);
				1719
				1720	if (CUR == 0) return(1);
				1721	if (CUR != '<') return(0);
				1722	if (ctxt->name == NULL)
				1723	return(1);
				1724	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1725	return(1);
				1726	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1727	return(1);
				1728	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1729	return(1);
				1730	if (ctxt->node == NULL) return(0);
				1731	lastChild = xmlGetLastChild(ctxt->node);
				1732	if (lastChild == NULL) {
				1733	if (ctxt->node->content != NULL) return(0);
				1734	} else if (xmlNodeIsText(lastChild)) {
				1735	return(0);
				1736	} else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
				1737	return(0);
				1738	} else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
				1739	return(0);
				1740	} else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
				1741	return(0);
				1742	}
				1743	return(1);
				1744	}
				1745
				1746	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1747	* htmlNewDocNoDtD:
				1748	* @URI: URI for the dtd, or NULL
				1749	* @ExternalID: the external ID of the DTD, or NULL
				1750	*
				1751	* Returns a new document, do not intialize the DTD if not provided
				1752	*/
				1753	htmlDocPtr
				1754	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1755	xmlDocPtr cur;
				1756
				1757	/*
				1758	* Allocate a new document and fill the fields.
				1759	*/
				1760	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1761	if (cur == NULL) {
				1762	xmlGenericError(xmlGenericErrorContext,
				1763	"xmlNewDoc : malloc failed\n");
				1764	return(NULL);
				1765	}
				1766	memset(cur, 0, sizeof(xmlDoc));
				1767
				1768	cur->type = XML_HTML_DOCUMENT_NODE;
				1769	cur->version = NULL;
				1770	cur->intSubset = NULL;
				1771	if ((ExternalID != NULL) \|\|
				1772	(URI != NULL))
				1773	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
				1774	cur->doc = cur;
				1775	cur->name = NULL;
				1776	cur->children = NULL;
				1777	cur->extSubset = NULL;
				1778	cur->oldNs = NULL;
				1779	cur->encoding = NULL;
				1780	cur->standalone = 1;
				1781	cur->compression = 0;
				1782	cur->ids = NULL;
				1783	cur->refs = NULL;
				1784	#ifndef XML_WITHOUT_CORBA
				1785	cur->_private = NULL;
				1786	#endif
				1787	return(cur);
				1788	}
				1789
				1790	/**
				1791	* htmlNewDoc:
				1792	* @URI: URI for the dtd, or NULL
				1793	* @ExternalID: the external ID of the DTD, or NULL
				1794	*
				1795	* Returns a new document
				1796	*/
				1797	htmlDocPtr
				1798	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1799	if ((URI == NULL) && (ExternalID == NULL))
				1800	return(htmlNewDocNoDtD(
				1801	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				1802	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
				1803
				1804	return(htmlNewDocNoDtD(URI, ExternalID));
				1805	}
				1806
				1807
				1808	/************************************************************************
				1809	* *
				1810	* The parser itself *
				1811	* Relates to http://www.w3.org/TR/html40 *
				1812	* *
				1813	************************************************************************/
				1814
				1815	/************************************************************************
				1816	* *
				1817	* The parser itself *
				1818	* *
				1819	************************************************************************/
				1820
				1821	/**
				1822	* htmlParseHTMLName:
				1823	* @ctxt: an HTML parser context
				1824	*
				1825	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1826	* since HTML names are not case-sensitive.
				1827	*
				1828	* Returns the Tag Name parsed or NULL
				1829	*/
				1830
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1831	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1832	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1833	xmlChar *ret = NULL;
				1834	int i = 0;
				1835	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1836
				1837	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1838	(CUR != ':')) return(NULL);
				1839
				1840	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1841	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1842	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1843	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1844	else loc[i] = CUR;
				1845	i++;
				1846
				1847	NEXT;
				1848	}
				1849
				1850	ret = xmlStrndup(loc, i);
				1851
				1852	return(ret);
				1853	}
				1854
				1855	/**
				1856	* htmlParseName:
				1857	* @ctxt: an HTML parser context
				1858	*
				1859	* parse an HTML name, this routine is case sensistive.
				1860	*
				1861	* Returns the Name parsed or NULL
				1862	*/
				1863
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1864	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1865	htmlParseName(htmlParserCtxtPtr ctxt) {
				1866	xmlChar buf[HTML_MAX_NAMELEN];
				1867	int len = 0;
				1868
				1869	GROW;
				1870	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1871	return(NULL);
				1872	}
				1873
				1874	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1875	(CUR == '.') \|\| (CUR == '-') \|\|
				1876	(CUR == '_') \|\| (CUR == ':') \|\|
				1877	(IS_COMBINING(CUR)) \|\|
				1878	(IS_EXTENDER(CUR))) {
				1879	buf[len++] = CUR;
				1880	NEXT;
				1881	if (len >= HTML_MAX_NAMELEN) {
				1882	xmlGenericError(xmlGenericErrorContext,
				1883	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1884	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1885	(CUR == '.') \|\| (CUR == '-') \|\|
				1886	(CUR == '_') \|\| (CUR == ':') \|\|
				1887	(IS_COMBINING(CUR)) \|\|
				1888	(IS_EXTENDER(CUR)))
				1889	NEXT;
				1890	break;
				1891	}
				1892	}
				1893	return(xmlStrndup(buf, len));
				1894	}
				1895
				1896	/**
				1897	* htmlParseHTMLAttribute:
				1898	* @ctxt: an HTML parser context
				1899	* @stop: a char stop value
				1900	*
				1901	* parse an HTML attribute value till the stop (quote), if
				1902	* stop is 0 then it stops at the first space
				1903	*
				1904	* Returns the attribute parsed or NULL
				1905	*/
				1906
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1907	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1908	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1909	xmlChar *buffer = NULL;
				1910	int buffer_size = 0;
				1911	xmlChar *out = NULL;
				1912	xmlChar *name = NULL;
				1913
				1914	xmlChar *cur = NULL;
				1915	htmlEntityDescPtr ent;
				1916
				1917	/*
				1918	* allocate a translation buffer.
				1919	*/
				1920	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1921	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1922	if (buffer == NULL) {
				1923	perror("htmlParseHTMLAttribute: malloc failed");
				1924	return(NULL);
				1925	}
				1926	out = buffer;
				1927
				1928	/*
				1929	* Ok loop until we reach one of the ending chars
				1930	*/
				1931	while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
				1932	if ((stop == 0) && (IS_BLANK(CUR))) break;
				1933	if (CUR == '&') {
				1934	if (NXT(1) == '#') {
				1935	unsigned int c;
				1936	int bits;
				1937
				1938	c = htmlParseCharRef(ctxt);
				1939	if (c < 0x80)
				1940	{ *out++ = c; bits= -6; }
				1941	else if (c < 0x800)
				1942	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1943	else if (c < 0x10000)
				1944	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1945	else
				1946	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1947
				1948	for ( ; bits >= 0; bits-= 6) {
				1949	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				1950	}
				1951	} else {
				1952	ent = htmlParseEntityRef(ctxt, &name);
				1953	if (name == NULL) {
				1954	*out++ = '&';
				1955	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1956	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1957
				1958	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1959	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1960	}
				1961	} else if (ent == NULL) {
				1962	*out++ = '&';
				1963	cur = name;
				1964	while (*cur != 0) {
				1965	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1966	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1967
				1968	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1969	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1970	}
				1971	out++ = cur++;
				1972	}
				1973	xmlFree(name);
				1974	} else {
				1975	unsigned int c;
				1976	int bits;
				1977
				1978	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1979	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1980
				1981	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1982	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1983	}
				1984	c = (xmlChar)ent->value;
				1985	if (c < 0x80)
				1986	{ *out++ = c; bits= -6; }
				1987	else if (c < 0x800)
				1988	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1989	else if (c < 0x10000)
				1990	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1991	else
				1992	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1993
				1994	for ( ; bits >= 0; bits-= 6) {
				1995	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				1996	}
				1997	xmlFree(name);
				1998	}
				1999	}
				2000	} else {
				2001	unsigned int c;
				2002	int bits, l;
				2003
				2004	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2005	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2006
				2007	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2008	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2009	}
				2010	c = CUR_CHAR(l);
				2011	if (c < 0x80)
				2012	{ *out++ = c; bits= -6; }
				2013	else if (c < 0x800)
				2014	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2015	else if (c < 0x10000)
				2016	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2017	else
				2018	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2019
				2020	for ( ; bits >= 0; bits-= 6) {
				2021	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2022	}
				2023	NEXT;
				2024	}
				2025	}
				2026	*out++ = 0;
				2027	return(buffer);
				2028	}
				2029
				2030	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2031	* htmlParseEntityRef:
				2032	* @ctxt: an HTML parser context
				2033	* @str: location to store the entity name
				2034	*
				2035	* parse an HTML ENTITY references
				2036	*
				2037	* [68] EntityRef ::= '&' Name ';'
				2038	*
				2039	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2040	* if non-NULL *str will have to be freed by the caller.
				2041	*/
				2042	htmlEntityDescPtr
				2043	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2044	xmlChar *name;
				2045	htmlEntityDescPtr ent = NULL;
				2046	*str = NULL;
				2047
				2048	if (CUR == '&') {
				2049	NEXT;
				2050	name = htmlParseName(ctxt);
				2051	if (name == NULL) {
				2052	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2053	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2054	ctxt->wellFormed = 0;
				2055	} else {
				2056	GROW;
				2057	if (CUR == ';') {
				2058	*str = name;
				2059
				2060	/*
				2061	* Lookup the entity in the table.
				2062	*/
				2063	ent = htmlEntityLookup(name);
				2064	if (ent != NULL) /* OK that's ugly !!! */
				2065	NEXT;
				2066	} else {
				2067	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2068	ctxt->sax->error(ctxt->userData,
				2069	"htmlParseEntityRef: expecting ';'\n");
				2070	*str = name;
				2071	}
				2072	}
				2073	}
				2074	return(ent);
				2075	}
				2076
				2077	/**
				2078	* htmlParseAttValue:
				2079	* @ctxt: an HTML parser context
				2080	*
				2081	* parse a value for an attribute
				2082	* Note: the parser won't do substitution of entities here, this
				2083	* will be handled later in xmlStringGetNodeList, unless it was
				2084	* asked for ctxt->replaceEntities != 0
				2085	*
				2086	* Returns the AttValue parsed or NULL.
				2087	*/
				2088
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2089	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2090	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2091	xmlChar *ret = NULL;
				2092
				2093	if (CUR == '"') {
				2094	NEXT;
				2095	ret = htmlParseHTMLAttribute(ctxt, '"');
				2096	if (CUR != '"') {
				2097	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2098	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2099	ctxt->wellFormed = 0;
				2100	} else
				2101	NEXT;
				2102	} else if (CUR == '\'') {
				2103	NEXT;
				2104	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2105	if (CUR != '\'') {
				2106	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2107	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2108	ctxt->wellFormed = 0;
				2109	} else
				2110	NEXT;
				2111	} else {
				2112	/*
				2113	* That's an HTMLism, the attribute value may not be quoted
				2114	*/
				2115	ret = htmlParseHTMLAttribute(ctxt, 0);
				2116	if (ret == NULL) {
				2117	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2118	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2119	ctxt->wellFormed = 0;
				2120	}
				2121	}
				2122	return(ret);
				2123	}
				2124
				2125	/**
				2126	* htmlParseSystemLiteral:
				2127	* @ctxt: an HTML parser context
				2128	*
				2129	* parse an HTML Literal
				2130	*
				2131	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2132	*
				2133	* Returns the SystemLiteral parsed or NULL
				2134	*/
				2135
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2136	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2137	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2138	const xmlChar *q;
				2139	xmlChar *ret = NULL;
				2140
				2141	if (CUR == '"') {
				2142	NEXT;
				2143	q = CUR_PTR;
				2144	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2145	NEXT;
				2146	if (!IS_CHAR(CUR)) {
				2147	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2148	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2149	ctxt->wellFormed = 0;
				2150	} else {
				2151	ret = xmlStrndup(q, CUR_PTR - q);
				2152	NEXT;
				2153	}
				2154	} else if (CUR == '\'') {
				2155	NEXT;
				2156	q = CUR_PTR;
				2157	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2158	NEXT;
				2159	if (!IS_CHAR(CUR)) {
				2160	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2161	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2162	ctxt->wellFormed = 0;
				2163	} else {
				2164	ret = xmlStrndup(q, CUR_PTR - q);
				2165	NEXT;
				2166	}
				2167	} else {
				2168	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2169	ctxt->sax->error(ctxt->userData,
				2170	"SystemLiteral \" or ' expected\n");
				2171	ctxt->wellFormed = 0;
				2172	}
				2173
				2174	return(ret);
				2175	}
				2176
				2177	/**
				2178	* htmlParsePubidLiteral:
				2179	* @ctxt: an HTML parser context
				2180	*
				2181	* parse an HTML public literal
				2182	*
				2183	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2184	*
				2185	* Returns the PubidLiteral parsed or NULL.
				2186	*/
				2187
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2188	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2189	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2190	const xmlChar *q;
				2191	xmlChar *ret = NULL;
				2192	/*
				2193	* Name ::= (Letter \| '_') (NameChar)*
				2194	*/
				2195	if (CUR == '"') {
				2196	NEXT;
				2197	q = CUR_PTR;
				2198	while (IS_PUBIDCHAR(CUR)) NEXT;
				2199	if (CUR != '"') {
				2200	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2201	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2202	ctxt->wellFormed = 0;
				2203	} else {
				2204	ret = xmlStrndup(q, CUR_PTR - q);
				2205	NEXT;
				2206	}
				2207	} else if (CUR == '\'') {
				2208	NEXT;
				2209	q = CUR_PTR;
				2210	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2211	NEXT;
				2212	if (!IS_LETTER(CUR)) {
				2213	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2214	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2215	ctxt->wellFormed = 0;
				2216	} else {
				2217	ret = xmlStrndup(q, CUR_PTR - q);
				2218	NEXT;
				2219	}
				2220	} else {
				2221	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2222	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2223	ctxt->wellFormed = 0;
				2224	}
				2225
				2226	return(ret);
				2227	}
				2228
				2229	/**
				2230	* htmlParseScript:
				2231	* @ctxt: an HTML parser context
				2232	*
				2233	* parse the content of an HTML SCRIPT or STYLE element
				2234	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2235	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2236	* http://www.w3.org/TR/html4/types.html#type-script
				2237	* http://www.w3.org/TR/html4/types.html#h-6.15
				2238	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2239	*
				2240	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2241	* element and the value of intrinsic event attributes. User agents must
				2242	* not evaluate script data as HTML markup but instead must pass it on as
				2243	* data to a script engine.
				2244	* NOTES:
				2245	* - The content is passed like CDATA
				2246	* - the attributes for style and scripting "onXXX" are also described
				2247	* as CDATA but SGML allows entities references in attributes so their
				2248	* processing is identical as other attributes
				2249	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2250	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2251	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2252	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2253	int nbchar = 0;
				2254	xmlChar cur;
				2255
				2256	SHRINK;
				2257	cur = CUR;
				2258	while (IS_CHAR(cur)) {
				2259	if ((cur == '<') && (NXT(1) == '/')) {
				2260	/*
				2261	* One should break here, the specification is clear:
				2262	* Authors should therefore escape "</" within the content.
				2263	* Escape mechanisms are specific to each scripting or
				2264	* style sheet language.
				2265	*/
				2266	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2267	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2268	break; /* while */
				2269	}
				2270	buf[nbchar++] = cur;
				2271	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2272	if (ctxt->sax->cdataBlock!= NULL) {
				2273	/*
				2274	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2275	*/
				2276	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2277	}
				2278	nbchar = 0;
				2279	}
				2280	NEXT;
				2281	cur = CUR;
				2282	}
				2283	if (!(IS_CHAR(cur))) {
				2284	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2285	ctxt->sax->error(ctxt->userData,
				2286	"Invalid char in CDATA 0x%X\n", cur);
				2287	ctxt->wellFormed = 0;
				2288	NEXT;
				2289	}
				2290
				2291	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2292	if (ctxt->sax->cdataBlock!= NULL) {
				2293	/*
				2294	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2295	*/
				2296	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2297	}
				2298	}
				2299	}
				2300
				2301
				2302	/**
				2303	* htmlParseCharData:
				2304	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2305	*
				2306	* parse a CharData section.
				2307	* if we are within a CDATA section ']]>' marks an end of section.
				2308	*
				2309	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2310	*/
				2311
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2312	static void
				2313	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2314	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2315	int nbchar = 0;
				2316	int cur, l;
				2317
				2318	SHRINK;
				2319	cur = CUR_CHAR(l);
				2320	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2321	((cur != '&') \|\| (ctxt->token == '&')) &&
				2322	(IS_CHAR(cur))) {
				2323	COPY_BUF(l,buf,nbchar,cur);
				2324	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2325	/*
				2326	* Ok the segment is to be consumed as chars.
				2327	*/
				2328	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2329	if (areBlanks(ctxt, buf, nbchar)) {
				2330	if (ctxt->sax->ignorableWhitespace != NULL)
				2331	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2332	buf, nbchar);
				2333	} else {
				2334	htmlCheckParagraph(ctxt);
				2335	if (ctxt->sax->characters != NULL)
				2336	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2337	}
				2338	}
				2339	nbchar = 0;
				2340	}
				2341	NEXTL(l);
				2342	cur = CUR_CHAR(l);
				2343	}
				2344	if (nbchar != 0) {
				2345	/*
				2346	* Ok the segment is to be consumed as chars.
				2347	*/
				2348	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2349	if (areBlanks(ctxt, buf, nbchar)) {
				2350	if (ctxt->sax->ignorableWhitespace != NULL)
				2351	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2352	} else {
				2353	htmlCheckParagraph(ctxt);
				2354	if (ctxt->sax->characters != NULL)
				2355	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2356	}
				2357	}
				2358	}
				2359	}
				2360
				2361	/**
				2362	* htmlParseExternalID:
				2363	* @ctxt: an HTML parser context
				2364	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2365	*
				2366	* Parse an External ID or a Public ID
				2367	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2368	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2369	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2370	*
				2371	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2372	*
				2373	* Returns the SystemLiteral. In the PUBLIC case publicID receives the
				2374	* PubidLiteral, and since the SystemLiteral is then optional it is
				2375	* possible to return NULL and still have publicID set.
				2376	*/
				2377
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2378	static xmlChar *
				2379	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2380	xmlChar *URI = NULL;
				2381
				2382	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2383	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2384	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2385	SKIP(6);
				2386	if (!IS_BLANK(CUR)) {
				2387	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2388	ctxt->sax->error(ctxt->userData,
				2389	"Space required after 'SYSTEM'\n");
				2390	ctxt->wellFormed = 0;
				2391	}
				2392	SKIP_BLANKS;
				2393	URI = htmlParseSystemLiteral(ctxt);
				2394	if (URI == NULL) {
				2395	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2396	ctxt->sax->error(ctxt->userData,
				2397	"htmlParseExternalID: SYSTEM, no URI\n");
				2398	ctxt->wellFormed = 0;
				2399	}
				2400	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2401	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2402	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2403	SKIP(6);
				2404	if (!IS_BLANK(CUR)) {
				2405	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2406	ctxt->sax->error(ctxt->userData,
				2407	"Space required after 'PUBLIC'\n");
				2408	ctxt->wellFormed = 0;
				2409	}
				2410	SKIP_BLANKS;
				2411	*publicID = htmlParsePubidLiteral(ctxt);
				2412	if (*publicID == NULL) {
				2413	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2414	ctxt->sax->error(ctxt->userData,
				2415	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2416	ctxt->wellFormed = 0;
				2417	}
				2418	SKIP_BLANKS;
				2419	if ((CUR == '"') \|\| (CUR == '\'')) {
				2420	URI = htmlParseSystemLiteral(ctxt);
				2421	}
				2422	}
				2423	return(URI);
				2424	}
				2425
				2426	/**
				2427	* htmlParseComment:
				2428	* @ctxt: an HTML parser context
				2429	*
				2430	* Parse an XML (SGML) comment <!-- .... -->
				2431	*
				2432	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2433	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2434	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2435	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2436	xmlChar *buf = NULL;
				2437	int len;
				2438	int size = HTML_PARSER_BUFFER_SIZE;
				2439	int q, ql;
				2440	int r, rl;
				2441	int cur, l;
				2442	xmlParserInputState state;
				2443
				2444	/*
				2445	* Check that there is a comment right here.
				2446	*/
				2447	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2448	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2449
				2450	state = ctxt->instate;
				2451	ctxt->instate = XML_PARSER_COMMENT;
				2452	SHRINK;
				2453	SKIP(4);
				2454	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2455	if (buf == NULL) {
				2456	xmlGenericError(xmlGenericErrorContext,
				2457	"malloc of %d byte failed\n", size);
				2458	ctxt->instate = state;
				2459	return;
				2460	}
				2461	q = CUR_CHAR(ql);
				2462	NEXTL(ql);
				2463	r = CUR_CHAR(rl);
				2464	NEXTL(rl);
				2465	cur = CUR_CHAR(l);
				2466	len = 0;
				2467	while (IS_CHAR(cur) &&
				2468	((cur != '>') \|\|
				2469	(r != '-') \|\| (q != '-'))) {
				2470	if (len + 5 >= size) {
				2471	size *= 2;
				2472	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2473	if (buf == NULL) {
				2474	xmlGenericError(xmlGenericErrorContext,
				2475	"realloc of %d byte failed\n", size);
				2476	ctxt->instate = state;
				2477	return;
				2478	}
				2479	}
				2480	COPY_BUF(ql,buf,len,q);
				2481	q = r;
				2482	ql = rl;
				2483	r = cur;
				2484	rl = l;
				2485	NEXTL(l);
				2486	cur = CUR_CHAR(l);
				2487	if (cur == 0) {
				2488	SHRINK;
				2489	GROW;
				2490	cur = CUR_CHAR(l);
				2491	}
				2492	}
				2493	buf[len] = 0;
				2494	if (!IS_CHAR(cur)) {
				2495	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2496	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2497	ctxt->sax->error(ctxt->userData,
				2498	"Comment not terminated \n<!--%.50s\n", buf);
				2499	ctxt->wellFormed = 0;
				2500	xmlFree(buf);
				2501	} else {
				2502	NEXT;
				2503	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2504	(!ctxt->disableSAX))
				2505	ctxt->sax->comment(ctxt->userData, buf);
				2506	xmlFree(buf);
				2507	}
				2508	ctxt->instate = state;
				2509	}
				2510
				2511	/**
				2512	* htmlParseCharRef:
				2513	* @ctxt: an HTML parser context
				2514	*
				2515	* parse Reference declarations
				2516	*
				2517	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2518	* '&#x' [0-9a-fA-F]+ ';'
				2519	*
				2520	* Returns the value parsed (as an int)
				2521	*/
				2522	int
				2523	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2524	int val = 0;
				2525
				2526	if ((CUR == '&') && (NXT(1) == '#') &&
				2527	(NXT(2) == 'x')) {
				2528	SKIP(3);
				2529	while (CUR != ';') {
				2530	if ((CUR >= '0') && (CUR <= '9'))
				2531	val = val * 16 + (CUR - '0');
				2532	else if ((CUR >= 'a') && (CUR <= 'f'))
				2533	val = val * 16 + (CUR - 'a') + 10;
				2534	else if ((CUR >= 'A') && (CUR <= 'F'))
				2535	val = val * 16 + (CUR - 'A') + 10;
				2536	else {
				2537	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2538	ctxt->sax->error(ctxt->userData,
				2539	"htmlParseCharRef: invalid hexadecimal value\n");
				2540	ctxt->wellFormed = 0;
				2541	return(0);
				2542	}
				2543	NEXT;
				2544	}
				2545	if (CUR == ';')
				2546	NEXT;
				2547	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2548	SKIP(2);
				2549	while (CUR != ';') {
				2550	if ((CUR >= '0') && (CUR <= '9'))
				2551	val = val * 10 + (CUR - '0');
				2552	else {
				2553	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2554	ctxt->sax->error(ctxt->userData,
				2555	"htmlParseCharRef: invalid decimal value\n");
				2556	ctxt->wellFormed = 0;
				2557	return(0);
				2558	}
				2559	NEXT;
				2560	}
				2561	if (CUR == ';')
				2562	NEXT;
				2563	} else {
				2564	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2565	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2566	ctxt->wellFormed = 0;
				2567	}
				2568	/*
				2569	* Check the value IS_CHAR ...
				2570	*/
				2571	if (IS_CHAR(val)) {
				2572	return(val);
				2573	} else {
				2574	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2575	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2576	val);
				2577	ctxt->wellFormed = 0;
				2578	}
				2579	return(0);
				2580	}
				2581
				2582
				2583	/**
				2584	* htmlParseDocTypeDecl :
				2585	* @ctxt: an HTML parser context
				2586	*
				2587	* parse a DOCTYPE declaration
				2588	*
				2589	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2590	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2591	*/
				2592
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2593	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2594	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2595	xmlChar *name;
				2596	xmlChar *ExternalID = NULL;
				2597	xmlChar *URI = NULL;
				2598
				2599	/*
				2600	* We know that '<!DOCTYPE' has been detected.
				2601	*/
				2602	SKIP(9);
				2603
				2604	SKIP_BLANKS;
				2605
				2606	/*
				2607	* Parse the DOCTYPE name.
				2608	*/
				2609	name = htmlParseName(ctxt);
				2610	if (name == NULL) {
				2611	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2612	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2613	ctxt->wellFormed = 0;
				2614	}
				2615	/*
				2616	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2617	*/
				2618
				2619	SKIP_BLANKS;
				2620
				2621	/*
				2622	* Check for SystemID and ExternalID
				2623	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2624	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2625	SKIP_BLANKS;
				2626
				2627	/*
				2628	* We should be at the end of the DOCTYPE declaration.
				2629	*/
				2630	if (CUR != '>') {
				2631	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2632	ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
				2633	ctxt->wellFormed = 0;
				2634	/* We shouldn't try to resynchronize ... */
				2635	}
				2636	NEXT;
				2637
				2638	/*
				2639	* Create or update the document accordingly to the DOCTYPE
				2640	*/
				2641	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2642	(!ctxt->disableSAX))
				2643	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2644
				2645	/*
				2646	* Cleanup, since we don't use all those identifiers
				2647	*/
				2648	if (URI != NULL) xmlFree(URI);
				2649	if (ExternalID != NULL) xmlFree(ExternalID);
				2650	if (name != NULL) xmlFree(name);
				2651	}
				2652
				2653	/**
				2654	* htmlParseAttribute:
				2655	* @ctxt: an HTML parser context
				2656	* @value: a xmlChar ** used to store the value of the attribute
				2657	*
				2658	* parse an attribute
				2659	*
				2660	* [41] Attribute ::= Name Eq AttValue
				2661	*
				2662	* [25] Eq ::= S? '=' S?
				2663	*
				2664	* With namespace:
				2665	*
				2666	* [NS 11] Attribute ::= QName Eq AttValue
				2667	*
				2668	* Also the case QName == xmlns:??? is handled independently as a namespace
				2669	* definition.
				2670	*
				2671	* Returns the attribute name, and the value in *value.
				2672	*/
				2673
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2674	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2675	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2676	xmlChar name, val = NULL;
				2677
				2678	*value = NULL;
				2679	name = htmlParseHTMLName(ctxt);
				2680	if (name == NULL) {
				2681	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2682	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2683	ctxt->wellFormed = 0;
				2684	return(NULL);
				2685	}
				2686
				2687	/*
				2688	* read the value
				2689	*/
				2690	SKIP_BLANKS;
				2691	if (CUR == '=') {
				2692	NEXT;
				2693	SKIP_BLANKS;
				2694	val = htmlParseAttValue(ctxt);
				2695	/******
				2696	} else {
				2697	* TODO : some attribute must have values, some may not
				2698	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2699	ctxt->sax->warning(ctxt->userData,
				2700	"No value for attribute %s\n", name); */
				2701	}
				2702
				2703	*value = val;
				2704	return(name);
				2705	}
				2706
				2707	/**
				2708	* htmlCheckEncoding:
				2709	* @ctxt: an HTML parser context
				2710	* @attvalue: the attribute value
				2711	*
				2712	* Checks an http-equiv attribute from a Meta tag to detect
				2713	* the encoding
				2714	* If a new encoding is detected the parser is switched to decode
				2715	* it and pass UTF8
				2716	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2717	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2718	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2719	const xmlChar *encoding;
				2720
				2721	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2722	return;
				2723
				2724	/* do not change encoding */
				2725	if (ctxt->input->encoding != NULL)
				2726	return;
				2727
				2728	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2729	if (encoding != NULL) {
				2730	encoding += 8;
				2731	} else {
				2732	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2733	if (encoding != NULL)
				2734	encoding += 9;
				2735	}
				2736	if (encoding != NULL) {
				2737	xmlCharEncoding enc;
				2738	xmlCharEncodingHandlerPtr handler;
				2739
				2740	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2741
				2742	if (ctxt->input->encoding != NULL)
				2743	xmlFree((xmlChar *) ctxt->input->encoding);
				2744	ctxt->input->encoding = xmlStrdup(encoding);
				2745
				2746	enc = xmlParseCharEncoding((const char *) encoding);
				2747	/*
				2748	* registered set of known encodings
				2749	*/
				2750	if (enc != XML_CHAR_ENCODING_ERROR) {
				2751	xmlSwitchEncoding(ctxt, enc);
				2752	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2753	} else {
				2754	/*
				2755	* fallback for unknown encodings
				2756	*/
				2757	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2758	if (handler != NULL) {
				2759	xmlSwitchToEncoding(ctxt, handler);
				2760	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2761	} else {
				2762	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2763	}
				2764	}
				2765
				2766	if ((ctxt->input->buf != NULL) &&
				2767	(ctxt->input->buf->encoder != NULL) &&
				2768	(ctxt->input->buf->raw != NULL) &&
				2769	(ctxt->input->buf->buffer != NULL)) {
				2770	int nbchars;
				2771	int processed;
				2772
				2773	/*
				2774	* convert as much as possible to the parser reading buffer.
				2775	*/
				2776	processed = ctxt->input->cur - ctxt->input->base;
				2777	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2778	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2779	ctxt->input->buf->buffer,
				2780	ctxt->input->buf->raw);
				2781	if (nbchars < 0) {
				2782	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2783	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2784	ctxt->sax->error(ctxt->userData,
				2785	"htmlCheckEncoding: encoder error\n");
				2786	}
				2787	ctxt->input->base =
				2788	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2789	}
				2790	}
				2791	}
				2792
				2793	/**
				2794	* htmlCheckMeta:
				2795	* @ctxt: an HTML parser context
				2796	* @atts: the attributes values
				2797	*
				2798	* Checks the attributes of a Meta tag
				2799	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2800	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2801	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2802	int i;
				2803	const xmlChar att, value;
				2804	int http = 0;
				2805	const xmlChar *content = NULL;
				2806
				2807	if ((ctxt == NULL) \|\| (atts == NULL))
				2808	return;
				2809
				2810	i = 0;
				2811	att = atts[i++];
				2812	while (att != NULL) {
				2813	value = atts[i++];
				2814	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2815	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2816	http = 1;
				2817	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2818	content = value;
				2819	att = atts[i++];
				2820	}
				2821	if ((http) && (content != NULL))
				2822	htmlCheckEncoding(ctxt, content);
				2823
				2824	}
				2825
				2826	/**
				2827	* htmlParseStartTag:
				2828	* @ctxt: an HTML parser context
				2829	*
				2830	* parse a start of tag either for rule element or
				2831	* EmptyElement. In both case we don't parse the tag closing chars.
				2832	*
				2833	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2834	*
				2835	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2836	*
				2837	* With namespace:
				2838	*
				2839	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2840	*
				2841	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2842	*
				2843	*/
				2844
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2845	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2846	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2847	xmlChar *name;
				2848	xmlChar *attname;
				2849	xmlChar *attvalue;
				2850	const xmlChar **atts = NULL;
				2851	int nbatts = 0;
				2852	int maxatts = 0;
				2853	int meta = 0;
				2854	int i;
				2855
				2856	if (CUR != '<') return;
				2857	NEXT;
				2858
				2859	GROW;
				2860	name = htmlParseHTMLName(ctxt);
				2861	if (name == NULL) {
				2862	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2863	ctxt->sax->error(ctxt->userData,
				2864	"htmlParseStartTag: invalid element name\n");
				2865	ctxt->wellFormed = 0;
				2866	/* Dump the bogus tag like browsers do */
				2867	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2868	NEXT;
				2869	return;
				2870	}
				2871	if (xmlStrEqual(name, BAD_CAST"meta"))
				2872	meta = 1;
				2873
				2874	/*
				2875	* Check for auto-closure of HTML elements.
				2876	*/
				2877	htmlAutoClose(ctxt, name);
				2878
				2879	/*
				2880	* Check for implied HTML elements.
				2881	*/
				2882	htmlCheckImplied(ctxt, name);
				2883
				2884	/*
				2885	* Avoid html at any level > 0, head at any level != 1
				2886	* or any attempt to recurse body
				2887	*/
				2888	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2889	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2890	ctxt->sax->error(ctxt->userData,
				2891	"htmlParseStartTag: misplaced <html> tag\n");
				2892	ctxt->wellFormed = 0;
				2893	xmlFree(name);
				2894	return;
				2895	}
				2896	if ((ctxt->nameNr != 1) &&
				2897	(xmlStrEqual(name, BAD_CAST"head"))) {
				2898	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2899	ctxt->sax->error(ctxt->userData,
				2900	"htmlParseStartTag: misplaced <head> tag\n");
				2901	ctxt->wellFormed = 0;
				2902	xmlFree(name);
				2903	return;
				2904	}
				2905	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2906	int indx;
				2907	for (indx = 0;indx < ctxt->nameNr;indx++) {
				2908	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2909	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2910	ctxt->sax->error(ctxt->userData,
				2911	"htmlParseStartTag: misplaced <body> tag\n");
				2912	ctxt->wellFormed = 0;
				2913	xmlFree(name);
				2914	return;
				2915	}
				2916	}
				2917	}
				2918
				2919	/*
				2920	* Now parse the attributes, it ends up with the ending
				2921	*
				2922	* (S Attribute)* S?
				2923	*/
				2924	SKIP_BLANKS;
				2925	while ((IS_CHAR(CUR)) &&
				2926	(CUR != '>') &&
				2927	((CUR != '/') \|\| (NXT(1) != '>'))) {
				2928	long cons = ctxt->nbChars;
				2929
				2930	GROW;
				2931	attname = htmlParseAttribute(ctxt, &attvalue);
				2932	if (attname != NULL) {
				2933
				2934	/*
				2935	* Well formedness requires at most one declaration of an attribute
				2936	*/
				2937	for (i = 0; i < nbatts;i += 2) {
				2938	if (xmlStrEqual(atts[i], attname)) {
				2939	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2940	ctxt->sax->error(ctxt->userData,
				2941	"Attribute %s redefined\n",
				2942	attname);
				2943	ctxt->wellFormed = 0;
				2944	xmlFree(attname);
				2945	if (attvalue != NULL)
				2946	xmlFree(attvalue);
				2947	goto failed;
				2948	}
				2949	}
				2950
				2951	/*
				2952	* Add the pair to atts
				2953	*/
				2954	if (atts == NULL) {
				2955	maxatts = 10;
				2956	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				2957	if (atts == NULL) {
				2958	xmlGenericError(xmlGenericErrorContext,
				2959	"malloc of %ld byte failed\n",
				2960	maxatts * (long)sizeof(xmlChar *));
				2961	if (name != NULL) xmlFree(name);
				2962	return;
				2963	}
				2964	} else if (nbatts + 4 > maxatts) {
				2965	maxatts *= 2;
				2966	atts = (const xmlChar *) xmlRealloc((void ) atts,
				2967	maxatts * sizeof(xmlChar *));
				2968	if (atts == NULL) {
				2969	xmlGenericError(xmlGenericErrorContext,
				2970	"realloc of %ld byte failed\n",
				2971	maxatts * (long)sizeof(xmlChar *));
				2972	if (name != NULL) xmlFree(name);
				2973	return;
				2974	}
				2975	}
				2976	atts[nbatts++] = attname;
				2977	atts[nbatts++] = attvalue;
				2978	atts[nbatts] = NULL;
				2979	atts[nbatts + 1] = NULL;
				2980	}
				2981	else {
				2982	/* Dump the bogus attribute string up to the next blank or
				2983	* the end of the tag. */
				2984	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				2985	&& ((CUR != '/') \|\| (NXT(1) != '>')))
				2986	NEXT;
				2987	}
				2988
				2989	failed:
				2990	SKIP_BLANKS;
				2991	if (cons == ctxt->nbChars) {
				2992	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2993	ctxt->sax->error(ctxt->userData,
				2994	"htmlParseStartTag: problem parsing attributes\n");
				2995	ctxt->wellFormed = 0;
				2996	break;
				2997	}
				2998	}
				2999
				3000	/*
				3001	* Handle specific association to the META tag
				3002	*/
				3003	if (meta)
				3004	htmlCheckMeta(ctxt, atts);
				3005
				3006	/*
				3007	* SAX: Start of Element !
				3008	*/
				3009	htmlnamePush(ctxt, xmlStrdup(name));
				3010	#ifdef DEBUG
				3011	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3012	#endif
				3013	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3014	ctxt->sax->startElement(ctxt->userData, name, atts);
				3015
				3016	if (atts != NULL) {
				3017	for (i = 0;i < nbatts;i++) {
				3018	if (atts[i] != NULL)
				3019	xmlFree((xmlChar *) atts[i]);
				3020	}
				3021	xmlFree((void *) atts);
				3022	}
				3023	if (name != NULL) xmlFree(name);
				3024	}
				3025
				3026	/**
				3027	* htmlParseEndTag:
				3028	* @ctxt: an HTML parser context
				3029	*
				3030	* parse an end of tag
				3031	*
				3032	* [42] ETag ::= '</' Name S? '>'
				3033	*
				3034	* With namespace
				3035	*
				3036	* [NS 9] ETag ::= '</' QName S? '>'
				3037	*/
				3038
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3039	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3040	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3041	xmlChar *name;
				3042	xmlChar *oldname;
				3043	int i;
				3044
				3045	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3046	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3047	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3048	ctxt->wellFormed = 0;
				3049	return;
				3050	}
				3051	SKIP(2);
				3052
				3053	name = htmlParseHTMLName(ctxt);
				3054	if (name == NULL) return;
				3055
				3056	/*
				3057	* We should definitely be at the ending "S? '>'" part
				3058	*/
				3059	SKIP_BLANKS;
				3060	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3061	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3062	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3063	ctxt->wellFormed = 0;
				3064	} else
				3065	NEXT;
				3066
				3067	/*
				3068	* If the name read is not one of the element in the parsing stack
				3069	* then return, it's just an error.
				3070	*/
				3071	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3072	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3073	}
				3074	if (i < 0) {
				3075	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3076	ctxt->sax->error(ctxt->userData,
				3077	"Unexpected end tag : %s\n", name);
				3078	xmlFree(name);
				3079	ctxt->wellFormed = 0;
				3080	return;
				3081	}
				3082
				3083
				3084	/*
				3085	* Check for auto-closure of HTML elements.
				3086	*/
				3087
				3088	htmlAutoCloseOnClose(ctxt, name);
				3089
				3090	/*
				3091	* Well formedness constraints, opening and closing must match.
				3092	* With the exception that the autoclose may have popped stuff out
				3093	* of the stack.
				3094	*/
				3095	if (!xmlStrEqual(name, ctxt->name)) {
				3096	#ifdef DEBUG
				3097	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3098	#endif
				3099	if ((ctxt->name != NULL) &&
				3100	(!xmlStrEqual(ctxt->name, name))) {
				3101	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3102	ctxt->sax->error(ctxt->userData,
				3103	"Opening and ending tag mismatch: %s and %s\n",
				3104	name, ctxt->name);
				3105	ctxt->wellFormed = 0;
				3106	}
				3107	}
				3108
				3109	/*
				3110	* SAX: End of Tag
				3111	*/
				3112	oldname = ctxt->name;
				3113	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3114	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3115	ctxt->sax->endElement(ctxt->userData, name);
				3116	oldname = htmlnamePop(ctxt);
				3117	if (oldname != NULL) {
				3118	#ifdef DEBUG
				3119	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3120	#endif
				3121	xmlFree(oldname);
				3122	#ifdef DEBUG
				3123	} else {
				3124	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3125	#endif
				3126	}
				3127	}
				3128
				3129	if (name != NULL)
				3130	xmlFree(name);
				3131
				3132	return;
				3133	}
				3134
				3135
				3136	/**
				3137	* htmlParseReference:
				3138	* @ctxt: an HTML parser context
				3139	*
				3140	* parse and handle entity references in content,
				3141	* this will end-up in a call to character() since this is either a
				3142	* CharRef, or a predefined entity.
				3143	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3144	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3145	htmlParseReference(htmlParserCtxtPtr ctxt) {
				3146	htmlEntityDescPtr ent;
				3147	xmlChar out[6];
				3148	xmlChar *name;
				3149	if (CUR != '&') return;
				3150
				3151	if (NXT(1) == '#') {
				3152	unsigned int c;
				3153	int bits, i = 0;
				3154
				3155	c = htmlParseCharRef(ctxt);
				3156	if (c == 0)
				3157	return;
				3158
				3159	if (c < 0x80) { out[i++]= c; bits= -6; }
				3160	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3161	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3162	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3163
				3164	for ( ; bits >= 0; bits-= 6) {
				3165	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3166	}
				3167	out[i] = 0;
				3168
				3169	htmlCheckParagraph(ctxt);
				3170	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3171	ctxt->sax->characters(ctxt->userData, out, i);
				3172	} else {
				3173	ent = htmlParseEntityRef(ctxt, &name);
				3174	if (name == NULL) {
				3175	htmlCheckParagraph(ctxt);
				3176	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3177	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3178	return;
				3179	}
				3180	if ((ent == NULL) \|\| (ent->value <= 0)) {
				3181	htmlCheckParagraph(ctxt);
				3182	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3183	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3184	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3185	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3186	}
				3187	} else {
				3188	unsigned int c;
				3189	int bits, i = 0;
				3190
				3191	c = ent->value;
				3192	if (c < 0x80)
				3193	{ out[i++]= c; bits= -6; }
				3194	else if (c < 0x800)
				3195	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3196	else if (c < 0x10000)
				3197	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3198	else
				3199	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3200
				3201	for ( ; bits >= 0; bits-= 6) {
				3202	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3203	}
				3204	out[i] = 0;
				3205
				3206	htmlCheckParagraph(ctxt);
				3207	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3208	ctxt->sax->characters(ctxt->userData, out, i);
				3209	}
				3210	xmlFree(name);
				3211	}
				3212	}
				3213
				3214	/**
				3215	* htmlParseContent:
				3216	* @ctxt: an HTML parser context
				3217	* @name: the node name
				3218	*
				3219	* Parse a content: comment, sub-element, reference or text.
				3220	*
				3221	*/
				3222
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3223	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3224	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3225	xmlChar *currentNode;
				3226	int depth;
				3227
				3228	currentNode = xmlStrdup(ctxt->name);
				3229	depth = ctxt->nameNr;
				3230	while (1) {
				3231	long cons = ctxt->nbChars;
				3232
				3233	GROW;
				3234	/*
				3235	* Our tag or one of it's parent or children is ending.
				3236	*/
				3237	if ((CUR == '<') && (NXT(1) == '/')) {
				3238	htmlParseEndTag(ctxt);
				3239	if (currentNode != NULL) xmlFree(currentNode);
				3240	return;
				3241	}
				3242
				3243	/*
				3244	* Has this node been popped out during parsing of
				3245	* the next element
				3246	*/
				3247	if ((!xmlStrEqual(currentNode, ctxt->name)) &&
				3248	(depth >= ctxt->nameNr)) {
				3249	if (currentNode != NULL) xmlFree(currentNode);
				3250	return;
				3251	}
				3252
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3253	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3254	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3255	/*
				3256	* Handle SCRIPT/STYLE separately
				3257	*/
				3258	htmlParseScript(ctxt);
				3259	} else {
				3260	/*
				3261	* Sometimes DOCTYPE arrives in the middle of the document
				3262	*/
				3263	if ((CUR == '<') && (NXT(1) == '!') &&
				3264	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3265	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3266	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3267	(UPP(8) == 'E')) {
				3268	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3269	ctxt->sax->error(ctxt->userData,
				3270	"Misplaced DOCTYPE declaration\n");
				3271	ctxt->wellFormed = 0;
				3272	htmlParseDocTypeDecl(ctxt);
				3273	}
				3274
				3275	/*
				3276	* First case : a comment
				3277	*/
				3278	if ((CUR == '<') && (NXT(1) == '!') &&
				3279	(NXT(2) == '-') && (NXT(3) == '-')) {
				3280	htmlParseComment(ctxt);
				3281	}
				3282
				3283	/*
				3284	* Second case : a sub-element.
				3285	*/
				3286	else if (CUR == '<') {
				3287	htmlParseElement(ctxt);
				3288	}
				3289
				3290	/*
				3291	* Third case : a reference. If if has not been resolved,
				3292	* parsing returns it's Name, create the node
				3293	*/
				3294	else if (CUR == '&') {
				3295	htmlParseReference(ctxt);
				3296	}
				3297
				3298	/*
				3299	* Fourth : end of the resource
				3300	*/
				3301	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3302	htmlAutoCloseOnEnd(ctxt);
				3303	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3304	}
				3305
				3306	/*
				3307	* Last case, text. Note that References are handled directly.
				3308	*/
				3309	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3310	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3311	}
				3312
				3313	if (cons == ctxt->nbChars) {
				3314	if (ctxt->node != NULL) {
				3315	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3316	ctxt->sax->error(ctxt->userData,
				3317	"detected an error in element content\n");
				3318	ctxt->wellFormed = 0;
				3319	}
				3320	break;
				3321	}
				3322	}
				3323	GROW;
				3324	}
				3325	if (currentNode != NULL) xmlFree(currentNode);
				3326	}
				3327
				3328	/**
				3329	* htmlParseElement:
				3330	* @ctxt: an HTML parser context
				3331	*
				3332	* parse an HTML element, this is highly recursive
				3333	*
				3334	* [39] element ::= EmptyElemTag \| STag content ETag
				3335	*
				3336	* [41] Attribute ::= Name Eq AttValue
				3337	*/
				3338
				3339	void
				3340	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3341	xmlChar *name;
				3342	xmlChar *currentNode = NULL;
				3343	htmlElemDescPtr info;
				3344	htmlParserNodeInfo node_info;
				3345	xmlChar *oldname;
				3346	int depth = ctxt->nameNr;
				3347
				3348	/* Capture start position */
				3349	if (ctxt->record_info) {
				3350	node_info.begin_pos = ctxt->input->consumed +
				3351	(CUR_PTR - ctxt->input->base);
				3352	node_info.begin_line = ctxt->input->line;
				3353	}
				3354
				3355	oldname = xmlStrdup(ctxt->name);
				3356	htmlParseStartTag(ctxt);
				3357	name = ctxt->name;
				3358	#ifdef DEBUG
				3359	if (oldname == NULL)
				3360	xmlGenericError(xmlGenericErrorContext,
				3361	"Start of element %s\n", name);
				3362	else if (name == NULL)
				3363	xmlGenericError(xmlGenericErrorContext,
				3364	"Start of element failed, was %s\n", oldname);
				3365	else
				3366	xmlGenericError(xmlGenericErrorContext,
				3367	"Start of element %s, was %s\n", name, oldname);
				3368	#endif
				3369	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3370	(name == NULL)) {
				3371	if (CUR == '>')
				3372	NEXT;
				3373	if (oldname != NULL)
				3374	xmlFree(oldname);
				3375	return;
				3376	}
				3377	if (oldname != NULL)
				3378	xmlFree(oldname);
				3379
				3380	/*
				3381	* Lookup the info for that element.
				3382	*/
				3383	info = htmlTagLookup(name);
				3384	if (info == NULL) {
				3385	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3386	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3387	name);
				3388	ctxt->wellFormed = 0;
				3389	} else if (info->depr) {
				3390	/***************************
				3391	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3392	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3393	name);
				3394	***************************/
				3395	}
				3396
				3397	/*
				3398	* Check for an Empty Element labelled the XML/SGML way
				3399	*/
				3400	if ((CUR == '/') && (NXT(1) == '>')) {
				3401	SKIP(2);
				3402	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3403	ctxt->sax->endElement(ctxt->userData, name);
				3404	oldname = htmlnamePop(ctxt);
				3405	#ifdef DEBUG
				3406	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3407	#endif
				3408	if (oldname != NULL)
				3409	xmlFree(oldname);
				3410	return;
				3411	}
				3412
				3413	if (CUR == '>') {
				3414	NEXT;
				3415	} else {
				3416	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3417	ctxt->sax->error(ctxt->userData,
				3418	"Couldn't find end of Start Tag %s\n",
				3419	name);
				3420	ctxt->wellFormed = 0;
				3421
				3422	/*
				3423	* end of parsing of this node.
				3424	*/
				3425	if (xmlStrEqual(name, ctxt->name)) {
				3426	nodePop(ctxt);
				3427	oldname = htmlnamePop(ctxt);
				3428	#ifdef DEBUG
				3429	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3430	#endif
				3431	if (oldname != NULL)
				3432	xmlFree(oldname);
				3433	}
				3434
				3435	/*
				3436	* Capture end position and add node
				3437	*/
				3438	if ( currentNode != NULL && ctxt->record_info ) {
				3439	node_info.end_pos = ctxt->input->consumed +
				3440	(CUR_PTR - ctxt->input->base);
				3441	node_info.end_line = ctxt->input->line;
				3442	node_info.node = ctxt->node;
				3443	xmlParserAddNodeInfo(ctxt, &node_info);
				3444	}
				3445	return;
				3446	}
				3447
				3448	/*
				3449	* Check for an Empty Element from DTD definition
				3450	*/
				3451	if ((info != NULL) && (info->empty)) {
				3452	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3453	ctxt->sax->endElement(ctxt->userData, name);
				3454	oldname = htmlnamePop(ctxt);
				3455	#ifdef DEBUG
				3456	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3457	#endif
				3458	if (oldname != NULL)
				3459	xmlFree(oldname);
				3460	return;
				3461	}
				3462
				3463	/*
				3464	* Parse the content of the element:
				3465	*/
				3466	currentNode = xmlStrdup(ctxt->name);
				3467	depth = ctxt->nameNr;
				3468	while (IS_CHAR(CUR)) {
				3469	htmlParseContent(ctxt);
				3470	if (ctxt->nameNr < depth) break;
				3471	}
				3472
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3473	/*
				3474	* Capture end position and add node
				3475	*/
				3476	if ( currentNode != NULL && ctxt->record_info ) {
				3477	node_info.end_pos = ctxt->input->consumed +
				3478	(CUR_PTR - ctxt->input->base);
				3479	node_info.end_line = ctxt->input->line;
				3480	node_info.node = ctxt->node;
				3481	xmlParserAddNodeInfo(ctxt, &node_info);
				3482	}
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3483	if (!IS_CHAR(CUR)) {
				3484	htmlAutoCloseOnEnd(ctxt);
				3485	}
				3486
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3487	if (currentNode != NULL)
				3488	xmlFree(currentNode);
				3489	}
				3490
				3491	/**
				3492	* htmlParseDocument :
				3493	* @ctxt: an HTML parser context
				3494	*
				3495	* parse an HTML document (and build a tree if using the standard SAX
				3496	* interface).
				3497	*
				3498	* Returns 0, -1 in case of error. the parser context is augmented
				3499	* as a result of the parsing.
				3500	*/
				3501
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3502	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3503	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3504	xmlDtdPtr dtd;
				3505
				3506	htmlDefaultSAXHandlerInit();
				3507	ctxt->html = 1;
				3508
				3509	GROW;
				3510	/*
				3511	* SAX: beginning of the document processing.
				3512	*/
				3513	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3514	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3515
				3516	/*
				3517	* Wipe out everything which is before the first '<'
				3518	*/
				3519	SKIP_BLANKS;
				3520	if (CUR == 0) {
				3521	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3522	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3523	ctxt->wellFormed = 0;
				3524	}
				3525
				3526	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3527	ctxt->sax->startDocument(ctxt->userData);
				3528
				3529
				3530	/*
				3531	* Parse possible comments before any content
				3532	*/
				3533	while ((CUR == '<') && (NXT(1) == '!') &&
				3534	(NXT(2) == '-') && (NXT(3) == '-')) {
				3535	htmlParseComment(ctxt);
				3536	SKIP_BLANKS;
				3537	}
				3538
				3539
				3540	/*
				3541	* Then possibly doc type declaration(s) and more Misc
				3542	* (doctypedecl Misc*)?
				3543	*/
				3544	if ((CUR == '<') && (NXT(1) == '!') &&
				3545	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3546	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3547	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3548	(UPP(8) == 'E')) {
				3549	htmlParseDocTypeDecl(ctxt);
				3550	}
				3551	SKIP_BLANKS;
				3552
				3553	/*
				3554	* Parse possible comments before any content
				3555	*/
				3556	while ((CUR == '<') && (NXT(1) == '!') &&
				3557	(NXT(2) == '-') && (NXT(3) == '-')) {
				3558	htmlParseComment(ctxt);
				3559	SKIP_BLANKS;
				3560	}
				3561
				3562	/*
				3563	* Time to start parsing the tree itself
				3564	*/
				3565	htmlParseContent(ctxt);
				3566
				3567	/*
				3568	* autoclose
				3569	*/
				3570	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3571	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3572
				3573
				3574	/*
				3575	* SAX: end of the document processing.
				3576	*/
				3577	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3578	ctxt->sax->endDocument(ctxt->userData);
				3579
				3580	if (ctxt->myDoc != NULL) {
				3581	dtd = xmlGetIntSubset(ctxt->myDoc);
				3582	if (dtd == NULL)
				3583	ctxt->myDoc->intSubset =
				3584	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3585	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3586	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3587	}
				3588	if (! ctxt->wellFormed) return(-1);
				3589	return(0);
				3590	}
				3591
				3592
				3593	/************************************************************************
				3594	* *
				3595	* Parser contexts handling *
				3596	* *
				3597	************************************************************************/
				3598
				3599	/**
				3600	* xmlInitParserCtxt:
				3601	* @ctxt: an HTML parser context
				3602	*
				3603	* Initialize a parser context
				3604	*/
				3605
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3606	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3607	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3608	{
				3609	htmlSAXHandler *sax;
				3610
				3611	if (ctxt == NULL) return;
				3612	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3613
				3614	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3615	if (sax == NULL) {
				3616	xmlGenericError(xmlGenericErrorContext,
				3617	"htmlInitParserCtxt: out of memory\n");
				3618	}
				3619	else
				3620	memset(sax, 0, sizeof(htmlSAXHandler));
				3621
				3622	/* Allocate the Input stack */
				3623	ctxt->inputTab = (htmlParserInputPtr *)
				3624	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3625	if (ctxt->inputTab == NULL) {
				3626	xmlGenericError(xmlGenericErrorContext,
				3627	"htmlInitParserCtxt: out of memory\n");
				3628	ctxt->inputNr = 0;
				3629	ctxt->inputMax = 0;
				3630	ctxt->input = NULL;
				3631	return;
				3632	}
				3633	ctxt->inputNr = 0;
				3634	ctxt->inputMax = 5;
				3635	ctxt->input = NULL;
				3636	ctxt->version = NULL;
				3637	ctxt->encoding = NULL;
				3638	ctxt->standalone = -1;
				3639	ctxt->instate = XML_PARSER_START;
				3640
				3641	/* Allocate the Node stack */
				3642	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3643	if (ctxt->nodeTab == NULL) {
				3644	xmlGenericError(xmlGenericErrorContext,
				3645	"htmlInitParserCtxt: out of memory\n");
				3646	ctxt->nodeNr = 0;
				3647	ctxt->nodeMax = 0;
				3648	ctxt->node = NULL;
				3649	ctxt->inputNr = 0;
				3650	ctxt->inputMax = 0;
				3651	ctxt->input = NULL;
				3652	return;
				3653	}
				3654	ctxt->nodeNr = 0;
				3655	ctxt->nodeMax = 10;
				3656	ctxt->node = NULL;
				3657
				3658	/* Allocate the Name stack */
				3659	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3660	if (ctxt->nameTab == NULL) {
				3661	xmlGenericError(xmlGenericErrorContext,
				3662	"htmlInitParserCtxt: out of memory\n");
				3663	ctxt->nameNr = 0;
				3664	ctxt->nameMax = 10;
				3665	ctxt->name = NULL;
				3666	ctxt->nodeNr = 0;
				3667	ctxt->nodeMax = 0;
				3668	ctxt->node = NULL;
				3669	ctxt->inputNr = 0;
				3670	ctxt->inputMax = 0;
				3671	ctxt->input = NULL;
				3672	return;
				3673	}
				3674	ctxt->nameNr = 0;
				3675	ctxt->nameMax = 10;
				3676	ctxt->name = NULL;
				3677
				3678	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3679	else {
				3680	ctxt->sax = sax;
				3681	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3682	}
				3683	ctxt->userData = ctxt;
				3684	ctxt->myDoc = NULL;
				3685	ctxt->wellFormed = 1;
				3686	ctxt->replaceEntities = 0;
				3687	ctxt->html = 1;
				3688	ctxt->record_info = 0;
				3689	ctxt->validate = 0;
				3690	ctxt->nbChars = 0;
				3691	ctxt->checkIndex = 0;
				3692	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3693	}
				3694
				3695	/**
				3696	* htmlFreeParserCtxt:
				3697	* @ctxt: an HTML parser context
				3698	*
				3699	* Free all the memory used by a parser context. However the parsed
				3700	* document in ctxt->myDoc is not freed.
				3701	*/
				3702
				3703	void
				3704	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3705	{
				3706	xmlFreeParserCtxt(ctxt);
				3707	}
				3708
				3709	/**
				3710	* htmlCreateDocParserCtxt :
				3711	* @cur: a pointer to an array of xmlChar
				3712	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3713	*
				3714	* Create a parser context for an HTML document.
				3715	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3716	* TODO: check the need to add encoding handling there
				3717	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3718	* Returns the new parser context or NULL
				3719	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3720	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3721	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3722	htmlParserCtxtPtr ctxt;
				3723	htmlParserInputPtr input;
				3724	/* htmlCharEncoding enc; */
				3725
				3726	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				3727	if (ctxt == NULL) {
				3728	perror("malloc");
				3729	return(NULL);
				3730	}
				3731	htmlInitParserCtxt(ctxt);
				3732	input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				3733	if (input == NULL) {
				3734	perror("malloc");
				3735	xmlFree(ctxt);
				3736	return(NULL);
				3737	}
				3738	memset(input, 0, sizeof(htmlParserInput));
				3739
				3740	input->line = 1;
				3741	input->col = 1;
				3742	input->base = cur;
				3743	input->cur = cur;
				3744
				3745	inputPush(ctxt, input);
				3746	return(ctxt);
				3747	}
				3748
				3749	/************************************************************************
				3750	* *
				3751	* Progressive parsing interfaces *
				3752	* *
				3753	************************************************************************/
				3754
				3755	/**
				3756	* htmlParseLookupSequence:
				3757	* @ctxt: an HTML parser context
				3758	* @first: the first char to lookup
				3759	* @next: the next char to lookup or zero
				3760	* @third: the next char to lookup or zero
				3761	*
				3762	* Try to find if a sequence (first, next, third) or just (first next) or
				3763	* (first) is available in the input stream.
				3764	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3765	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3766	* parser, do not use liberally.
				3767	* This is basically similar to xmlParseLookupSequence()
				3768	*
				3769	* Returns the index to the current parsing point if the full sequence
				3770	* is available, -1 otherwise.
				3771	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3772	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3773	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3774	xmlChar next, xmlChar third) {
				3775	int base, len;
				3776	htmlParserInputPtr in;
				3777	const xmlChar *buf;
				3778
				3779	in = ctxt->input;
				3780	if (in == NULL) return(-1);
				3781	base = in->cur - in->base;
				3782	if (base < 0) return(-1);
				3783	if (ctxt->checkIndex > base)
				3784	base = ctxt->checkIndex;
				3785	if (in->buf == NULL) {
				3786	buf = in->base;
				3787	len = in->length;
				3788	} else {
				3789	buf = in->buf->buffer->content;
				3790	len = in->buf->buffer->use;
				3791	}
				3792	/* take into account the sequence length */
				3793	if (third) len -= 2;
				3794	else if (next) len --;
				3795	for (;base < len;base++) {
				3796	if (buf[base] == first) {
				3797	if (third != 0) {
				3798	if ((buf[base + 1] != next) \|\|
				3799	(buf[base + 2] != third)) continue;
				3800	} else if (next != 0) {
				3801	if (buf[base + 1] != next) continue;
				3802	}
				3803	ctxt->checkIndex = 0;
				3804	#ifdef DEBUG_PUSH
				3805	if (next == 0)
				3806	xmlGenericError(xmlGenericErrorContext,
				3807	"HPP: lookup '%c' found at %d\n",
				3808	first, base);
				3809	else if (third == 0)
				3810	xmlGenericError(xmlGenericErrorContext,
				3811	"HPP: lookup '%c%c' found at %d\n",
				3812	first, next, base);
				3813	else
				3814	xmlGenericError(xmlGenericErrorContext,
				3815	"HPP: lookup '%c%c%c' found at %d\n",
				3816	first, next, third, base);
				3817	#endif
				3818	return(base - (in->cur - in->base));
				3819	}
				3820	}
				3821	ctxt->checkIndex = base;
				3822	#ifdef DEBUG_PUSH
				3823	if (next == 0)
				3824	xmlGenericError(xmlGenericErrorContext,
				3825	"HPP: lookup '%c' failed\n", first);
				3826	else if (third == 0)
				3827	xmlGenericError(xmlGenericErrorContext,
				3828	"HPP: lookup '%c%c' failed\n", first, next);
				3829	else
				3830	xmlGenericError(xmlGenericErrorContext,
				3831	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				3832	#endif
				3833	return(-1);
				3834	}
				3835
				3836	/**
				3837	* htmlParseTryOrFinish:
				3838	* @ctxt: an HTML parser context
				3839	* @terminate: last chunk indicator
				3840	*
				3841	* Try to progress on parsing
				3842	*
				3843	* Returns zero if no parsing was possible
				3844	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3845	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3846	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				3847	int ret = 0;
				3848	htmlParserInputPtr in;
				3849	int avail = 0;
				3850	xmlChar cur, next;
				3851
				3852	#ifdef DEBUG_PUSH
				3853	switch (ctxt->instate) {
				3854	case XML_PARSER_EOF:
				3855	xmlGenericError(xmlGenericErrorContext,
				3856	"HPP: try EOF\n"); break;
				3857	case XML_PARSER_START:
				3858	xmlGenericError(xmlGenericErrorContext,
				3859	"HPP: try START\n"); break;
				3860	case XML_PARSER_MISC:
				3861	xmlGenericError(xmlGenericErrorContext,
				3862	"HPP: try MISC\n");break;
				3863	case XML_PARSER_COMMENT:
				3864	xmlGenericError(xmlGenericErrorContext,
				3865	"HPP: try COMMENT\n");break;
				3866	case XML_PARSER_PROLOG:
				3867	xmlGenericError(xmlGenericErrorContext,
				3868	"HPP: try PROLOG\n");break;
				3869	case XML_PARSER_START_TAG:
				3870	xmlGenericError(xmlGenericErrorContext,
				3871	"HPP: try START_TAG\n");break;
				3872	case XML_PARSER_CONTENT:
				3873	xmlGenericError(xmlGenericErrorContext,
				3874	"HPP: try CONTENT\n");break;
				3875	case XML_PARSER_CDATA_SECTION:
				3876	xmlGenericError(xmlGenericErrorContext,
				3877	"HPP: try CDATA_SECTION\n");break;
				3878	case XML_PARSER_END_TAG:
				3879	xmlGenericError(xmlGenericErrorContext,
				3880	"HPP: try END_TAG\n");break;
				3881	case XML_PARSER_ENTITY_DECL:
				3882	xmlGenericError(xmlGenericErrorContext,
				3883	"HPP: try ENTITY_DECL\n");break;
				3884	case XML_PARSER_ENTITY_VALUE:
				3885	xmlGenericError(xmlGenericErrorContext,
				3886	"HPP: try ENTITY_VALUE\n");break;
				3887	case XML_PARSER_ATTRIBUTE_VALUE:
				3888	xmlGenericError(xmlGenericErrorContext,
				3889	"HPP: try ATTRIBUTE_VALUE\n");break;
				3890	case XML_PARSER_DTD:
				3891	xmlGenericError(xmlGenericErrorContext,
				3892	"HPP: try DTD\n");break;
				3893	case XML_PARSER_EPILOG:
				3894	xmlGenericError(xmlGenericErrorContext,
				3895	"HPP: try EPILOG\n");break;
				3896	case XML_PARSER_PI:
				3897	xmlGenericError(xmlGenericErrorContext,
				3898	"HPP: try PI\n");break;
				3899	case XML_PARSER_SYSTEM_LITERAL:
				3900	xmlGenericError(xmlGenericErrorContext,
				3901	"HPP: try SYSTEM_LITERAL\n");break;
				3902	}
				3903	#endif
				3904
				3905	while (1) {
				3906
				3907	in = ctxt->input;
				3908	if (in == NULL) break;
				3909	if (in->buf == NULL)
				3910	avail = in->length - (in->cur - in->base);
				3911	else
				3912	avail = in->buf->buffer->use - (in->cur - in->base);
				3913	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3914	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3915	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				3916	/*
				3917	* SAX: end of the document processing.
				3918	*/
				3919	ctxt->instate = XML_PARSER_EOF;
				3920	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3921	ctxt->sax->endDocument(ctxt->userData);
				3922	}
				3923	}
				3924	if (avail < 1)
				3925	goto done;
				3926	switch (ctxt->instate) {
				3927	case XML_PARSER_EOF:
				3928	/*
				3929	* Document parsing is done !
				3930	*/
				3931	goto done;
				3932	case XML_PARSER_START:
				3933	/*
				3934	* Very first chars read from the document flow.
				3935	*/
				3936	cur = in->cur[0];
				3937	if (IS_BLANK(cur)) {
				3938	SKIP_BLANKS;
				3939	if (in->buf == NULL)
				3940	avail = in->length - (in->cur - in->base);
				3941	else
				3942	avail = in->buf->buffer->use - (in->cur - in->base);
				3943	}
				3944	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3945	ctxt->sax->setDocumentLocator(ctxt->userData,
				3946	&xmlDefaultSAXLocator);
				3947	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				3948	(!ctxt->disableSAX))
				3949	ctxt->sax->startDocument(ctxt->userData);
				3950
				3951	cur = in->cur[0];
				3952	next = in->cur[1];
				3953	if ((cur == '<') && (next == '!') &&
				3954	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3955	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3956	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3957	(UPP(8) == 'E')) {
				3958	if ((!terminate) &&
				3959	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				3960	goto done;
				3961	#ifdef DEBUG_PUSH
				3962	xmlGenericError(xmlGenericErrorContext,
				3963	"HPP: Parsing internal subset\n");
				3964	#endif
				3965	htmlParseDocTypeDecl(ctxt);
				3966	ctxt->instate = XML_PARSER_PROLOG;
				3967	#ifdef DEBUG_PUSH
				3968	xmlGenericError(xmlGenericErrorContext,
				3969	"HPP: entering PROLOG\n");
				3970	#endif
				3971	} else {
				3972	ctxt->instate = XML_PARSER_MISC;
				3973	}
				3974	#ifdef DEBUG_PUSH
				3975	xmlGenericError(xmlGenericErrorContext,
				3976	"HPP: entering MISC\n");
				3977	#endif
				3978	break;
				3979	case XML_PARSER_MISC:
				3980	SKIP_BLANKS;
				3981	if (in->buf == NULL)
				3982	avail = in->length - (in->cur - in->base);
				3983	else
				3984	avail = in->buf->buffer->use - (in->cur - in->base);
				3985	if (avail < 2)
				3986	goto done;
				3987	cur = in->cur[0];
				3988	next = in->cur[1];
				3989	if ((cur == '<') && (next == '!') &&
				3990	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				3991	if ((!terminate) &&
				3992	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				3993	goto done;
				3994	#ifdef DEBUG_PUSH
				3995	xmlGenericError(xmlGenericErrorContext,
				3996	"HPP: Parsing Comment\n");
				3997	#endif
				3998	htmlParseComment(ctxt);
				3999	ctxt->instate = XML_PARSER_MISC;
				4000	} else if ((cur == '<') && (next == '!') &&
				4001	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4002	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4003	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4004	(UPP(8) == 'E')) {
				4005	if ((!terminate) &&
				4006	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4007	goto done;
				4008	#ifdef DEBUG_PUSH
				4009	xmlGenericError(xmlGenericErrorContext,
				4010	"HPP: Parsing internal subset\n");
				4011	#endif
				4012	htmlParseDocTypeDecl(ctxt);
				4013	ctxt->instate = XML_PARSER_PROLOG;
				4014	#ifdef DEBUG_PUSH
				4015	xmlGenericError(xmlGenericErrorContext,
				4016	"HPP: entering PROLOG\n");
				4017	#endif
				4018	} else if ((cur == '<') && (next == '!') &&
				4019	(avail < 9)) {
				4020	goto done;
				4021	} else {
				4022	ctxt->instate = XML_PARSER_START_TAG;
				4023	#ifdef DEBUG_PUSH
				4024	xmlGenericError(xmlGenericErrorContext,
				4025	"HPP: entering START_TAG\n");
				4026	#endif
				4027	}
				4028	break;
				4029	case XML_PARSER_PROLOG:
				4030	SKIP_BLANKS;
				4031	if (in->buf == NULL)
				4032	avail = in->length - (in->cur - in->base);
				4033	else
				4034	avail = in->buf->buffer->use - (in->cur - in->base);
				4035	if (avail < 2)
				4036	goto done;
				4037	cur = in->cur[0];
				4038	next = in->cur[1];
				4039	if ((cur == '<') && (next == '!') &&
				4040	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4041	if ((!terminate) &&
				4042	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4043	goto done;
				4044	#ifdef DEBUG_PUSH
				4045	xmlGenericError(xmlGenericErrorContext,
				4046	"HPP: Parsing Comment\n");
				4047	#endif
				4048	htmlParseComment(ctxt);
				4049	ctxt->instate = XML_PARSER_PROLOG;
				4050	} else if ((cur == '<') && (next == '!') &&
				4051	(avail < 4)) {
				4052	goto done;
				4053	} else {
				4054	ctxt->instate = XML_PARSER_START_TAG;
				4055	#ifdef DEBUG_PUSH
				4056	xmlGenericError(xmlGenericErrorContext,
				4057	"HPP: entering START_TAG\n");
				4058	#endif
				4059	}
				4060	break;
				4061	case XML_PARSER_EPILOG:
				4062	if (in->buf == NULL)
				4063	avail = in->length - (in->cur - in->base);
				4064	else
				4065	avail = in->buf->buffer->use - (in->cur - in->base);
				4066	if (avail < 1)
				4067	goto done;
				4068	cur = in->cur[0];
				4069	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4070	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4071	goto done;
				4072	}
				4073	if (avail < 2)
				4074	goto done;
				4075	next = in->cur[1];
				4076	if ((cur == '<') && (next == '!') &&
				4077	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4078	if ((!terminate) &&
				4079	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4080	goto done;
				4081	#ifdef DEBUG_PUSH
				4082	xmlGenericError(xmlGenericErrorContext,
				4083	"HPP: Parsing Comment\n");
				4084	#endif
				4085	htmlParseComment(ctxt);
				4086	ctxt->instate = XML_PARSER_EPILOG;
				4087	} else if ((cur == '<') && (next == '!') &&
				4088	(avail < 4)) {
				4089	goto done;
				4090	} else {
				4091	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4092	ctxt->wellFormed = 0;
				4093	ctxt->instate = XML_PARSER_EOF;
				4094	#ifdef DEBUG_PUSH
				4095	xmlGenericError(xmlGenericErrorContext,
				4096	"HPP: entering EOF\n");
				4097	#endif
				4098	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4099	ctxt->sax->endDocument(ctxt->userData);
				4100	goto done;
				4101	}
				4102	break;
				4103	case XML_PARSER_START_TAG: {
				4104	xmlChar name, oldname;
				4105	int depth = ctxt->nameNr;
				4106	htmlElemDescPtr info;
				4107
				4108	if (avail < 2)
				4109	goto done;
				4110	cur = in->cur[0];
				4111	if (cur != '<') {
				4112	ctxt->instate = XML_PARSER_CONTENT;
				4113	#ifdef DEBUG_PUSH
				4114	xmlGenericError(xmlGenericErrorContext,
				4115	"HPP: entering CONTENT\n");
				4116	#endif
				4117	break;
				4118	}
				4119	if ((!terminate) &&
				4120	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4121	goto done;
				4122
				4123	oldname = xmlStrdup(ctxt->name);
				4124	htmlParseStartTag(ctxt);
				4125	name = ctxt->name;
				4126	#ifdef DEBUG
				4127	if (oldname == NULL)
				4128	xmlGenericError(xmlGenericErrorContext,
				4129	"Start of element %s\n", name);
				4130	else if (name == NULL)
				4131	xmlGenericError(xmlGenericErrorContext,
				4132	"Start of element failed, was %s\n",
				4133	oldname);
				4134	else
				4135	xmlGenericError(xmlGenericErrorContext,
				4136	"Start of element %s, was %s\n",
				4137	name, oldname);
				4138	#endif
				4139	if (((depth == ctxt->nameNr) &&
				4140	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4141	(name == NULL)) {
				4142	if (CUR == '>')
				4143	NEXT;
				4144	if (oldname != NULL)
				4145	xmlFree(oldname);
				4146	break;
				4147	}
				4148	if (oldname != NULL)
				4149	xmlFree(oldname);
				4150
				4151	/*
				4152	* Lookup the info for that element.
				4153	*/
				4154	info = htmlTagLookup(name);
				4155	if (info == NULL) {
				4156	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4157	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4158	name);
				4159	ctxt->wellFormed = 0;
				4160	} else if (info->depr) {
				4161	/***************************
				4162	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4163	ctxt->sax->warning(ctxt->userData,
				4164	"Tag %s is deprecated\n",
				4165	name);
				4166	***************************/
				4167	}
				4168
				4169	/*
				4170	* Check for an Empty Element labelled the XML/SGML way
				4171	*/
				4172	if ((CUR == '/') && (NXT(1) == '>')) {
				4173	SKIP(2);
				4174	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4175	ctxt->sax->endElement(ctxt->userData, name);
				4176	oldname = htmlnamePop(ctxt);
				4177	#ifdef DEBUG
				4178	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4179	oldname);
				4180	#endif
				4181	if (oldname != NULL)
				4182	xmlFree(oldname);
				4183	ctxt->instate = XML_PARSER_CONTENT;
				4184	#ifdef DEBUG_PUSH
				4185	xmlGenericError(xmlGenericErrorContext,
				4186	"HPP: entering CONTENT\n");
				4187	#endif
				4188	break;
				4189	}
				4190
				4191	if (CUR == '>') {
				4192	NEXT;
				4193	} else {
				4194	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4195	ctxt->sax->error(ctxt->userData,
				4196	"Couldn't find end of Start Tag %s\n",
				4197	name);
				4198	ctxt->wellFormed = 0;
				4199
				4200	/*
				4201	* end of parsing of this node.
				4202	*/
				4203	if (xmlStrEqual(name, ctxt->name)) {
				4204	nodePop(ctxt);
				4205	oldname = htmlnamePop(ctxt);
				4206	#ifdef DEBUG
				4207	xmlGenericError(xmlGenericErrorContext,
				4208	"End of start tag problem: popping out %s\n", oldname);
				4209	#endif
				4210	if (oldname != NULL)
				4211	xmlFree(oldname);
				4212	}
				4213
				4214	ctxt->instate = XML_PARSER_CONTENT;
				4215	#ifdef DEBUG_PUSH
				4216	xmlGenericError(xmlGenericErrorContext,
				4217	"HPP: entering CONTENT\n");
				4218	#endif
				4219	break;
				4220	}
				4221
				4222	/*
				4223	* Check for an Empty Element from DTD definition
				4224	*/
				4225	if ((info != NULL) && (info->empty)) {
				4226	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4227	ctxt->sax->endElement(ctxt->userData, name);
				4228	oldname = htmlnamePop(ctxt);
				4229	#ifdef DEBUG
				4230	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4231	#endif
				4232	if (oldname != NULL)
				4233	xmlFree(oldname);
				4234	}
				4235	ctxt->instate = XML_PARSER_CONTENT;
				4236	#ifdef DEBUG_PUSH
				4237	xmlGenericError(xmlGenericErrorContext,
				4238	"HPP: entering CONTENT\n");
				4239	#endif
				4240	break;
				4241	}
				4242	case XML_PARSER_CONTENT: {
				4243	long cons;
				4244	/*
				4245	* Handle preparsed entities and charRef
				4246	*/
				4247	if (ctxt->token != 0) {
				4248	xmlChar chr[2] = { 0 , 0 } ;
				4249
				4250	chr[0] = (xmlChar) ctxt->token;
				4251	htmlCheckParagraph(ctxt);
				4252	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4253	ctxt->sax->characters(ctxt->userData, chr, 1);
				4254	ctxt->token = 0;
				4255	ctxt->checkIndex = 0;
				4256	}
				4257	if ((avail == 1) && (terminate)) {
				4258	cur = in->cur[0];
				4259	if ((cur != '<') && (cur != '&')) {
				4260	if (ctxt->sax != NULL) {
				4261	if (IS_BLANK(cur)) {
				4262	if (ctxt->sax->ignorableWhitespace != NULL)
				4263	ctxt->sax->ignorableWhitespace(
				4264	ctxt->userData, &cur, 1);
				4265	} else {
				4266	htmlCheckParagraph(ctxt);
				4267	if (ctxt->sax->characters != NULL)
				4268	ctxt->sax->characters(
				4269	ctxt->userData, &cur, 1);
				4270	}
				4271	}
				4272	ctxt->token = 0;
				4273	ctxt->checkIndex = 0;
				4274	NEXT;
				4275	}
				4276	break;
				4277	}
				4278	if (avail < 2)
				4279	goto done;
				4280	cur = in->cur[0];
				4281	next = in->cur[1];
				4282	cons = ctxt->nbChars;
				4283	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4284	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4285	/*
				4286	* Handle SCRIPT/STYLE separately
				4287	*/
				4288	if ((!terminate) &&
				4289	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4290	goto done;
				4291	htmlParseScript(ctxt);
				4292	if ((cur == '<') && (next == '/')) {
				4293	ctxt->instate = XML_PARSER_END_TAG;
				4294	ctxt->checkIndex = 0;
				4295	#ifdef DEBUG_PUSH
				4296	xmlGenericError(xmlGenericErrorContext,
				4297	"HPP: entering END_TAG\n");
				4298	#endif
				4299	break;
				4300	}
				4301	} else {
				4302	/*
				4303	* Sometimes DOCTYPE arrives in the middle of the document
				4304	*/
				4305	if ((cur == '<') && (next == '!') &&
				4306	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4307	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4308	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4309	(UPP(8) == 'E')) {
				4310	if ((!terminate) &&
				4311	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4312	goto done;
				4313	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4314	ctxt->sax->error(ctxt->userData,
				4315	"Misplaced DOCTYPE declaration\n");
				4316	ctxt->wellFormed = 0;
				4317	htmlParseDocTypeDecl(ctxt);
				4318	} else if ((cur == '<') && (next == '!') &&
				4319	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4320	if ((!terminate) &&
				4321	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4322	goto done;
				4323	#ifdef DEBUG_PUSH
				4324	xmlGenericError(xmlGenericErrorContext,
				4325	"HPP: Parsing Comment\n");
				4326	#endif
				4327	htmlParseComment(ctxt);
				4328	ctxt->instate = XML_PARSER_CONTENT;
				4329	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4330	goto done;
				4331	} else if ((cur == '<') && (next == '/')) {
				4332	ctxt->instate = XML_PARSER_END_TAG;
				4333	ctxt->checkIndex = 0;
				4334	#ifdef DEBUG_PUSH
				4335	xmlGenericError(xmlGenericErrorContext,
				4336	"HPP: entering END_TAG\n");
				4337	#endif
				4338	break;
				4339	} else if (cur == '<') {
				4340	ctxt->instate = XML_PARSER_START_TAG;
				4341	ctxt->checkIndex = 0;
				4342	#ifdef DEBUG_PUSH
				4343	xmlGenericError(xmlGenericErrorContext,
				4344	"HPP: entering START_TAG\n");
				4345	#endif
				4346	break;
				4347	} else if (cur == '&') {
				4348	if ((!terminate) &&
				4349	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4350	goto done;
				4351	#ifdef DEBUG_PUSH
				4352	xmlGenericError(xmlGenericErrorContext,
				4353	"HPP: Parsing Reference\n");
				4354	#endif
				4355	/* TODO: check generation of subtrees if noent !!! */
				4356	htmlParseReference(ctxt);
				4357	} else {
				4358	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4359	/*
				4360	* Goal of the following test is :
				4361	* - minimize calls to the SAX 'character' callback
				4362	* when they are mergeable
				4363	*/
				4364	if ((ctxt->inputNr == 1) &&
				4365	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4366	if ((!terminate) &&
				4367	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4368	goto done;
				4369	}
				4370	ctxt->checkIndex = 0;
				4371	#ifdef DEBUG_PUSH
				4372	xmlGenericError(xmlGenericErrorContext,
				4373	"HPP: Parsing char data\n");
				4374	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4375	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4376	}
				4377	}
				4378	if (cons == ctxt->nbChars) {
				4379	if (ctxt->node != NULL) {
				4380	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4381	ctxt->sax->error(ctxt->userData,
				4382	"detected an error in element content\n");
				4383	ctxt->wellFormed = 0;
				4384	}
				4385	NEXT;
				4386	break;
				4387	}
				4388
				4389	break;
				4390	}
				4391	case XML_PARSER_END_TAG:
				4392	if (avail < 2)
				4393	goto done;
				4394	if ((!terminate) &&
				4395	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4396	goto done;
				4397	htmlParseEndTag(ctxt);
				4398	if (ctxt->nameNr == 0) {
				4399	ctxt->instate = XML_PARSER_EPILOG;
				4400	} else {
				4401	ctxt->instate = XML_PARSER_CONTENT;
				4402	}
				4403	ctxt->checkIndex = 0;
				4404	#ifdef DEBUG_PUSH
				4405	xmlGenericError(xmlGenericErrorContext,
				4406	"HPP: entering CONTENT\n");
				4407	#endif
				4408	break;
				4409	case XML_PARSER_CDATA_SECTION:
				4410	xmlGenericError(xmlGenericErrorContext,
				4411	"HPP: internal error, state == CDATA\n");
				4412	ctxt->instate = XML_PARSER_CONTENT;
				4413	ctxt->checkIndex = 0;
				4414	#ifdef DEBUG_PUSH
				4415	xmlGenericError(xmlGenericErrorContext,
				4416	"HPP: entering CONTENT\n");
				4417	#endif
				4418	break;
				4419	case XML_PARSER_DTD:
				4420	xmlGenericError(xmlGenericErrorContext,
				4421	"HPP: internal error, state == DTD\n");
				4422	ctxt->instate = XML_PARSER_CONTENT;
				4423	ctxt->checkIndex = 0;
				4424	#ifdef DEBUG_PUSH
				4425	xmlGenericError(xmlGenericErrorContext,
				4426	"HPP: entering CONTENT\n");
				4427	#endif
				4428	break;
				4429	case XML_PARSER_COMMENT:
				4430	xmlGenericError(xmlGenericErrorContext,
				4431	"HPP: internal error, state == COMMENT\n");
				4432	ctxt->instate = XML_PARSER_CONTENT;
				4433	ctxt->checkIndex = 0;
				4434	#ifdef DEBUG_PUSH
				4435	xmlGenericError(xmlGenericErrorContext,
				4436	"HPP: entering CONTENT\n");
				4437	#endif
				4438	break;
				4439	case XML_PARSER_PI:
				4440	xmlGenericError(xmlGenericErrorContext,
				4441	"HPP: internal error, state == PI\n");
				4442	ctxt->instate = XML_PARSER_CONTENT;
				4443	ctxt->checkIndex = 0;
				4444	#ifdef DEBUG_PUSH
				4445	xmlGenericError(xmlGenericErrorContext,
				4446	"HPP: entering CONTENT\n");
				4447	#endif
				4448	break;
				4449	case XML_PARSER_ENTITY_DECL:
				4450	xmlGenericError(xmlGenericErrorContext,
				4451	"HPP: internal error, state == ENTITY_DECL\n");
				4452	ctxt->instate = XML_PARSER_CONTENT;
				4453	ctxt->checkIndex = 0;
				4454	#ifdef DEBUG_PUSH
				4455	xmlGenericError(xmlGenericErrorContext,
				4456	"HPP: entering CONTENT\n");
				4457	#endif
				4458	break;
				4459	case XML_PARSER_ENTITY_VALUE:
				4460	xmlGenericError(xmlGenericErrorContext,
				4461	"HPP: internal error, state == ENTITY_VALUE\n");
				4462	ctxt->instate = XML_PARSER_CONTENT;
				4463	ctxt->checkIndex = 0;
				4464	#ifdef DEBUG_PUSH
				4465	xmlGenericError(xmlGenericErrorContext,
				4466	"HPP: entering DTD\n");
				4467	#endif
				4468	break;
				4469	case XML_PARSER_ATTRIBUTE_VALUE:
				4470	xmlGenericError(xmlGenericErrorContext,
				4471	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4472	ctxt->instate = XML_PARSER_START_TAG;
				4473	ctxt->checkIndex = 0;
				4474	#ifdef DEBUG_PUSH
				4475	xmlGenericError(xmlGenericErrorContext,
				4476	"HPP: entering START_TAG\n");
				4477	#endif
				4478	break;
				4479	case XML_PARSER_SYSTEM_LITERAL:
				4480	xmlGenericError(xmlGenericErrorContext,
				4481	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4482	ctxt->instate = XML_PARSER_CONTENT;
				4483	ctxt->checkIndex = 0;
				4484	#ifdef DEBUG_PUSH
				4485	xmlGenericError(xmlGenericErrorContext,
				4486	"HPP: entering CONTENT\n");
				4487	#endif
				4488	break;
				4489	case XML_PARSER_IGNORE:
				4490	xmlGenericError(xmlGenericErrorContext,
				4491	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4492	ctxt->instate = XML_PARSER_CONTENT;
				4493	ctxt->checkIndex = 0;
				4494	#ifdef DEBUG_PUSH
				4495	xmlGenericError(xmlGenericErrorContext,
				4496	"HPP: entering CONTENT\n");
				4497	#endif
				4498	break;
				4499	}
				4500	}
				4501	done:
				4502	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4503	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4504	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4505	/*
				4506	* SAX: end of the document processing.
				4507	*/
				4508	ctxt->instate = XML_PARSER_EOF;
				4509	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4510	ctxt->sax->endDocument(ctxt->userData);
				4511	}
				4512	}
				4513	if ((ctxt->myDoc != NULL) &&
				4514	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4515	(ctxt->instate == XML_PARSER_EPILOG))) {
				4516	xmlDtdPtr dtd;
				4517	dtd = xmlGetIntSubset(ctxt->myDoc);
				4518	if (dtd == NULL)
				4519	ctxt->myDoc->intSubset =
				4520	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4521	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4522	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4523	}
				4524	#ifdef DEBUG_PUSH
				4525	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4526	#endif
				4527	return(ret);
				4528	}
				4529
				4530	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4531	* htmlParseChunk:
				4532	* @ctxt: an XML parser context
				4533	* @chunk: an char array
				4534	* @size: the size in byte of the chunk
				4535	* @terminate: last chunk indicator
				4536	*
				4537	* Parse a Chunk of memory
				4538	*
				4539	* Returns zero if no error, the xmlParserErrors otherwise.
				4540	*/
				4541	int
				4542	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4543	int terminate) {
				4544	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4545	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4546	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4547	int cur = ctxt->input->cur - ctxt->input->base;
				4548
				4549	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4550	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4551	ctxt->input->cur = ctxt->input->base + cur;
				4552	#ifdef DEBUG_PUSH
				4553	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4554	#endif
				4555
				4556	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4557	htmlParseTryOrFinish(ctxt, terminate);
				4558	} else if (ctxt->instate != XML_PARSER_EOF) {
				4559	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4560	htmlParseTryOrFinish(ctxt, terminate);
				4561	}
				4562	if (terminate) {
				4563	if ((ctxt->instate != XML_PARSER_EOF) &&
				4564	(ctxt->instate != XML_PARSER_EPILOG) &&
				4565	(ctxt->instate != XML_PARSER_MISC)) {
				4566	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4567	ctxt->wellFormed = 0;
				4568	}
				4569	if (ctxt->instate != XML_PARSER_EOF) {
				4570	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4571	ctxt->sax->endDocument(ctxt->userData);
				4572	}
				4573	ctxt->instate = XML_PARSER_EOF;
				4574	}
				4575	return((xmlParserErrors) ctxt->errNo);
				4576	}
				4577
				4578	/************************************************************************
				4579	* *
				4580	* User entry points *
				4581	* *
				4582	************************************************************************/
				4583
				4584	/**
				4585	* htmlCreatePushParserCtxt :
				4586	* @sax: a SAX handler
				4587	* @user_data: The user data returned on SAX callbacks
				4588	* @chunk: a pointer to an array of chars
				4589	* @size: number of chars in the array
				4590	* @filename: an optional file name or URI
				4591	* @enc: an optional encoding
				4592	*
				4593	* Create a parser context for using the HTML parser in push mode
				4594	* To allow content encoding detection, @size should be >= 4
				4595	* The value of @filename is used for fetching external entities
				4596	* and error/warning reports.
				4597	*
				4598	* Returns the new parser context or NULL
				4599	*/
				4600	htmlParserCtxtPtr
				4601	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4602	const char chunk, int size, const char filename,
				4603	xmlCharEncoding enc) {
				4604	htmlParserCtxtPtr ctxt;
				4605	htmlParserInputPtr inputStream;
				4606	xmlParserInputBufferPtr buf;
				4607
				4608	buf = xmlAllocParserInputBuffer(enc);
				4609	if (buf == NULL) return(NULL);
				4610
				4611	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4612	if (ctxt == NULL) {
				4613	xmlFree(buf);
				4614	return(NULL);
				4615	}
				4616	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4617	htmlInitParserCtxt(ctxt);
				4618	if (sax != NULL) {
				4619	if (ctxt->sax != &htmlDefaultSAXHandler)
				4620	xmlFree(ctxt->sax);
				4621	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4622	if (ctxt->sax == NULL) {
				4623	xmlFree(buf);
				4624	xmlFree(ctxt);
				4625	return(NULL);
				4626	}
				4627	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4628	if (user_data != NULL)
				4629	ctxt->userData = user_data;
				4630	}
				4631	if (filename == NULL) {
				4632	ctxt->directory = NULL;
				4633	} else {
				4634	ctxt->directory = xmlParserGetDirectory(filename);
				4635	}
				4636
				4637	inputStream = htmlNewInputStream(ctxt);
				4638	if (inputStream == NULL) {
				4639	xmlFreeParserCtxt(ctxt);
				4640	return(NULL);
				4641	}
				4642
				4643	if (filename == NULL)
				4644	inputStream->filename = NULL;
				4645	else
				4646	inputStream->filename = xmlMemStrdup(filename);
				4647	inputStream->buf = buf;
				4648	inputStream->base = inputStream->buf->buffer->content;
				4649	inputStream->cur = inputStream->buf->buffer->content;
				4650
				4651	inputPush(ctxt, inputStream);
				4652
				4653	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4654	(ctxt->input->buf != NULL)) {
				4655	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4656	#ifdef DEBUG_PUSH
				4657	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4658	#endif
				4659	}
				4660
				4661	return(ctxt);
				4662	}
				4663
				4664	/**
				4665	* htmlSAXParseDoc :
				4666	* @cur: a pointer to an array of xmlChar
				4667	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4668	* @sax: the SAX handler block
				4669	* @userData: if using SAX, this pointer will be provided on callbacks.
				4670	*
				4671	* parse an HTML in-memory document and build a tree.
				4672	* It use the given SAX function block to handle the parsing callback.
				4673	* If sax is NULL, fallback to the default DOM tree building routines.
				4674	*
				4675	* Returns the resulting document tree
				4676	*/
				4677
				4678	htmlDocPtr
				4679	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4680	htmlDocPtr ret;
				4681	htmlParserCtxtPtr ctxt;
				4682
				4683	if (cur == NULL) return(NULL);
				4684
				4685
				4686	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4687	if (ctxt == NULL) return(NULL);
				4688	if (sax != NULL) {
				4689	ctxt->sax = sax;
				4690	ctxt->userData = userData;
				4691	}
				4692
				4693	htmlParseDocument(ctxt);
				4694	ret = ctxt->myDoc;
				4695	if (sax != NULL) {
				4696	ctxt->sax = NULL;
				4697	ctxt->userData = NULL;
				4698	}
				4699	htmlFreeParserCtxt(ctxt);
				4700
				4701	return(ret);
				4702	}
				4703
				4704	/**
				4705	* htmlParseDoc :
				4706	* @cur: a pointer to an array of xmlChar
				4707	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4708	*
				4709	* parse an HTML in-memory document and build a tree.
				4710	*
				4711	* Returns the resulting document tree
				4712	*/
				4713
				4714	htmlDocPtr
				4715	htmlParseDoc(xmlChar cur, const char encoding) {
				4716	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4717	}
				4718
				4719
				4720	/**
				4721	* htmlCreateFileParserCtxt :
				4722	* @filename: the filename
				4723	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4724	*
				4725	* Create a parser context for a file content.
				4726	* Automatic support for ZLIB/Compress compressed document is provided
				4727	* by default if found at compile-time.
				4728	*
				4729	* Returns the new parser context or NULL
				4730	*/
				4731	htmlParserCtxtPtr
				4732	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4733	{
				4734	htmlParserCtxtPtr ctxt;
				4735	htmlParserInputPtr inputStream;
				4736	xmlParserInputBufferPtr buf;
				4737	/* htmlCharEncoding enc; */
				4738	xmlChar content, content_line = (xmlChar *) "charset=";
				4739
				4740	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4741	if (buf == NULL) return(NULL);
				4742
				4743	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4744	if (ctxt == NULL) {
				4745	perror("malloc");
				4746	return(NULL);
				4747	}
				4748	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4749	htmlInitParserCtxt(ctxt);
				4750	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4751	if (inputStream == NULL) {
				4752	perror("malloc");
				4753	xmlFree(ctxt);
				4754	return(NULL);
				4755	}
				4756	memset(inputStream, 0, sizeof(htmlParserInput));
				4757
				4758	inputStream->filename = xmlMemStrdup(filename);
				4759	inputStream->line = 1;
				4760	inputStream->col = 1;
				4761	inputStream->buf = buf;
				4762	inputStream->directory = NULL;
				4763
				4764	inputStream->base = inputStream->buf->buffer->content;
				4765	inputStream->cur = inputStream->buf->buffer->content;
				4766	inputStream->free = NULL;
				4767
				4768	inputPush(ctxt, inputStream);
				4769
				4770	/* set encoding */
				4771	if (encoding) {
				4772	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4773	if (content) {
				4774	strcpy ((char )content, (char )content_line);
				4775	strcat ((char )content, (char )encoding);
				4776	htmlCheckEncoding (ctxt, content);
				4777	xmlFree (content);
				4778	}
				4779	}
				4780
				4781	return(ctxt);
				4782	}
				4783
				4784	/**
				4785	* htmlSAXParseFile :
				4786	* @filename: the filename
				4787	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4788	* @sax: the SAX handler block
				4789	* @userData: if using SAX, this pointer will be provided on callbacks.
				4790	*
				4791	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4792	* compressed document is provided by default if found at compile-time.
				4793	* It use the given SAX function block to handle the parsing callback.
				4794	* If sax is NULL, fallback to the default DOM tree building routines.
				4795	*
				4796	* Returns the resulting document tree
				4797	*/
				4798
				4799	htmlDocPtr
				4800	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				4801	void *userData) {
				4802	htmlDocPtr ret;
				4803	htmlParserCtxtPtr ctxt;
				4804	htmlSAXHandlerPtr oldsax = NULL;
				4805
				4806	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				4807	if (ctxt == NULL) return(NULL);
				4808	if (sax != NULL) {
				4809	oldsax = ctxt->sax;
				4810	ctxt->sax = sax;
				4811	ctxt->userData = userData;
				4812	}
				4813
				4814	htmlParseDocument(ctxt);
				4815
				4816	ret = ctxt->myDoc;
				4817	if (sax != NULL) {
				4818	ctxt->sax = oldsax;
				4819	ctxt->userData = NULL;
				4820	}
				4821	htmlFreeParserCtxt(ctxt);
				4822
				4823	return(ret);
				4824	}
				4825
				4826	/**
				4827	* htmlParseFile :
				4828	* @filename: the filename
				4829	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4830	*
				4831	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4832	* compressed document is provided by default if found at compile-time.
				4833	*
				4834	* Returns the resulting document tree
				4835	*/
				4836
				4837	htmlDocPtr
				4838	htmlParseFile(const char filename, const char encoding) {
				4839	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				4840	}
				4841
				4842	/**
				4843	* htmlHandleOmittedElem:
				4844	* @val: int 0 or 1
				4845	*
				4846	* Set and return the previous value for handling HTML omitted tags.
				4847	*
				4848	* Returns the last value for 0 for no handling, 1 for auto insertion.
				4849	*/
				4850
				4851	int
				4852	htmlHandleOmittedElem(int val) {
				4853	int old = htmlOmittedDefaultValue;
				4854
				4855	htmlOmittedDefaultValue = val;
				4856	return(old);
				4857	}
				4858
				4859	#endif /* LIBXML_HTML_ENABLED */