Blame - HTMLparser.c - fp2-dev/platform/external/libxml2

blob: 870201dd4a25061d79217c0c6bcc2be1fd52e7bd [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
				6	* Daniel.Veillard@w3.org
				7	*/
				8
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	9	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	10	#ifdef LIBXML_HTML_ENABLED
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	11
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	12	#include <string.h>
				13	#ifdef HAVE_CTYPE_H
				14	#include <ctype.h>
				15	#endif
				16	#ifdef HAVE_STDLIB_H
				17	#include <stdlib.h>
				18	#endif
				19	#ifdef HAVE_SYS_STAT_H
				20	#include <sys/stat.h>
				21	#endif
				22	#ifdef HAVE_FCNTL_H
				23	#include <fcntl.h>
				24	#endif
				25	#ifdef HAVE_UNISTD_H
				26	#include <unistd.h>
				27	#endif
				28	#ifdef HAVE_ZLIB_H
				29	#include <zlib.h>
				30	#endif
				31
				32	#include <libxml/xmlmemory.h>
				33	#include <libxml/tree.h>
				34	#include <libxml/parser.h>
				35	#include <libxml/parserInternals.h>
				36	#include <libxml/xmlerror.h>
				37	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	38	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	39	#include <libxml/entities.h>
				40	#include <libxml/encoding.h>
				41	#include <libxml/valid.h>
				42	#include <libxml/xmlIO.h>
				43
/*
 * Size limits used by the HTML parser.
 * NOTE(review): the code consuming these constants is outside this part of
 * the file -- confirm their exact role before changing the values.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Uncomment to compile in the "#ifdef DEBUG" trace output of this file. */
/* #define DEBUG */
/* NOTE(review): presumably enables traces in the push parser -- confirm. */
/* #define DEBUG_PUSH */

/*
 * Global, non-static setting initialised to 1.
 * NOTE(review): its readers are not visible here; verify its meaning
 * against the code that tests it.
 */
int htmlOmittedDefaultValue = 1;

/* Forward declaration; the definition is not in this part of the file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
                             xmlChar end, xmlChar end2, xmlChar end3);
				55
				56	/************************************************************************
				57	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	58	* Parser stacks related functions and macros *
				59	* *
				60	************************************************************************/
				61
				62	/*
				63	* Generic function for accessing stacks in the Parser Context
				64	*/
				65
				66	#define PUSH_AND_POP(scope, type, name) \
				67	scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				68	if (ctxt->name##Nr >= ctxt->name##Max) { \
				69	ctxt->name##Max *= 2; \
				70	ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
				71	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				72	if (ctxt->name##Tab == NULL) { \
				73	xmlGenericError(xmlGenericErrorContext, \
				74	"realloc failed !\n"); \
				75	return(0); \
				76	} \
				77	} \
				78	ctxt->name##Tab[ctxt->name##Nr] = value; \
				79	ctxt->name = value; \
				80	return(ctxt->name##Nr++); \
				81	} \
				82	scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				83	type ret; \
				84	if (ctxt->name##Nr < 0) return(0); \
				85	ctxt->name##Nr--; \
				86	if (ctxt->name##Nr < 0) return(0); \
				87	if (ctxt->name##Nr > 0) \
				88	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				89	else \
				90	ctxt->name = NULL; \
				91	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				92	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				93	return(ret); \
				94	} \
				95
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	96	/* PUSH_AND_POP(static, xmlNodePtr, node) */
				97	PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	98
				99	/*
				100	* Macros for accessing the content. Those should be used only by the parser,
				101	* and not exported.
				102	*
				103	* Dirty macros, i.e. one need to make assumption on the context to use them
				104	*
				105	* CUR_PTR return the current pointer to the xmlChar to be parsed.
				106	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
				107	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				108	* in UNICODE mode. This should be used internally by the parser
				109	* only to compare to ASCII values otherwise it would break when
				110	* running with UTF-8 encoding.
				111	* NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only
				112	* to compare on ASCII based substring.
				113	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
				114	* it should be used only to compare on ASCII based substring.
				115	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				116	* strings within the parser.
				117	*
				118	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				119	*
				120	* CURRENT Returns the current char value, with the full decoding of
				121	* UTF-8 if we are using this mode. It returns an int.
				122	* NEXT Skip to the next character, this does the proper decoding
				123	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
				124	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				125	*/
				126
				127	#define UPPER (toupper(*ctxt->input->cur))
				128
				129	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				130
				131	#define NXT(val) ctxt->input->cur[(val)]
				132
				133	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				134
				135	#define CUR_PTR ctxt->input->cur
				136
				137	#define SHRINK xmlParserInputShrink(ctxt->input)
				138
				139	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				140
				141	#define CURRENT ((int) (*ctxt->input->cur))
				142
				143	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				144
				145	/* Inported from XML */
				146
				147	/* #define CUR (ctxt->token ? ctxt->token : (int) (ctxt->input->cur)) /
				148	#define CUR ((int) (*ctxt->input->cur))
				149	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				150
				151	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
				152	#define NXT(val) ctxt->input->cur[(val)]
				153	#define CUR_PTR ctxt->input->cur
				154
				155
				156	#define NEXTL(l) do { \
				157	if (*(ctxt->input->cur) == '\n') { \
				158	ctxt->input->line++; ctxt->input->col = 1; \
				159	} else ctxt->input->col++; \
				160	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				161	} while (0)
				162
				163	/************
				164	\
				165	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				166	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				167	************/
				168
				169	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				170	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				171
				172	#define COPY_BUF(l,b,i,v) \
				173	if (l == 1) b[i++] = (xmlChar) v; \
				174	else i += xmlCopyChar(l,&b[i],v)
				175
				176	/**
				177	* htmlCurrentChar:
				178	* @ctxt: the HTML parser context
				179	* @len: pointer to the length of the char read
				180	*
				181	* The current char value, if using UTF-8 this may actaully span multiple
				182	* bytes in the input buffer. Implement the end of line normalization:
				183	* 2.11 End-of-Line Handling
				184	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				185	* char, then the encoding converter is plugged in automatically.
				186	*
				187	* Returns the current char value and its lenght
				188	*/
				189
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	190	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	191	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				192	if (ctxt->instate == XML_PARSER_EOF)
				193	return(0);
				194
				195	if (ctxt->token != 0) {
				196	*len = 0;
				197	return(ctxt->token);
				198	}
				199	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				200	/*
				201	* We are supposed to handle UTF8, check it's valid
				202	* From rfc2044: encoding of the Unicode values on UTF-8:
				203	*
				204	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				205	* 0000 0000-0000 007F 0xxxxxxx
				206	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				207	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				208	*
				209	* Check for the 0x110000 limit too
				210	*/
				211	const unsigned char *cur = ctxt->input->cur;
				212	unsigned char c;
				213	unsigned int val;
				214
				215	c = *cur;
				216	if (c & 0x80) {
				217	if (cur[1] == 0)
				218	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				219	if ((cur[1] & 0xc0) != 0x80)
				220	goto encoding_error;
				221	if ((c & 0xe0) == 0xe0) {
				222
				223	if (cur[2] == 0)
				224	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				225	if ((cur[2] & 0xc0) != 0x80)
				226	goto encoding_error;
				227	if ((c & 0xf0) == 0xf0) {
				228	if (cur[3] == 0)
				229	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				230	if (((c & 0xf8) != 0xf0) \|\|
				231	((cur[3] & 0xc0) != 0x80))
				232	goto encoding_error;
				233	/* 4-byte code */
				234	*len = 4;
				235	val = (cur[0] & 0x7) << 18;
				236	val \|= (cur[1] & 0x3f) << 12;
				237	val \|= (cur[2] & 0x3f) << 6;
				238	val \|= cur[3] & 0x3f;
				239	} else {
				240	/* 3-byte code */
				241	*len = 3;
				242	val = (cur[0] & 0xf) << 12;
				243	val \|= (cur[1] & 0x3f) << 6;
				244	val \|= cur[2] & 0x3f;
				245	}
				246	} else {
				247	/* 2-byte code */
				248	*len = 2;
				249	val = (cur[0] & 0x1f) << 6;
				250	val \|= cur[1] & 0x3f;
				251	}
				252	if (!IS_CHAR(val)) {
				253	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				254	if ((ctxt->sax != NULL) &&
				255	(ctxt->sax->error != NULL))
				256	ctxt->sax->error(ctxt->userData,
				257	"Char 0x%X out of allowed range\n", val);
				258	ctxt->wellFormed = 0;
				259	ctxt->disableSAX = 1;
				260	}
				261	return(val);
				262	} else {
				263	/* 1-byte code */
				264	*len = 1;
				265	return((int) *ctxt->input->cur);
				266	}
				267	}
				268	/*
				269	* Assume it's a fixed lenght encoding (1) with
				270	* a compatibke encoding for the ASCII set, since
				271	* XML constructs only use < 128 chars
				272	*/
				273	*len = 1;
				274	if ((int) *ctxt->input->cur < 0x80)
				275	return((int) *ctxt->input->cur);
				276
				277	/*
				278	* Humm this is bad, do an automatic flow conversion
				279	*/
				280	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				281	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				282	return(xmlCurrentChar(ctxt, len));
				283
				284	encoding_error:
				285	/*
				286	* If we detect an UTF8 error that probably mean that the
				287	* input encoding didn't get properly advertized in the
				288	* declaration header. Report the error and switch the encoding
				289	* to ISO-Latin-1 (if you don't like this policy, just declare the
				290	* encoding !)
				291	*/
				292	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				293	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				294	ctxt->sax->error(ctxt->userData,
				295	"Input is not proper UTF-8, indicate encoding !\n");
				296	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				297	ctxt->input->cur[0], ctxt->input->cur[1],
				298	ctxt->input->cur[2], ctxt->input->cur[3]);
				299	}
				300
				301	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				302	*len = 1;
				303	return((int) *ctxt->input->cur);
				304	}
				305
				306	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	307	* htmlSkipBlankChars:
				308	* @ctxt: the HTML parser context
				309	*
				310	* skip all blanks character found at that point in the input streams.
				311	*
				312	* Returns the number of space chars skipped
				313	*/
				314
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	315	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	316	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				317	int res = 0;
				318
				319	while (IS_BLANK(*(ctxt->input->cur))) {
				320	if ((*ctxt->input->cur == 0) &&
				321	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				322	xmlPopInput(ctxt);
				323	} else {
				324	if (*(ctxt->input->cur) == '\n') {
				325	ctxt->input->line++; ctxt->input->col = 1;
				326	} else ctxt->input->col++;
				327	ctxt->input->cur++;
				328	ctxt->nbChars++;
				329	if (*ctxt->input->cur == 0)
				330	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				331	}
				332	res++;
				333	}
				334	return(res);
				335	}
				336
				337
				338
				339	/************************************************************************
				340	* *
				341	* The list of HTML elements and their properties *
				342	* *
				343	************************************************************************/
				344
/*
 * Table of the known HTML elements, one row per element, listed in
 * alphabetical order of the element name. htmlTagLookup() scans it
 * linearly and compares names case-insensitively.
 *
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End, Empty, Depr., DTD, Description
 */
htmlElemDesc html40ElementTable[] = {
{ "a", 0, 0, 0, 0, 0, 0, "anchor " },
{ "abbr", 0, 0, 0, 0, 0, 0, "abbreviated form" },
{ "acronym", 0, 0, 0, 0, 0, 0, "" },
{ "address", 0, 0, 0, 0, 0, 0, "information on author " },
{ "applet", 0, 0, 0, 0, 1, 1, "java applet " },
{ "area", 0, 2, 2, 1, 0, 0, "client-side image map area " },
{ "b", 0, 3, 0, 0, 0, 0, "bold text style" },
{ "base", 0, 2, 2, 1, 0, 0, "document base uri " },
{ "basefont", 0, 2, 2, 1, 1, 1, "base font size " },
{ "bdo", 0, 0, 0, 0, 0, 0, "i18n bidi over-ride " },
{ "big", 0, 3, 0, 0, 0, 0, "large text style" },
{ "blockquote", 0, 0, 0, 0, 0, 0, "long quotation " },
{ "body", 1, 1, 0, 0, 0, 0, "document body " },
{ "br", 0, 2, 2, 1, 0, 0, "forced line break " },
{ "button", 0, 0, 0, 0, 0, 0, "push button " },
{ "caption", 0, 0, 0, 0, 0, 0, "table caption " },
{ "center", 0, 3, 0, 0, 1, 1, "shorthand for div align=center " },
{ "cite", 0, 0, 0, 0, 0, 0, "citation" },
{ "code", 0, 0, 0, 0, 0, 0, "computer code fragment" },
{ "col", 0, 2, 2, 1, 0, 0, "table column " },
{ "colgroup", 0, 1, 0, 0, 0, 0, "table column group " },
{ "dd", 0, 1, 0, 0, 0, 0, "definition description " },
{ "del", 0, 0, 0, 0, 0, 0, "deleted text " },
{ "dfn", 0, 0, 0, 0, 0, 0, "instance definition" },
{ "dir", 0, 0, 0, 0, 1, 1, "directory list" },
{ "div", 0, 0, 0, 0, 0, 0, "generic language/style container"},
{ "dl", 0, 0, 0, 0, 0, 0, "definition list " },
{ "dt", 0, 1, 0, 0, 0, 0, "definition term " },
{ "em", 0, 3, 0, 0, 0, 0, "emphasis" },
{ "fieldset", 0, 0, 0, 0, 0, 0, "form control group " },
{ "font", 0, 3, 0, 0, 1, 1, "local change to font " },
{ "form", 0, 0, 0, 0, 0, 0, "interactive form " },
{ "frame", 0, 2, 2, 1, 0, 2, "subwindow " },
{ "frameset", 0, 0, 0, 0, 0, 2, "window subdivision" },
{ "h1", 0, 0, 0, 0, 0, 0, "heading " },
{ "h2", 0, 0, 0, 0, 0, 0, "heading " },
{ "h3", 0, 0, 0, 0, 0, 0, "heading " },
{ "h4", 0, 0, 0, 0, 0, 0, "heading " },
{ "h5", 0, 0, 0, 0, 0, 0, "heading " },
{ "h6", 0, 0, 0, 0, 0, 0, "heading " },
{ "head", 1, 1, 0, 0, 0, 0, "document head " },
{ "hr", 0, 2, 2, 1, 0, 0, "horizontal rule " },
{ "html", 1, 1, 0, 0, 0, 0, "document root element " },
{ "i", 0, 3, 0, 0, 0, 0, "italic text style" },
{ "iframe", 0, 0, 0, 0, 0, 1, "inline subwindow " },
{ "img", 0, 2, 2, 1, 0, 0, "embedded image " },
{ "input", 0, 2, 2, 1, 0, 0, "form control " },
{ "ins", 0, 0, 0, 0, 0, 0, "inserted text" },
{ "isindex", 0, 2, 2, 1, 1, 1, "single line prompt " },
{ "kbd", 0, 0, 0, 0, 0, 0, "text to be entered by the user" },
{ "label", 0, 0, 0, 0, 0, 0, "form field label text " },
{ "legend", 0, 0, 0, 0, 0, 0, "fieldset legend " },
{ "li", 0, 1, 1, 0, 0, 0, "list item " },
{ "link", 0, 2, 2, 1, 0, 0, "a media-independent link " },
{ "map", 0, 0, 0, 0, 0, 0, "client-side image map " },
{ "menu", 0, 0, 0, 0, 1, 1, "menu list " },
{ "meta", 0, 2, 2, 1, 0, 0, "generic metainformation " },
{ "noframes", 0, 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
{ "noscript", 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
{ "object", 0, 0, 0, 0, 0, 0, "generic embedded object " },
{ "ol", 0, 0, 0, 0, 0, 0, "ordered list " },
{ "optgroup", 0, 0, 0, 0, 0, 0, "option group " },
{ "option", 0, 1, 0, 0, 0, 0, "selectable choice " },
{ "p", 0, 1, 1, 0, 0, 0, "paragraph " },
{ "param", 0, 2, 2, 1, 0, 0, "named property value " },
{ "pre", 0, 0, 0, 0, 0, 0, "preformatted text " },
{ "q", 0, 0, 0, 0, 0, 0, "short inline quotation " },
{ "s", 0, 3, 0, 0, 1, 1, "strike-through text style" },
{ "samp", 0, 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
{ "script", 0, 0, 0, 0, 0, 0, "script statements " },
{ "select", 0, 0, 0, 0, 0, 0, "option selector " },
{ "small", 0, 3, 0, 0, 0, 0, "small text style" },
{ "span", 0, 0, 0, 0, 0, 0, "generic language/style container " },
{ "strike", 0, 3, 0, 0, 1, 1, "strike-through text" },
{ "strong", 0, 3, 0, 0, 0, 0, "strong emphasis" },
{ "style", 0, 0, 0, 0, 0, 0, "style info " },
{ "sub", 0, 3, 0, 0, 0, 0, "subscript" },
{ "sup", 0, 3, 0, 0, 0, 0, "superscript " },
{ "table", 0, 0, 0, 0, 0, 0, " " },
{ "tbody", 1, 0, 0, 0, 0, 0, "table body " },
{ "td", 0, 0, 0, 0, 0, 0, "table data cell" },
{ "textarea", 0, 0, 0, 0, 0, 0, "multi-line text field " },
{ "tfoot", 0, 1, 0, 0, 0, 0, "table footer " },
{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
{ "title", 0, 0, 0, 0, 0, 0, "document title " },
{ "tr", 0, 0, 0, 0, 0, 0, "table row " },
{ "tt", 0, 3, 0, 0, 0, 0, "teletype or monospaced text style" },
{ "u", 0, 3, 0, 0, 1, 1, "underlined text style" },
{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
{ "var", 0, 0, 0, 0, 0, 0, "instance of a variable or program argument" },
};
				449
/*
 * start tags that imply the end of a current element
 * any tag of each line implies the end of the current element if the type of
 * that element is in the same line
 *
 * Layout: every group is terminated by a NULL entry and the whole table is
 * terminated by one additional NULL.
 */
const char *htmlEquEnd[] = {
"dt", "dd", "li", "option", NULL,
"h1", "h2", "h3", "h4", "h5", "h6", NULL,
"ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
NULL
};
/*
 * according to the HTML DTD, HR should be added to the 2nd line above, as it
 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
				466
/*
 * start tags that imply the end of current element
 *
 * Layout: the table is a sequence of NULL-terminated groups followed by a
 * final NULL. The first entry of a group is the name of a start tag; the
 * remaining entries of that group are the elements which get closed when
 * that start tag is met (see htmlCheckAutoClose()). htmlInitAutoClose()
 * records the address of the first entry of every group in
 * htmlStartCloseIndex.
 */
const char *htmlStartClose[] = {
"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
"dl", "ul", "ol", "menu", "dir", "address", "pre",
"listing", "xmp", "head", NULL,
"head", "p", NULL,
"title", "p", NULL,
"body", "head", "style", "link", "title", "p", NULL,
"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
"pre", "listing", "xmp", "head", "li", NULL,
"hr", "p", "head", NULL,
"h1", "p", "head", NULL,
"h2", "p", "head", NULL,
"h3", "p", "head", NULL,
"h4", "p", "head", NULL,
"h5", "p", "head", NULL,
"h6", "p", "head", NULL,
"dir", "p", "head", NULL,
"address", "p", "head", "ul", NULL,
"pre", "p", "head", "ul", NULL,
"listing", "p", "head", NULL,
"xmp", "p", "head", NULL,
"blockquote", "p", "head", NULL,
"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
"xmp", "head", NULL,
"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
"head", "dd", NULL,
"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
"head", "dt", NULL,
"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
"listing", "xmp", NULL,
"ol", "p", "head", "ul", NULL,
"menu", "p", "head", "ul", NULL,
"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div", "p", "head", NULL,
"noscript", "p", "head", NULL,
"center", "font", "b", "i", "p", "head", NULL,
"a", "a", NULL,
"caption", "p", NULL,
"colgroup", "caption", "colgroup", "col", "p", NULL,
"col", "caption", "col", "p", NULL,
"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
"listing", "xmp", "a", NULL,
"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead", "caption", "col", "colgroup", NULL,
"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
"tbody", "p", NULL,
"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
"tfoot", "tbody", "p", NULL,
"optgroup", "option", NULL,
"option", "option", NULL,
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
"pre", "listing", "xmp", "a", NULL,
NULL
};
				526
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 *
 * The list is terminated by a NULL entry.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
"html",
"head",
"body",
NULL
};
				540
				541	/*
				542	* The list of HTML attributes which are of content %Script;
				543	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				544	* it assumes the name starts with 'on'
				545	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	546	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	547	"onclick",
				548	"ondblclick",
				549	"onmousedown",
				550	"onmouseup",
				551	"onmouseover",
				552	"onmousemove",
				553	"onmouseout",
				554	"onkeypress",
				555	"onkeydown",
				556	"onkeyup",
				557	"onload",
				558	"onunload",
				559	"onfocus",
				560	"onblur",
				561	"onsubmit",
				562	"onrest",
				563	"onchange",
				564	"onselect"
				565	};
				566
/*
 * end tags that imply the end of the inside elements
 *
 * NULL-terminated list. htmlAutoCloseOnClose() checks the end tag it is
 * given against this list; for a listed tag, open elements above it on the
 * stack are closed even when their own end tag is neither omissible nor
 * stylistic.
 */
const char *htmlEndClose[] = {
"head",
"body",
"html",
NULL
};
				576
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	577
/*
 * Lookup index over htmlStartClose: each used slot points to the first
 * entry of one NULL-terminated group; unused slots are NULL. Filled once by
 * htmlInitAutoClose(), which also sets the flag below to 1.
 */
static const char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
				580
				581	/************************************************************************
				582	* *
				583	* functions to handle HTML specific data *
				584	* *
				585	************************************************************************/
				586
				587	/**
				588	* htmlInitAutoClose:
				589	*
				590	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				591	* This is not reentrant. Call xmlInitParser() once before processing in
				592	* case of use in multithreaded programs.
				593	*/
				594	void
				595	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	596	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	597
				598	if (htmlStartCloseIndexinitialized) return;
				599
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	600	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				601	indx = 0;
				602	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				603	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	604	while (htmlStartClose[i] != NULL) i++;
				605	i++;
				606	}
				607	htmlStartCloseIndexinitialized = 1;
				608	}
				609
				610	/**
				611	* htmlTagLookup:
				612	* @tag: The tag name in lowercase
				613	*
				614	* Lookup the HTML tag in the ElementTable
				615	*
				616	* Returns the related htmlElemDescPtr or NULL if not found.
				617	*/
				618	htmlElemDescPtr
				619	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	620	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	621
				622	for (i = 0; i < (sizeof(html40ElementTable) /
				623	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	1ed3f88	2001-04-18 09:45:35 +0000	[diff] [blame]	624	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	625	return(&html40ElementTable[i]);
				626	}
				627	return(NULL);
				628	}
				629
				630	/**
				631	* htmlCheckAutoClose:
				632	* @newtag: The new tag name
				633	* @oldtag: The old tag name
				634	*
				635	* Checks wether the new tag is one of the registered valid tags for closing old.
				636	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				637	*
				638	* Returns 0 if no, 1 if yes.
				639	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	640	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	641	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	642	int i, indx;
				643	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	644
				645	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				646
				647	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	648	for (indx = 0; indx < 100;indx++) {
				649	closed = htmlStartCloseIndex[indx];
				650	if (closed == NULL) return(0);
				651	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	652	}
				653
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	654	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	655	i++;
				656	while (htmlStartClose[i] != NULL) {
				657	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				658	return(1);
				659	}
				660	i++;
				661	}
				662	return(0);
				663	}
				664
				665	/**
				666	* htmlAutoCloseOnClose:
				667	* @ctxt: an HTML parser context
				668	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	669	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	670	*
				671	* The HTmL DtD allows an ending tag to implicitely close other tags.
				672	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	673	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	674	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				675	htmlElemDescPtr info;
				676	xmlChar *oldname;
Daniel Veillard	a2bc368	2001-05-03 08:27:20 +0000	[diff] [blame]	677	int i, endCloses = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	678
				679	#ifdef DEBUG
				680	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				681	for (i = 0;i < ctxt->nameNr;i++)
				682	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				683	#endif
				684
				685	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				686	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
				687	}
				688	if (i < 0) return;
Daniel Veillard	a2bc368	2001-05-03 08:27:20 +0000	[diff] [blame]	689	for (i = 0; (htmlEndClose[i] != NULL);i++)
				690	if (xmlStrEqual(newtag, (const xmlChar *) htmlEndClose[i])) {
				691	endCloses = 1;
				692	break;
				693	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	694
				695	while (!xmlStrEqual(newtag, ctxt->name)) {
				696	info = htmlTagLookup(ctxt->name);
				697	if ((info == NULL) \|\| (info->endTag == 1)) {
				698	#ifdef DEBUG
				699	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				700	#endif
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	701	} else if (info->endTag == 3) {
				702	#ifdef DEBUG
				703	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				704	#endif
				705	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				706	ctxt->sax->error(ctxt->userData,
				707	"Opening and ending tag mismatch: %s and %s\n",
				708	newtag, ctxt->name);
				709	ctxt->wellFormed = 0;
Daniel Veillard	a2bc368	2001-05-03 08:27:20 +0000	[diff] [blame]	710	} else if (endCloses == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	711	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	712	}
				713	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				714	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				715	oldname = htmlnamePop(ctxt);
				716	if (oldname != NULL) {
				717	#ifdef DEBUG
				718	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				719	#endif
				720	xmlFree(oldname);
				721	}
				722	}
				723	}
				724
				725	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	726	* htmlAutoCloseOnEnd:
				727	* @ctxt: an HTML parser context
				728	*
				729	* Close all remaining tags at the end of the stream
				730	*/
				731	static void
				732	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				733	xmlChar *oldname;
				734	int i;
				735
				736	if (ctxt->nameNr == 0)
				737	return;
				738	#ifdef DEBUG
				739	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				740	#endif
				741
				742	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				743	#ifdef DEBUG
				744	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				745	#endif
				746	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				747	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				748	oldname = htmlnamePop(ctxt);
				749	if (oldname != NULL) {
				750	#ifdef DEBUG
				751	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				752	#endif
				753	xmlFree(oldname);
				754	}
				755	}
				756	}
				757
				758	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	759	* htmlAutoClose:
				760	* @ctxt: an HTML parser context
				761	* @newtag: The new tag name or NULL
				762	*
				763	* The HTmL DtD allows a tag to implicitely close other tags.
				764	* The list is kept in htmlStartClose array. This function is
				765	* called when a new tag has been detected and generates the
				766	* appropriates closes if possible/needed.
				767	* If newtag is NULL this mean we are at the end of the resource
				768	* and we should check
				769	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	770	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	771	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				772	xmlChar *oldname;
				773	while ((newtag != NULL) && (ctxt->name != NULL) &&
				774	(htmlCheckAutoClose(newtag, ctxt->name))) {
				775	#ifdef DEBUG
				776	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				777	#endif
				778	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				779	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				780	oldname = htmlnamePop(ctxt);
				781	if (oldname != NULL) {
				782	#ifdef DEBUG
				783	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				784	#endif
				785	xmlFree(oldname);
				786	}
				787	}
				788	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	789	htmlAutoCloseOnEnd(ctxt);
				790	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	791	}
				792	while ((newtag == NULL) && (ctxt->name != NULL) &&
				793	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				794	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				795	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				796	#ifdef DEBUG
				797	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				798	#endif
				799	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				800	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				801	oldname = htmlnamePop(ctxt);
				802	if (oldname != NULL) {
				803	#ifdef DEBUG
				804	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				805	#endif
				806	xmlFree(oldname);
				807	}
				808	}
				809
				810	}
				811
				812	/**
				813	* htmlAutoCloseTag:
				814	* @doc: the HTML document
				815	* @name: The tag name
				816	* @elem: the HTML element
				817	*
				818	* The HTmL DtD allows a tag to implicitely close other tags.
				819	* The list is kept in htmlStartClose array. This function checks
				820	* if the element or one of it's children would autoclose the
				821	* given tag.
				822	*
				823	* Returns 1 if autoclose, 0 otherwise
				824	*/
				825	int
				826	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				827	htmlNodePtr child;
				828
				829	if (elem == NULL) return(1);
				830	if (xmlStrEqual(name, elem->name)) return(0);
				831	if (htmlCheckAutoClose(elem->name, name)) return(1);
				832	child = elem->children;
				833	while (child != NULL) {
				834	if (htmlAutoCloseTag(doc, name, child)) return(1);
				835	child = child->next;
				836	}
				837	return(0);
				838	}
				839
				840	/**
				841	* htmlIsAutoClosed:
				842	* @doc: the HTML document
				843	* @elem: the HTML element
				844	*
				845	* The HTmL DtD allows a tag to implicitely close other tags.
				846	* The list is kept in htmlStartClose array. This function checks
				847	* if a tag is autoclosed by one of it's child
				848	*
				849	* Returns 1 if autoclosed, 0 otherwise
				850	*/
				851	int
				852	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				853	htmlNodePtr child;
				854
				855	if (elem == NULL) return(1);
				856	child = elem->children;
				857	while (child != NULL) {
				858	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				859	child = child->next;
				860	}
				861	return(0);
				862	}
				863
				864	/**
				865	* htmlCheckImplied:
				866	* @ctxt: an HTML parser context
				867	* @newtag: The new tag name
				868	*
				869	* The HTML DtD allows a tag to exists only implicitely
				870	* called when a new tag has been detected and generates the
				871	* appropriates implicit tags if missing
				872	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	873	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	874	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				875	if (!htmlOmittedDefaultValue)
				876	return;
				877	if (xmlStrEqual(newtag, BAD_CAST"html"))
				878	return;
				879	if (ctxt->nameNr <= 0) {
				880	#ifdef DEBUG
				881	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				882	#endif
				883	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				884	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				885	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				886	}
				887	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				888	return;
				889	if ((ctxt->nameNr <= 1) &&
				890	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				891	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				892	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				893	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				894	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				895	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				896	/*
				897	* dropped OBJECT ... i you put it first BODY will be
				898	* assumed !
				899	*/
				900	#ifdef DEBUG
				901	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				902	#endif
				903	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				904	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				905	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				906	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				907	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				908	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				909	int i;
				910	for (i = 0;i < ctxt->nameNr;i++) {
				911	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				912	return;
				913	}
				914	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				915	return;
				916	}
				917	}
				918
				919	#ifdef DEBUG
				920	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				921	#endif
				922	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				923	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				924	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				925	}
				926	}
				927
				928	/**
				929	* htmlCheckParagraph
				930	* @ctxt: an HTML parser context
				931	*
				932	* Check whether a p element need to be implied before inserting
				933	* characters in the current element.
				934	*
				935	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				936	* in case of error.
				937	*/
				938
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	939	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	940	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				941	const xmlChar *tag;
				942	int i;
				943
				944	if (ctxt == NULL)
				945	return(-1);
				946	tag = ctxt->name;
				947	if (tag == NULL) {
				948	htmlAutoClose(ctxt, BAD_CAST"p");
				949	htmlCheckImplied(ctxt, BAD_CAST"p");
				950	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				951	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				952	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				953	return(1);
				954	}
				955	if (!htmlOmittedDefaultValue)
				956	return(0);
				957	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				958	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				959	#ifdef DEBUG
				960	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				961	#endif
				962	htmlAutoClose(ctxt, BAD_CAST"p");
				963	htmlCheckImplied(ctxt, BAD_CAST"p");
				964	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				965	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				966	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				967	return(1);
				968	}
				969	}
				970	return(0);
				971	}
				972
				973	/**
				974	* htmlIsScriptAttribute:
				975	* @name: an attribute name
				976	*
				977	* Check if an attribute is of content type Script
				978	*
				979	* Returns 1 is the attribute is a script 0 otherwise
				980	*/
				981	int
				982	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	983	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	984
				985	if (name == NULL)
				986	return(0);
				987	/*
				988	* all script attributes start with 'on'
				989	*/
				990	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				991	return(0);
				992	for (i = 0;
				993	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				994	i++) {
				995	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				996	return(1);
				997	}
				998	return(0);
				999	}
				1000
				1001	/************************************************************************
				1002	* *
				1003	* The list of HTML predefined entities *
				1004	* *
				1005	************************************************************************/
				1006
				1007
				1008	htmlEntityDesc html40EntitiesTable[] = {
				1009	/*
				1010	* the 4 absolute ones, plus apostrophe.
				1011	*/
				1012	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1013	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1014	{ 39, "apos", "single quote" },
				1015	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1016	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1017
				1018	/*
				1019	* A bunch still in the 128-255 range
				1020	* Replacing them depend really on the charset used.
				1021	*/
				1022	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1023	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1024	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1025	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1026	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1027	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1028	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1029	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1030	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1031	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1032	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1033	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1034	{ 172, "not", "not sign, U+00AC ISOnum" },
				1035	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1036	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1037	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1038	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1039	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1040	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1041	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1042	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1043	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1044	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1045	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1046	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1047	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1048	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1049	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1050	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1051	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1052	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1053	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1054	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1055	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1056	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1057	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1058	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1059	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1060	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1061	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1062	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1063	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1064	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1065	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1066	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1067	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1068	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1069	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1070	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1071	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1072	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1073	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1074	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1075	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1076	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1077	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1078	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1079	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1080	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1081	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1082	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1083	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1084	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1085	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1086	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1087	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1088	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1089	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1090	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1091	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1092	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1093	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1094	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1095	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1096	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1097	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1098	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1099	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1100	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1101	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1102	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1103	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1104	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1105	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1106	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1107	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1108	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1109	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1110	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1111	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1112	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1113	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1114	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1115	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1116	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1117	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1118
				1119	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1120	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1121	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1122	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1123	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1124
				1125	/*
				1126	* Anything below should really be kept as entities references
				1127	*/
				1128	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1129
				1130	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1131	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1132
				1133	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1134	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1135	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1136	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1137	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1138	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1139	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1140	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1141	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1142	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1143	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1144	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1145	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1146	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1147	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1148	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1149	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1150	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1151	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1152	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1153	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1154	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1155	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1156	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1157
				1158	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1159	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1160	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1161	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1162	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1163	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1164	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1165	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1166	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1167	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1168	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1169	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1170	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1171	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1172	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1173	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1174	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1175	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1176	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1177	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1178	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1179	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1180	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1181	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1182	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1183	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1184	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1185	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1186
				1187	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1188	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1189	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1190	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1191	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1192	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1193	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1194	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1195	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1196	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1197	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1198	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1199	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1200	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1201	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1202	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1203	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1204
				1205	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1206	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1207
				1208	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1209
				1210	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1211	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1212
				1213	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1214	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1215
				1216	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1217	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1218
				1219	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1220
				1221	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1222	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1223	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1224	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1225	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1226	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1227	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1228	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1229	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1230	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1231	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1232	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1233	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1234	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1235	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1236	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1237
				1238	{ 8704, "forall","for all, U+2200 ISOtech" },
				1239	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1240	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1241	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1242	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1243	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1244	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1245	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1246	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1247	{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
				1248	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1249	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1250	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1251	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1252	{ 8734, "infin","infinity, U+221E ISOtech" },
				1253	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1254	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1255	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1256	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1257	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1258	{ 8747, "int", "integral, U+222B ISOtech" },
				1259	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1260	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1261	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1262	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1263	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1264	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1265	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1266	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1267	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1268	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1269	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1270	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1271	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1272	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1273	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1274	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1275	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1276	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1277	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1278	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1279	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1280	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1281	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1282	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1283
				1284	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1285	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1286	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1287	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1288
				1289	};
				1290
				1291	/************************************************************************
				1292	* *
				1293	* Commodity functions to handle entities *
				1294	* *
				1295	************************************************************************/
				1296
				1297	/*
				1298	* Macro used to grow the current buffer.
				1299	*/
				1300	#define growBuffer(buffer) { \
				1301	buffer##_size *= 2; \
				1302	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1303	if (buffer == NULL) { \
				1304	perror("realloc failed"); \
				1305	return(NULL); \
				1306	} \
				1307	}
				1308
				1309	/**
				1310	* htmlEntityLookup:
				1311	* @name: the entity name
				1312	*
				1313	* Lookup the given entity in EntitiesTable
				1314	*
				1315	* TODO: the linear scan is really ugly, an hash table is really needed.
				1316	*
				1317	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1318	*/
				1319	htmlEntityDescPtr
				1320	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1321	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1322
				1323	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1324	sizeof(html40EntitiesTable[0]));i++) {
				1325	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1326	#ifdef DEBUG
				1327	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1328	#endif
				1329	return(&html40EntitiesTable[i]);
				1330	}
				1331	}
				1332	return(NULL);
				1333	}
				1334
				1335	/**
				1336	* htmlEntityValueLookup:
				1337	* @value: the entity's unicode value
				1338	*
				1339	* Lookup the given entity in EntitiesTable
				1340	*
				1341	* TODO: the linear scan is really ugly, an hash table is really needed.
				1342	*
				1343	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1344	*/
				1345	htmlEntityDescPtr
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1346	htmlEntityValueLookup(unsigned int value) {
				1347	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1348	#ifdef DEBUG
				1349	int lv = 0;
				1350	#endif
				1351
				1352	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1353	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1354	if (html40EntitiesTable[i].value >= value) {
				1355	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1356	break;
				1357	#ifdef DEBUG
				1358	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1359	#endif
				1360	return(&html40EntitiesTable[i]);
				1361	}
				1362	#ifdef DEBUG
				1363	if (lv > html40EntitiesTable[i].value) {
				1364	xmlGenericError(xmlGenericErrorContext,
				1365	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1366	lv, html40EntitiesTable[i].value);
				1367	}
				1368	lv = html40EntitiesTable[i].value;
				1369	#endif
				1370	}
				1371	return(NULL);
				1372	}
				1373
				1374	/**
				1375	* UTF8ToHtml:
				1376	* @out: a pointer to an array of bytes to store the result
				1377	* @outlen: the length of @out
				1378	* @in: a pointer to an array of UTF-8 chars
				1379	* @inlen: the length of @in
				1380	*
				1381	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1382	* plus HTML entities block of chars out.
				1383	*
				1384	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1385	* The value of @inlen after return is the number of octets consumed
				1386	* as the return value is positive, else unpredictiable.
				1387	* The value of @outlen after return is the number of octets consumed.
				1388	*/
				1389	int
				1390	UTF8ToHtml(unsigned char* out, int *outlen,
				1391	const unsigned char* in, int *inlen) {
				1392	const unsigned char* processed = in;
				1393	const unsigned char* outend;
				1394	const unsigned char* outstart = out;
				1395	const unsigned char* instart = in;
				1396	const unsigned char* inend;
				1397	unsigned int c, d;
				1398	int trailing;
				1399
				1400	if (in == NULL) {
				1401	/*
				1402	* initialization nothing to do
				1403	*/
				1404	*outlen = 0;
				1405	*inlen = 0;
				1406	return(0);
				1407	}
				1408	inend = in + (*inlen);
				1409	outend = out + (*outlen);
				1410	while (in < inend) {
				1411	d = *in++;
				1412	if (d < 0x80) { c= d; trailing= 0; }
				1413	else if (d < 0xC0) {
				1414	/* trailing byte in leading position */
				1415	*outlen = out - outstart;
				1416	*inlen = processed - instart;
				1417	return(-2);
				1418	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1419	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1420	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1421	else {
				1422	/* no chance for this in Ascii */
				1423	*outlen = out - outstart;
				1424	*inlen = processed - instart;
				1425	return(-2);
				1426	}
				1427
				1428	if (inend - in < trailing) {
				1429	break;
				1430	}
				1431
				1432	for ( ; trailing; trailing--) {
				1433	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1434	break;
				1435	c <<= 6;
				1436	c \|= d & 0x3F;
				1437	}
				1438
				1439	/* assertion: c is a single UTF-4 value */
				1440	if (c < 0x80) {
				1441	if (out + 1 >= outend)
				1442	break;
				1443	*out++ = c;
				1444	} else {
				1445	int len;
				1446	htmlEntityDescPtr ent;
				1447
				1448	/*
				1449	* Try to lookup a predefined HTML entity for it
				1450	*/
				1451
				1452	ent = htmlEntityValueLookup(c);
				1453	if (ent == NULL) {
				1454	/* no chance for this in Ascii */
				1455	*outlen = out - outstart;
				1456	*inlen = processed - instart;
				1457	return(-2);
				1458	}
				1459	len = strlen(ent->name);
				1460	if (out + 2 + len >= outend)
				1461	break;
				1462	*out++ = '&';
				1463	memcpy(out, ent->name, len);
				1464	out += len;
				1465	*out++ = ';';
				1466	}
				1467	processed = in;
				1468	}
				1469	*outlen = out - outstart;
				1470	*inlen = processed - instart;
				1471	return(0);
				1472	}
				1473
				1474	/**
				1475	* htmlEncodeEntities:
				1476	* @out: a pointer to an array of bytes to store the result
				1477	* @outlen: the length of @out
				1478	* @in: a pointer to an array of UTF-8 chars
				1479	* @inlen: the length of @in
				1480	* @quoteChar: the quote character to escape (' or ") or zero.
				1481	*
				1482	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1483	* plus HTML entities block of chars out.
				1484	*
				1485	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1486	* The value of @inlen after return is the number of octets consumed
				1487	* as the return value is positive, else unpredictiable.
				1488	* The value of @outlen after return is the number of octets consumed.
				1489	*/
				1490	int
				1491	htmlEncodeEntities(unsigned char* out, int *outlen,
				1492	const unsigned char* in, int *inlen, int quoteChar) {
				1493	const unsigned char* processed = in;
				1494	const unsigned char* outend = out + (*outlen);
				1495	const unsigned char* outstart = out;
				1496	const unsigned char* instart = in;
				1497	const unsigned char* inend = in + (*inlen);
				1498	unsigned int c, d;
				1499	int trailing;
				1500
				1501	while (in < inend) {
				1502	d = *in++;
				1503	if (d < 0x80) { c= d; trailing= 0; }
				1504	else if (d < 0xC0) {
				1505	/* trailing byte in leading position */
				1506	*outlen = out - outstart;
				1507	*inlen = processed - instart;
				1508	return(-2);
				1509	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1510	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1511	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1512	else {
				1513	/* no chance for this in Ascii */
				1514	*outlen = out - outstart;
				1515	*inlen = processed - instart;
				1516	return(-2);
				1517	}
				1518
				1519	if (inend - in < trailing)
				1520	break;
				1521
				1522	while (trailing--) {
				1523	if (((d= *in++) & 0xC0) != 0x80) {
				1524	*outlen = out - outstart;
				1525	*inlen = processed - instart;
				1526	return(-2);
				1527	}
				1528	c <<= 6;
				1529	c \|= d & 0x3F;
				1530	}
				1531
				1532	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1533	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1534	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1535	if (out >= outend)
				1536	break;
				1537	*out++ = c;
				1538	} else {
				1539	htmlEntityDescPtr ent;
				1540	const char *cp;
				1541	char nbuf[16];
				1542	int len;
				1543
				1544	/*
				1545	* Try to lookup a predefined HTML entity for it
				1546	*/
				1547	ent = htmlEntityValueLookup(c);
				1548	if (ent == NULL) {
				1549	sprintf(nbuf, "#%u", c);
				1550	cp = nbuf;
				1551	}
				1552	else
				1553	cp = ent->name;
				1554	len = strlen(cp);
				1555	if (out + 2 + len > outend)
				1556	break;
				1557	*out++ = '&';
				1558	memcpy(out, cp, len);
				1559	out += len;
				1560	*out++ = ';';
				1561	}
				1562	processed = in;
				1563	}
				1564	*outlen = out - outstart;
				1565	*inlen = processed - instart;
				1566	return(0);
				1567	}
				1568
				1569	/**
				1570	* htmlDecodeEntities:
				1571	* @ctxt: the parser context
				1572	* @len: the len to decode (in bytes !), -1 for no size limit
				1573	* @end: an end marker xmlChar, 0 if none
				1574	* @end2: an end marker xmlChar, 0 if none
				1575	* @end3: an end marker xmlChar, 0 if none
				1576	*
				1577	* Subtitute the HTML entities by their value
				1578	*
				1579	* DEPRECATED !!!!
				1580	*
				1581	* Returns A newly allocated string with the substitution done. The caller
				1582	* must deallocate it !
				1583	*/
				1584	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1585	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1586	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1587	static int deprecated = 0;
				1588	if (!deprecated) {
				1589	xmlGenericError(xmlGenericErrorContext,
				1590	"htmlDecodeEntities() deprecated function reached\n");
				1591	deprecated = 1;
				1592	}
				1593	return(NULL);
				1594	#if 0
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1595	xmlChar *name = NULL;
				1596	xmlChar *buffer = NULL;
				1597	unsigned int buffer_size = 0;
				1598	unsigned int nbchars = 0;
				1599	htmlEntityDescPtr ent;
				1600	unsigned int max = (unsigned int) len;
				1601	int c,l;
				1602
				1603	if (ctxt->depth > 40) {
				1604	ctxt->errNo = XML_ERR_ENTITY_LOOP;
				1605	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1606	ctxt->sax->error(ctxt->userData,
				1607	"Detected entity reference loop\n");
				1608	ctxt->wellFormed = 0;
				1609	ctxt->disableSAX = 1;
				1610	return(NULL);
				1611	}
				1612
				1613	/*
				1614	* allocate a translation buffer.
				1615	*/
				1616	buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
				1617	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1618	if (buffer == NULL) {
				1619	perror("xmlDecodeEntities: malloc failed");
				1620	return(NULL);
				1621	}
				1622
				1623	/*
				1624	* Ok loop until we reach one of the ending char or a size limit.
				1625	*/
				1626	c = CUR_CHAR(l);
				1627	while ((nbchars < max) && (c != end) &&
				1628	(c != end2) && (c != end3)) {
				1629
				1630	if (c == 0) break;
				1631	if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
				1632	int val = htmlParseCharRef(ctxt);
				1633	COPY_BUF(0,buffer,nbchars,val);
				1634	NEXTL(l);
				1635	} else if ((c == '&') && (ctxt->token != '&')) {
				1636	ent = htmlParseEntityRef(ctxt, &name);
				1637	if (name != NULL) {
				1638	if (ent != NULL) {
				1639	int val = ent->value;
				1640	COPY_BUF(0,buffer,nbchars,val);
				1641	NEXTL(l);
				1642	} else {
				1643	const xmlChar *cur = name;
				1644
				1645	buffer[nbchars++] = '&';
				1646	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1647	growBuffer(buffer);
				1648	}
				1649	while (*cur != 0) {
				1650	buffer[nbchars++] = *cur++;
				1651	}
				1652	buffer[nbchars++] = ';';
				1653	}
				1654	}
				1655	} else {
				1656	COPY_BUF(l,buffer,nbchars,c);
				1657	NEXTL(l);
				1658	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1659	growBuffer(buffer);
				1660	}
				1661	}
				1662	c = CUR_CHAR(l);
				1663	}
				1664	buffer[nbchars++] = 0;
				1665	return(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1666	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1667	}
				1668
				1669	/************************************************************************
				1670	* *
				1671	* Commodity functions to handle streams *
				1672	* *
				1673	************************************************************************/
				1674
				1675	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1676	* htmlNewInputStream:
				1677	* @ctxt: an HTML parser context
				1678	*
				1679	* Create a new input stream structure
				1680	* Returns the new input stream or NULL
				1681	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1682	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1683	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1684	htmlParserInputPtr input;
				1685
				1686	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1687	if (input == NULL) {
				1688	ctxt->errNo = XML_ERR_NO_MEMORY;
				1689	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1690	ctxt->sax->error(ctxt->userData,
				1691	"malloc: couldn't allocate a new input stream\n");
				1692	return(NULL);
				1693	}
				1694	memset(input, 0, sizeof(htmlParserInput));
				1695	input->filename = NULL;
				1696	input->directory = NULL;
				1697	input->base = NULL;
				1698	input->cur = NULL;
				1699	input->buf = NULL;
				1700	input->line = 1;
				1701	input->col = 1;
				1702	input->buf = NULL;
				1703	input->free = NULL;
				1704	input->version = NULL;
				1705	input->consumed = 0;
				1706	input->length = 0;
				1707	return(input);
				1708	}
				1709
				1710
				1711	/************************************************************************
				1712	* *
				1713	* Commodity functions, cleanup needed ? *
				1714	* *
				1715	************************************************************************/
				1716
				1717	/**
				1718	* areBlanks:
				1719	* @ctxt: an HTML parser context
				1720	* @str: a xmlChar *
				1721	* @len: the size of @str
				1722	*
				1723	* Is this a sequence of blank chars that one can ignore ?
				1724	*
				1725	* Returns 1 if ignorable 0 otherwise.
				1726	*/
				1727
				1728	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				1729	int i;
				1730	xmlNodePtr lastChild;
				1731
				1732	for (i = 0;i < len;i++)
				1733	if (!(IS_BLANK(str[i]))) return(0);
				1734
				1735	if (CUR == 0) return(1);
				1736	if (CUR != '<') return(0);
				1737	if (ctxt->name == NULL)
				1738	return(1);
				1739	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1740	return(1);
				1741	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1742	return(1);
				1743	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1744	return(1);
				1745	if (ctxt->node == NULL) return(0);
				1746	lastChild = xmlGetLastChild(ctxt->node);
				1747	if (lastChild == NULL) {
				1748	if (ctxt->node->content != NULL) return(0);
				1749	} else if (xmlNodeIsText(lastChild)) {
				1750	return(0);
				1751	} else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
				1752	return(0);
				1753	} else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
				1754	return(0);
				1755	} else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
				1756	return(0);
				1757	}
				1758	return(1);
				1759	}
				1760
				1761	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1762	* htmlNewDocNoDtD:
				1763	* @URI: URI for the dtd, or NULL
				1764	* @ExternalID: the external ID of the DTD, or NULL
				1765	*
				1766	* Returns a new document, do not intialize the DTD if not provided
				1767	*/
				1768	htmlDocPtr
				1769	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1770	xmlDocPtr cur;
				1771
				1772	/*
				1773	* Allocate a new document and fill the fields.
				1774	*/
				1775	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1776	if (cur == NULL) {
				1777	xmlGenericError(xmlGenericErrorContext,
				1778	"xmlNewDoc : malloc failed\n");
				1779	return(NULL);
				1780	}
				1781	memset(cur, 0, sizeof(xmlDoc));
				1782
				1783	cur->type = XML_HTML_DOCUMENT_NODE;
				1784	cur->version = NULL;
				1785	cur->intSubset = NULL;
				1786	if ((ExternalID != NULL) \|\|
				1787	(URI != NULL))
				1788	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
				1789	cur->doc = cur;
				1790	cur->name = NULL;
				1791	cur->children = NULL;
				1792	cur->extSubset = NULL;
				1793	cur->oldNs = NULL;
				1794	cur->encoding = NULL;
				1795	cur->standalone = 1;
				1796	cur->compression = 0;
				1797	cur->ids = NULL;
				1798	cur->refs = NULL;
				1799	#ifndef XML_WITHOUT_CORBA
				1800	cur->_private = NULL;
				1801	#endif
				1802	return(cur);
				1803	}
				1804
				1805	/**
				1806	* htmlNewDoc:
				1807	* @URI: URI for the dtd, or NULL
				1808	* @ExternalID: the external ID of the DTD, or NULL
				1809	*
				1810	* Returns a new document
				1811	*/
				1812	htmlDocPtr
				1813	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1814	if ((URI == NULL) && (ExternalID == NULL))
				1815	return(htmlNewDocNoDtD(
Daniel Veillard	6426935	2001-05-04 17:52:34 +0000	[diff] [blame^]	1816	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				1817	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1818
				1819	return(htmlNewDocNoDtD(URI, ExternalID));
				1820	}
				1821
				1822
				1823	/************************************************************************
				1824	* *
				1825	* The parser itself *
				1826	* Relates to http://www.w3.org/TR/html40 *
				1827	* *
				1828	************************************************************************/
				1829
				1830	/************************************************************************
				1831	* *
				1832	* The parser itself *
				1833	* *
				1834	************************************************************************/
				1835
				1836	/**
				1837	* htmlParseHTMLName:
				1838	* @ctxt: an HTML parser context
				1839	*
				1840	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1841	* since HTML names are not case-sensitive.
				1842	*
				1843	* Returns the Tag Name parsed or NULL
				1844	*/
				1845
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1846	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1847	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1848	xmlChar *ret = NULL;
				1849	int i = 0;
				1850	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1851
				1852	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1853	(CUR != ':')) return(NULL);
				1854
				1855	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1856	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1857	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1858	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1859	else loc[i] = CUR;
				1860	i++;
				1861
				1862	NEXT;
				1863	}
				1864
				1865	ret = xmlStrndup(loc, i);
				1866
				1867	return(ret);
				1868	}
				1869
				1870	/**
				1871	* htmlParseName:
				1872	* @ctxt: an HTML parser context
				1873	*
				1874	* parse an HTML name, this routine is case sensistive.
				1875	*
				1876	* Returns the Name parsed or NULL
				1877	*/
				1878
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1879	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1880	htmlParseName(htmlParserCtxtPtr ctxt) {
				1881	xmlChar buf[HTML_MAX_NAMELEN];
				1882	int len = 0;
				1883
				1884	GROW;
				1885	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1886	return(NULL);
				1887	}
				1888
				1889	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1890	(CUR == '.') \|\| (CUR == '-') \|\|
				1891	(CUR == '_') \|\| (CUR == ':') \|\|
				1892	(IS_COMBINING(CUR)) \|\|
				1893	(IS_EXTENDER(CUR))) {
				1894	buf[len++] = CUR;
				1895	NEXT;
				1896	if (len >= HTML_MAX_NAMELEN) {
				1897	xmlGenericError(xmlGenericErrorContext,
				1898	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1899	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1900	(CUR == '.') \|\| (CUR == '-') \|\|
				1901	(CUR == '_') \|\| (CUR == ':') \|\|
				1902	(IS_COMBINING(CUR)) \|\|
				1903	(IS_EXTENDER(CUR)))
				1904	NEXT;
				1905	break;
				1906	}
				1907	}
				1908	return(xmlStrndup(buf, len));
				1909	}
				1910
				1911	/**
				1912	* htmlParseHTMLAttribute:
				1913	* @ctxt: an HTML parser context
				1914	* @stop: a char stop value
				1915	*
				1916	* parse an HTML attribute value till the stop (quote), if
				1917	* stop is 0 then it stops at the first space
				1918	*
				1919	* Returns the attribute parsed or NULL
				1920	*/
				1921
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1922	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1923	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1924	xmlChar *buffer = NULL;
				1925	int buffer_size = 0;
				1926	xmlChar *out = NULL;
				1927	xmlChar *name = NULL;
				1928
				1929	xmlChar *cur = NULL;
				1930	htmlEntityDescPtr ent;
				1931
				1932	/*
				1933	* allocate a translation buffer.
				1934	*/
				1935	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1936	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1937	if (buffer == NULL) {
				1938	perror("htmlParseHTMLAttribute: malloc failed");
				1939	return(NULL);
				1940	}
				1941	out = buffer;
				1942
				1943	/*
				1944	* Ok loop until we reach one of the ending chars
				1945	*/
				1946	while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
				1947	if ((stop == 0) && (IS_BLANK(CUR))) break;
				1948	if (CUR == '&') {
				1949	if (NXT(1) == '#') {
				1950	unsigned int c;
				1951	int bits;
				1952
				1953	c = htmlParseCharRef(ctxt);
				1954	if (c < 0x80)
				1955	{ *out++ = c; bits= -6; }
				1956	else if (c < 0x800)
				1957	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1958	else if (c < 0x10000)
				1959	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1960	else
				1961	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1962
				1963	for ( ; bits >= 0; bits-= 6) {
				1964	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				1965	}
				1966	} else {
				1967	ent = htmlParseEntityRef(ctxt, &name);
				1968	if (name == NULL) {
				1969	*out++ = '&';
				1970	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1971	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1972
				1973	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1974	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1975	}
				1976	} else if (ent == NULL) {
				1977	*out++ = '&';
				1978	cur = name;
				1979	while (*cur != 0) {
				1980	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1981	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1982
				1983	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1984	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1985	}
				1986	out++ = cur++;
				1987	}
				1988	xmlFree(name);
				1989	} else {
				1990	unsigned int c;
				1991	int bits;
				1992
				1993	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1994	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1995
				1996	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1997	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1998	}
				1999	c = (xmlChar)ent->value;
				2000	if (c < 0x80)
				2001	{ *out++ = c; bits= -6; }
				2002	else if (c < 0x800)
				2003	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2004	else if (c < 0x10000)
				2005	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2006	else
				2007	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2008
				2009	for ( ; bits >= 0; bits-= 6) {
				2010	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2011	}
				2012	xmlFree(name);
				2013	}
				2014	}
				2015	} else {
				2016	unsigned int c;
				2017	int bits, l;
				2018
				2019	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2020	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2021
				2022	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2023	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2024	}
				2025	c = CUR_CHAR(l);
				2026	if (c < 0x80)
				2027	{ *out++ = c; bits= -6; }
				2028	else if (c < 0x800)
				2029	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2030	else if (c < 0x10000)
				2031	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2032	else
				2033	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2034
				2035	for ( ; bits >= 0; bits-= 6) {
				2036	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2037	}
				2038	NEXT;
				2039	}
				2040	}
				2041	*out++ = 0;
				2042	return(buffer);
				2043	}
				2044
				2045	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2046	* htmlParseEntityRef:
				2047	* @ctxt: an HTML parser context
				2048	* @str: location to store the entity name
				2049	*
				2050	* parse an HTML ENTITY references
				2051	*
				2052	* [68] EntityRef ::= '&' Name ';'
				2053	*
				2054	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2055	* if non-NULL *str will have to be freed by the caller.
				2056	*/
				2057	htmlEntityDescPtr
				2058	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2059	xmlChar *name;
				2060	htmlEntityDescPtr ent = NULL;
				2061	*str = NULL;
				2062
				2063	if (CUR == '&') {
				2064	NEXT;
				2065	name = htmlParseName(ctxt);
				2066	if (name == NULL) {
				2067	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2068	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2069	ctxt->wellFormed = 0;
				2070	} else {
				2071	GROW;
				2072	if (CUR == ';') {
				2073	*str = name;
				2074
				2075	/*
				2076	* Lookup the entity in the table.
				2077	*/
				2078	ent = htmlEntityLookup(name);
				2079	if (ent != NULL) /* OK that's ugly !!! */
				2080	NEXT;
				2081	} else {
				2082	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2083	ctxt->sax->error(ctxt->userData,
				2084	"htmlParseEntityRef: expecting ';'\n");
				2085	*str = name;
				2086	}
				2087	}
				2088	}
				2089	return(ent);
				2090	}
				2091
				2092	/**
				2093	* htmlParseAttValue:
				2094	* @ctxt: an HTML parser context
				2095	*
				2096	* parse a value for an attribute
				2097	* Note: the parser won't do substitution of entities here, this
				2098	* will be handled later in xmlStringGetNodeList, unless it was
				2099	* asked for ctxt->replaceEntities != 0
				2100	*
				2101	* Returns the AttValue parsed or NULL.
				2102	*/
				2103
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2104	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2105	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2106	xmlChar *ret = NULL;
				2107
				2108	if (CUR == '"') {
				2109	NEXT;
				2110	ret = htmlParseHTMLAttribute(ctxt, '"');
				2111	if (CUR != '"') {
				2112	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2113	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2114	ctxt->wellFormed = 0;
				2115	} else
				2116	NEXT;
				2117	} else if (CUR == '\'') {
				2118	NEXT;
				2119	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2120	if (CUR != '\'') {
				2121	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2122	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2123	ctxt->wellFormed = 0;
				2124	} else
				2125	NEXT;
				2126	} else {
				2127	/*
				2128	* That's an HTMLism, the attribute value may not be quoted
				2129	*/
				2130	ret = htmlParseHTMLAttribute(ctxt, 0);
				2131	if (ret == NULL) {
				2132	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2133	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2134	ctxt->wellFormed = 0;
				2135	}
				2136	}
				2137	return(ret);
				2138	}
				2139
				2140	/**
				2141	* htmlParseSystemLiteral:
				2142	* @ctxt: an HTML parser context
				2143	*
				2144	* parse an HTML Literal
				2145	*
				2146	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2147	*
				2148	* Returns the SystemLiteral parsed or NULL
				2149	*/
				2150
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2151	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2152	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2153	const xmlChar *q;
				2154	xmlChar *ret = NULL;
				2155
				2156	if (CUR == '"') {
				2157	NEXT;
				2158	q = CUR_PTR;
				2159	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2160	NEXT;
				2161	if (!IS_CHAR(CUR)) {
				2162	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2163	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2164	ctxt->wellFormed = 0;
				2165	} else {
				2166	ret = xmlStrndup(q, CUR_PTR - q);
				2167	NEXT;
				2168	}
				2169	} else if (CUR == '\'') {
				2170	NEXT;
				2171	q = CUR_PTR;
				2172	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2173	NEXT;
				2174	if (!IS_CHAR(CUR)) {
				2175	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2176	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2177	ctxt->wellFormed = 0;
				2178	} else {
				2179	ret = xmlStrndup(q, CUR_PTR - q);
				2180	NEXT;
				2181	}
				2182	} else {
				2183	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2184	ctxt->sax->error(ctxt->userData,
				2185	"SystemLiteral \" or ' expected\n");
				2186	ctxt->wellFormed = 0;
				2187	}
				2188
				2189	return(ret);
				2190	}
				2191
				2192	/**
				2193	* htmlParsePubidLiteral:
				2194	* @ctxt: an HTML parser context
				2195	*
				2196	* parse an HTML public literal
				2197	*
				2198	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2199	*
				2200	* Returns the PubidLiteral parsed or NULL.
				2201	*/
				2202
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2203	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2204	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2205	const xmlChar *q;
				2206	xmlChar *ret = NULL;
				2207	/*
				2208	* Name ::= (Letter \| '_') (NameChar)*
				2209	*/
				2210	if (CUR == '"') {
				2211	NEXT;
				2212	q = CUR_PTR;
				2213	while (IS_PUBIDCHAR(CUR)) NEXT;
				2214	if (CUR != '"') {
				2215	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2216	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2217	ctxt->wellFormed = 0;
				2218	} else {
				2219	ret = xmlStrndup(q, CUR_PTR - q);
				2220	NEXT;
				2221	}
				2222	} else if (CUR == '\'') {
				2223	NEXT;
				2224	q = CUR_PTR;
				2225	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2226	NEXT;
				2227	if (!IS_LETTER(CUR)) {
				2228	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2229	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2230	ctxt->wellFormed = 0;
				2231	} else {
				2232	ret = xmlStrndup(q, CUR_PTR - q);
				2233	NEXT;
				2234	}
				2235	} else {
				2236	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2237	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2238	ctxt->wellFormed = 0;
				2239	}
				2240
				2241	return(ret);
				2242	}
				2243
				2244	/**
				2245	* htmlParseScript:
				2246	* @ctxt: an HTML parser context
				2247	*
				2248	* parse the content of an HTML SCRIPT or STYLE element
				2249	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2250	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2251	* http://www.w3.org/TR/html4/types.html#type-script
				2252	* http://www.w3.org/TR/html4/types.html#h-6.15
				2253	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2254	*
				2255	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2256	* element and the value of intrinsic event attributes. User agents must
				2257	* not evaluate script data as HTML markup but instead must pass it on as
				2258	* data to a script engine.
				2259	* NOTES:
				2260	* - The content is passed like CDATA
				2261	* - the attributes for style and scripting "onXXX" are also described
				2262	* as CDATA but SGML allows entities references in attributes so their
				2263	* processing is identical as other attributes
				2264	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2265	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2266	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2267	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2268	int nbchar = 0;
				2269	xmlChar cur;
				2270
				2271	SHRINK;
				2272	cur = CUR;
				2273	while (IS_CHAR(cur)) {
				2274	if ((cur == '<') && (NXT(1) == '/')) {
				2275	/*
				2276	* One should break here, the specification is clear:
				2277	* Authors should therefore escape "</" within the content.
				2278	* Escape mechanisms are specific to each scripting or
				2279	* style sheet language.
				2280	*/
				2281	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2282	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2283	break; /* while */
				2284	}
				2285	buf[nbchar++] = cur;
				2286	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2287	if (ctxt->sax->cdataBlock!= NULL) {
				2288	/*
				2289	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2290	*/
				2291	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2292	}
				2293	nbchar = 0;
				2294	}
				2295	NEXT;
				2296	cur = CUR;
				2297	}
				2298	if (!(IS_CHAR(cur))) {
				2299	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2300	ctxt->sax->error(ctxt->userData,
				2301	"Invalid char in CDATA 0x%X\n", cur);
				2302	ctxt->wellFormed = 0;
				2303	NEXT;
				2304	}
				2305
				2306	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2307	if (ctxt->sax->cdataBlock!= NULL) {
				2308	/*
				2309	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2310	*/
				2311	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2312	}
				2313	}
				2314	}
				2315
				2316
				2317	/**
				2318	* htmlParseCharData:
				2319	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2320	*
				2321	* parse a CharData section.
				2322	* if we are within a CDATA section ']]>' marks an end of section.
				2323	*
				2324	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2325	*/
				2326
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2327	static void
				2328	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2329	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2330	int nbchar = 0;
				2331	int cur, l;
				2332
				2333	SHRINK;
				2334	cur = CUR_CHAR(l);
				2335	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2336	((cur != '&') \|\| (ctxt->token == '&')) &&
				2337	(IS_CHAR(cur))) {
				2338	COPY_BUF(l,buf,nbchar,cur);
				2339	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2340	/*
				2341	* Ok the segment is to be consumed as chars.
				2342	*/
				2343	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2344	if (areBlanks(ctxt, buf, nbchar)) {
				2345	if (ctxt->sax->ignorableWhitespace != NULL)
				2346	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2347	buf, nbchar);
				2348	} else {
				2349	htmlCheckParagraph(ctxt);
				2350	if (ctxt->sax->characters != NULL)
				2351	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2352	}
				2353	}
				2354	nbchar = 0;
				2355	}
				2356	NEXTL(l);
				2357	cur = CUR_CHAR(l);
				2358	}
				2359	if (nbchar != 0) {
				2360	/*
				2361	* Ok the segment is to be consumed as chars.
				2362	*/
				2363	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2364	if (areBlanks(ctxt, buf, nbchar)) {
				2365	if (ctxt->sax->ignorableWhitespace != NULL)
				2366	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2367	} else {
				2368	htmlCheckParagraph(ctxt);
				2369	if (ctxt->sax->characters != NULL)
				2370	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2371	}
				2372	}
				2373	}
				2374	}
				2375
				2376	/**
				2377	* htmlParseExternalID:
				2378	* @ctxt: an HTML parser context
				2379	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2380	*
				2381	* Parse an External ID or a Public ID
				2382	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2383	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2384	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2385	*
				2386	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2387	*
				2388	* Returns the function returns SystemLiteral and in the second
				2389	* case publicID receives PubidLiteral, is strict is off
				2390	* it is possible to return NULL and have publicID set.
				2391	*/
				2392
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2393	static xmlChar *
				2394	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2395	xmlChar *URI = NULL;
				2396
				2397	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2398	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2399	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2400	SKIP(6);
				2401	if (!IS_BLANK(CUR)) {
				2402	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2403	ctxt->sax->error(ctxt->userData,
				2404	"Space required after 'SYSTEM'\n");
				2405	ctxt->wellFormed = 0;
				2406	}
				2407	SKIP_BLANKS;
				2408	URI = htmlParseSystemLiteral(ctxt);
				2409	if (URI == NULL) {
				2410	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2411	ctxt->sax->error(ctxt->userData,
				2412	"htmlParseExternalID: SYSTEM, no URI\n");
				2413	ctxt->wellFormed = 0;
				2414	}
				2415	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2416	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2417	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2418	SKIP(6);
				2419	if (!IS_BLANK(CUR)) {
				2420	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2421	ctxt->sax->error(ctxt->userData,
				2422	"Space required after 'PUBLIC'\n");
				2423	ctxt->wellFormed = 0;
				2424	}
				2425	SKIP_BLANKS;
				2426	*publicID = htmlParsePubidLiteral(ctxt);
				2427	if (*publicID == NULL) {
				2428	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2429	ctxt->sax->error(ctxt->userData,
				2430	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2431	ctxt->wellFormed = 0;
				2432	}
				2433	SKIP_BLANKS;
				2434	if ((CUR == '"') \|\| (CUR == '\'')) {
				2435	URI = htmlParseSystemLiteral(ctxt);
				2436	}
				2437	}
				2438	return(URI);
				2439	}
				2440
				2441	/**
				2442	* htmlParseComment:
				2443	* @ctxt: an HTML parser context
				2444	*
				2445	* Parse an XML (SGML) comment <!-- .... -->
				2446	*
				2447	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2448	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2449	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2450	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2451	xmlChar *buf = NULL;
				2452	int len;
				2453	int size = HTML_PARSER_BUFFER_SIZE;
				2454	int q, ql;
				2455	int r, rl;
				2456	int cur, l;
				2457	xmlParserInputState state;
				2458
				2459	/*
				2460	* Check that there is a comment right here.
				2461	*/
				2462	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2463	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2464
				2465	state = ctxt->instate;
				2466	ctxt->instate = XML_PARSER_COMMENT;
				2467	SHRINK;
				2468	SKIP(4);
				2469	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2470	if (buf == NULL) {
				2471	xmlGenericError(xmlGenericErrorContext,
				2472	"malloc of %d byte failed\n", size);
				2473	ctxt->instate = state;
				2474	return;
				2475	}
				2476	q = CUR_CHAR(ql);
				2477	NEXTL(ql);
				2478	r = CUR_CHAR(rl);
				2479	NEXTL(rl);
				2480	cur = CUR_CHAR(l);
				2481	len = 0;
				2482	while (IS_CHAR(cur) &&
				2483	((cur != '>') \|\|
				2484	(r != '-') \|\| (q != '-'))) {
				2485	if (len + 5 >= size) {
				2486	size *= 2;
				2487	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2488	if (buf == NULL) {
				2489	xmlGenericError(xmlGenericErrorContext,
				2490	"realloc of %d byte failed\n", size);
				2491	ctxt->instate = state;
				2492	return;
				2493	}
				2494	}
				2495	COPY_BUF(ql,buf,len,q);
				2496	q = r;
				2497	ql = rl;
				2498	r = cur;
				2499	rl = l;
				2500	NEXTL(l);
				2501	cur = CUR_CHAR(l);
				2502	if (cur == 0) {
				2503	SHRINK;
				2504	GROW;
				2505	cur = CUR_CHAR(l);
				2506	}
				2507	}
				2508	buf[len] = 0;
				2509	if (!IS_CHAR(cur)) {
				2510	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2511	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2512	ctxt->sax->error(ctxt->userData,
				2513	"Comment not terminated \n<!--%.50s\n", buf);
				2514	ctxt->wellFormed = 0;
				2515	xmlFree(buf);
				2516	} else {
				2517	NEXT;
				2518	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2519	(!ctxt->disableSAX))
				2520	ctxt->sax->comment(ctxt->userData, buf);
				2521	xmlFree(buf);
				2522	}
				2523	ctxt->instate = state;
				2524	}
				2525
				2526	/**
				2527	* htmlParseCharRef:
				2528	* @ctxt: an HTML parser context
				2529	*
				2530	* parse Reference declarations
				2531	*
				2532	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2533	* '&#x' [0-9a-fA-F]+ ';'
				2534	*
				2535	* Returns the value parsed (as an int)
				2536	*/
				2537	int
				2538	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2539	int val = 0;
				2540
				2541	if ((CUR == '&') && (NXT(1) == '#') &&
				2542	(NXT(2) == 'x')) {
				2543	SKIP(3);
				2544	while (CUR != ';') {
				2545	if ((CUR >= '0') && (CUR <= '9'))
				2546	val = val * 16 + (CUR - '0');
				2547	else if ((CUR >= 'a') && (CUR <= 'f'))
				2548	val = val * 16 + (CUR - 'a') + 10;
				2549	else if ((CUR >= 'A') && (CUR <= 'F'))
				2550	val = val * 16 + (CUR - 'A') + 10;
				2551	else {
				2552	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2553	ctxt->sax->error(ctxt->userData,
				2554	"htmlParseCharRef: invalid hexadecimal value\n");
				2555	ctxt->wellFormed = 0;
				2556	return(0);
				2557	}
				2558	NEXT;
				2559	}
				2560	if (CUR == ';')
				2561	NEXT;
				2562	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2563	SKIP(2);
				2564	while (CUR != ';') {
				2565	if ((CUR >= '0') && (CUR <= '9'))
				2566	val = val * 10 + (CUR - '0');
				2567	else {
				2568	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2569	ctxt->sax->error(ctxt->userData,
				2570	"htmlParseCharRef: invalid decimal value\n");
				2571	ctxt->wellFormed = 0;
				2572	return(0);
				2573	}
				2574	NEXT;
				2575	}
				2576	if (CUR == ';')
				2577	NEXT;
				2578	} else {
				2579	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2580	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2581	ctxt->wellFormed = 0;
				2582	}
				2583	/*
				2584	* Check the value IS_CHAR ...
				2585	*/
				2586	if (IS_CHAR(val)) {
				2587	return(val);
				2588	} else {
				2589	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2590	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2591	val);
				2592	ctxt->wellFormed = 0;
				2593	}
				2594	return(0);
				2595	}
				2596
				2597
				2598	/**
				2599	* htmlParseDocTypeDecl :
				2600	* @ctxt: an HTML parser context
				2601	*
				2602	* parse a DOCTYPE declaration
				2603	*
				2604	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2605	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2606	*/
				2607
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2608	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2609	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2610	xmlChar *name;
				2611	xmlChar *ExternalID = NULL;
				2612	xmlChar *URI = NULL;
				2613
				2614	/*
				2615	* We know that '<!DOCTYPE' has been detected.
				2616	*/
				2617	SKIP(9);
				2618
				2619	SKIP_BLANKS;
				2620
				2621	/*
				2622	* Parse the DOCTYPE name.
				2623	*/
				2624	name = htmlParseName(ctxt);
				2625	if (name == NULL) {
				2626	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2627	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2628	ctxt->wellFormed = 0;
				2629	}
				2630	/*
				2631	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2632	*/
				2633
				2634	SKIP_BLANKS;
				2635
				2636	/*
				2637	* Check for SystemID and ExternalID
				2638	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2639	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2640	SKIP_BLANKS;
				2641
				2642	/*
				2643	* We should be at the end of the DOCTYPE declaration.
				2644	*/
				2645	if (CUR != '>') {
				2646	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2647	ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
				2648	ctxt->wellFormed = 0;
				2649	/* We shouldn't try to resynchronize ... */
				2650	}
				2651	NEXT;
				2652
				2653	/*
				2654	* Create or update the document accordingly to the DOCTYPE
				2655	*/
				2656	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2657	(!ctxt->disableSAX))
				2658	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2659
				2660	/*
				2661	* Cleanup, since we don't use all those identifiers
				2662	*/
				2663	if (URI != NULL) xmlFree(URI);
				2664	if (ExternalID != NULL) xmlFree(ExternalID);
				2665	if (name != NULL) xmlFree(name);
				2666	}
				2667
				2668	/**
				2669	* htmlParseAttribute:
				2670	* @ctxt: an HTML parser context
				2671	* @value: a xmlChar ** used to store the value of the attribute
				2672	*
				2673	* parse an attribute
				2674	*
				2675	* [41] Attribute ::= Name Eq AttValue
				2676	*
				2677	* [25] Eq ::= S? '=' S?
				2678	*
				2679	* With namespace:
				2680	*
				2681	* [NS 11] Attribute ::= QName Eq AttValue
				2682	*
				2683	* Also the case QName == xmlns:??? is handled independently as a namespace
				2684	* definition.
				2685	*
				2686	* Returns the attribute name, and the value in *value.
				2687	*/
				2688
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2689	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2690	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2691	xmlChar name, val = NULL;
				2692
				2693	*value = NULL;
				2694	name = htmlParseHTMLName(ctxt);
				2695	if (name == NULL) {
				2696	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2697	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2698	ctxt->wellFormed = 0;
				2699	return(NULL);
				2700	}
				2701
				2702	/*
				2703	* read the value
				2704	*/
				2705	SKIP_BLANKS;
				2706	if (CUR == '=') {
				2707	NEXT;
				2708	SKIP_BLANKS;
				2709	val = htmlParseAttValue(ctxt);
				2710	/******
				2711	} else {
				2712	* TODO : some attribute must have values, some may not
				2713	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2714	ctxt->sax->warning(ctxt->userData,
				2715	"No value for attribute %s\n", name); */
				2716	}
				2717
				2718	*value = val;
				2719	return(name);
				2720	}
				2721
				2722	/**
				2723	* htmlCheckEncoding:
				2724	* @ctxt: an HTML parser context
				2725	* @attvalue: the attribute value
				2726	*
				2727	* Checks an http-equiv attribute from a Meta tag to detect
				2728	* the encoding
				2729	* If a new encoding is detected the parser is switched to decode
				2730	* it and pass UTF8
				2731	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2732	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2733	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2734	const xmlChar *encoding;
				2735
				2736	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2737	return;
				2738
				2739	/* do not change encoding */
				2740	if (ctxt->input->encoding != NULL)
				2741	return;
				2742
				2743	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2744	if (encoding != NULL) {
				2745	encoding += 8;
				2746	} else {
				2747	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2748	if (encoding != NULL)
				2749	encoding += 9;
				2750	}
				2751	if (encoding != NULL) {
				2752	xmlCharEncoding enc;
				2753	xmlCharEncodingHandlerPtr handler;
				2754
				2755	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2756
				2757	if (ctxt->input->encoding != NULL)
				2758	xmlFree((xmlChar *) ctxt->input->encoding);
				2759	ctxt->input->encoding = xmlStrdup(encoding);
				2760
				2761	enc = xmlParseCharEncoding((const char *) encoding);
				2762	/*
				2763	* registered set of known encodings
				2764	*/
				2765	if (enc != XML_CHAR_ENCODING_ERROR) {
				2766	xmlSwitchEncoding(ctxt, enc);
				2767	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2768	} else {
				2769	/*
				2770	* fallback for unknown encodings
				2771	*/
				2772	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2773	if (handler != NULL) {
				2774	xmlSwitchToEncoding(ctxt, handler);
				2775	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2776	} else {
				2777	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2778	}
				2779	}
				2780
				2781	if ((ctxt->input->buf != NULL) &&
				2782	(ctxt->input->buf->encoder != NULL) &&
				2783	(ctxt->input->buf->raw != NULL) &&
				2784	(ctxt->input->buf->buffer != NULL)) {
				2785	int nbchars;
				2786	int processed;
				2787
				2788	/*
				2789	* convert as much as possible to the parser reading buffer.
				2790	*/
				2791	processed = ctxt->input->cur - ctxt->input->base;
				2792	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2793	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2794	ctxt->input->buf->buffer,
				2795	ctxt->input->buf->raw);
				2796	if (nbchars < 0) {
				2797	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2798	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2799	ctxt->sax->error(ctxt->userData,
				2800	"htmlCheckEncoding: encoder error\n");
				2801	}
				2802	ctxt->input->base =
				2803	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2804	}
				2805	}
				2806	}
				2807
				2808	/**
				2809	* htmlCheckMeta:
				2810	* @ctxt: an HTML parser context
				2811	* @atts: the attributes values
				2812	*
				2813	* Checks an attributes from a Meta tag
				2814	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2815	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2816	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2817	int i;
				2818	const xmlChar att, value;
				2819	int http = 0;
				2820	const xmlChar *content = NULL;
				2821
				2822	if ((ctxt == NULL) \|\| (atts == NULL))
				2823	return;
				2824
				2825	i = 0;
				2826	att = atts[i++];
				2827	while (att != NULL) {
				2828	value = atts[i++];
				2829	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2830	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2831	http = 1;
				2832	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2833	content = value;
				2834	att = atts[i++];
				2835	}
				2836	if ((http) && (content != NULL))
				2837	htmlCheckEncoding(ctxt, content);
				2838
				2839	}
				2840
				2841	/**
				2842	* htmlParseStartTag:
				2843	* @ctxt: an HTML parser context
				2844	*
				2845	* parse a start of tag either for rule element or
				2846	* EmptyElement. In both case we don't parse the tag closing chars.
				2847	*
				2848	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2849	*
				2850	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2851	*
				2852	* With namespace:
				2853	*
				2854	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2855	*
				2856	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2857	*
				2858	*/
				2859
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2860	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2861	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2862	xmlChar *name;
				2863	xmlChar *attname;
				2864	xmlChar *attvalue;
				2865	const xmlChar **atts = NULL;
				2866	int nbatts = 0;
				2867	int maxatts = 0;
				2868	int meta = 0;
				2869	int i;
				2870
				2871	if (CUR != '<') return;
				2872	NEXT;
				2873
				2874	GROW;
				2875	name = htmlParseHTMLName(ctxt);
				2876	if (name == NULL) {
				2877	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2878	ctxt->sax->error(ctxt->userData,
				2879	"htmlParseStartTag: invalid element name\n");
				2880	ctxt->wellFormed = 0;
				2881	/* Dump the bogus tag like browsers do */
				2882	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2883	NEXT;
				2884	return;
				2885	}
				2886	if (xmlStrEqual(name, BAD_CAST"meta"))
				2887	meta = 1;
				2888
				2889	/*
				2890	* Check for auto-closure of HTML elements.
				2891	*/
				2892	htmlAutoClose(ctxt, name);
				2893
				2894	/*
				2895	* Check for implied HTML elements.
				2896	*/
				2897	htmlCheckImplied(ctxt, name);
				2898
				2899	/*
				2900	* Avoid html at any level > 0, head at any level != 1
				2901	* or any attempt to recurse body
				2902	*/
				2903	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2904	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2905	ctxt->sax->error(ctxt->userData,
				2906	"htmlParseStartTag: misplaced <html> tag\n");
				2907	ctxt->wellFormed = 0;
				2908	xmlFree(name);
				2909	return;
				2910	}
				2911	if ((ctxt->nameNr != 1) &&
				2912	(xmlStrEqual(name, BAD_CAST"head"))) {
				2913	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2914	ctxt->sax->error(ctxt->userData,
				2915	"htmlParseStartTag: misplaced <head> tag\n");
				2916	ctxt->wellFormed = 0;
				2917	xmlFree(name);
				2918	return;
				2919	}
				2920	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2921	int indx;
				2922	for (indx = 0;indx < ctxt->nameNr;indx++) {
				2923	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2924	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2925	ctxt->sax->error(ctxt->userData,
				2926	"htmlParseStartTag: misplaced <body> tag\n");
				2927	ctxt->wellFormed = 0;
				2928	xmlFree(name);
				2929	return;
				2930	}
				2931	}
				2932	}
				2933
				2934	/*
				2935	* Now parse the attributes, it ends up with the ending
				2936	*
				2937	* (S Attribute)* S?
				2938	*/
				2939	SKIP_BLANKS;
				2940	while ((IS_CHAR(CUR)) &&
				2941	(CUR != '>') &&
				2942	((CUR != '/') \|\| (NXT(1) != '>'))) {
				2943	long cons = ctxt->nbChars;
				2944
				2945	GROW;
				2946	attname = htmlParseAttribute(ctxt, &attvalue);
				2947	if (attname != NULL) {
				2948
				2949	/*
				2950	* Well formedness requires at most one declaration of an attribute
				2951	*/
				2952	for (i = 0; i < nbatts;i += 2) {
				2953	if (xmlStrEqual(atts[i], attname)) {
				2954	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2955	ctxt->sax->error(ctxt->userData,
				2956	"Attribute %s redefined\n",
				2957	attname);
				2958	ctxt->wellFormed = 0;
				2959	xmlFree(attname);
				2960	if (attvalue != NULL)
				2961	xmlFree(attvalue);
				2962	goto failed;
				2963	}
				2964	}
				2965
				2966	/*
				2967	* Add the pair to atts
				2968	*/
				2969	if (atts == NULL) {
				2970	maxatts = 10;
				2971	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				2972	if (atts == NULL) {
				2973	xmlGenericError(xmlGenericErrorContext,
				2974	"malloc of %ld byte failed\n",
				2975	maxatts * (long)sizeof(xmlChar *));
				2976	if (name != NULL) xmlFree(name);
				2977	return;
				2978	}
				2979	} else if (nbatts + 4 > maxatts) {
				2980	maxatts *= 2;
				2981	atts = (const xmlChar *) xmlRealloc((void ) atts,
				2982	maxatts * sizeof(xmlChar *));
				2983	if (atts == NULL) {
				2984	xmlGenericError(xmlGenericErrorContext,
				2985	"realloc of %ld byte failed\n",
				2986	maxatts * (long)sizeof(xmlChar *));
				2987	if (name != NULL) xmlFree(name);
				2988	return;
				2989	}
				2990	}
				2991	atts[nbatts++] = attname;
				2992	atts[nbatts++] = attvalue;
				2993	atts[nbatts] = NULL;
				2994	atts[nbatts + 1] = NULL;
				2995	}
				2996	else {
				2997	/* Dump the bogus attribute string up to the next blank or
				2998	* the end of the tag. */
				2999	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				3000	&& ((CUR != '/') \|\| (NXT(1) != '>')))
				3001	NEXT;
				3002	}
				3003
				3004	failed:
				3005	SKIP_BLANKS;
				3006	if (cons == ctxt->nbChars) {
				3007	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3008	ctxt->sax->error(ctxt->userData,
				3009	"htmlParseStartTag: problem parsing attributes\n");
				3010	ctxt->wellFormed = 0;
				3011	break;
				3012	}
				3013	}
				3014
				3015	/*
				3016	* Handle specific association to the META tag
				3017	*/
				3018	if (meta)
				3019	htmlCheckMeta(ctxt, atts);
				3020
				3021	/*
				3022	* SAX: Start of Element !
				3023	*/
				3024	htmlnamePush(ctxt, xmlStrdup(name));
				3025	#ifdef DEBUG
				3026	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3027	#endif
				3028	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3029	ctxt->sax->startElement(ctxt->userData, name, atts);
				3030
				3031	if (atts != NULL) {
				3032	for (i = 0;i < nbatts;i++) {
				3033	if (atts[i] != NULL)
				3034	xmlFree((xmlChar *) atts[i]);
				3035	}
				3036	xmlFree((void *) atts);
				3037	}
				3038	if (name != NULL) xmlFree(name);
				3039	}
				3040
				3041	/**
				3042	* htmlParseEndTag:
				3043	* @ctxt: an HTML parser context
				3044	*
				3045	* parse an end of tag
				3046	*
				3047	* [42] ETag ::= '</' Name S? '>'
				3048	*
				3049	* With namespace
				3050	*
				3051	* [NS 9] ETag ::= '</' QName S? '>'
				3052	*/
				3053
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3054	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3055	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3056	xmlChar *name;
				3057	xmlChar *oldname;
				3058	int i;
				3059
				3060	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3061	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3062	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3063	ctxt->wellFormed = 0;
				3064	return;
				3065	}
				3066	SKIP(2);
				3067
				3068	name = htmlParseHTMLName(ctxt);
				3069	if (name == NULL) return;
				3070
				3071	/*
				3072	* We should definitely be at the ending "S? '>'" part
				3073	*/
				3074	SKIP_BLANKS;
				3075	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3076	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3077	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3078	ctxt->wellFormed = 0;
				3079	} else
				3080	NEXT;
				3081
				3082	/*
				3083	* If the name read is not one of the element in the parsing stack
				3084	* then return, it's just an error.
				3085	*/
				3086	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3087	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3088	}
				3089	if (i < 0) {
				3090	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3091	ctxt->sax->error(ctxt->userData,
				3092	"Unexpected end tag : %s\n", name);
				3093	xmlFree(name);
				3094	ctxt->wellFormed = 0;
				3095	return;
				3096	}
				3097
				3098
				3099	/*
				3100	* Check for auto-closure of HTML elements.
				3101	*/
				3102
				3103	htmlAutoCloseOnClose(ctxt, name);
				3104
				3105	/*
				3106	* Well formedness constraints, opening and closing must match.
				3107	* With the exception that the autoclose may have popped stuff out
				3108	* of the stack.
				3109	*/
				3110	if (!xmlStrEqual(name, ctxt->name)) {
				3111	#ifdef DEBUG
				3112	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3113	#endif
				3114	if ((ctxt->name != NULL) &&
				3115	(!xmlStrEqual(ctxt->name, name))) {
				3116	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3117	ctxt->sax->error(ctxt->userData,
				3118	"Opening and ending tag mismatch: %s and %s\n",
				3119	name, ctxt->name);
				3120	ctxt->wellFormed = 0;
				3121	}
				3122	}
				3123
				3124	/*
				3125	* SAX: End of Tag
				3126	*/
				3127	oldname = ctxt->name;
				3128	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3129	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3130	ctxt->sax->endElement(ctxt->userData, name);
				3131	oldname = htmlnamePop(ctxt);
				3132	if (oldname != NULL) {
				3133	#ifdef DEBUG
				3134	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3135	#endif
				3136	xmlFree(oldname);
				3137	#ifdef DEBUG
				3138	} else {
				3139	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3140	#endif
				3141	}
				3142	}
				3143
				3144	if (name != NULL)
				3145	xmlFree(name);
				3146
				3147	return;
				3148	}
				3149
				3150
				3151	/**
				3152	* htmlParseReference:
				3153	* @ctxt: an HTML parser context
				3154	*
				3155	* parse and handle entity references in content,
				3156	* this will end-up in a call to character() since this is either a
				3157	* CharRef, or a predefined entity.
				3158	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3159	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3160	htmlParseReference(htmlParserCtxtPtr ctxt) {
				3161	htmlEntityDescPtr ent;
				3162	xmlChar out[6];
				3163	xmlChar *name;
				3164	if (CUR != '&') return;
				3165
				3166	if (NXT(1) == '#') {
				3167	unsigned int c;
				3168	int bits, i = 0;
				3169
				3170	c = htmlParseCharRef(ctxt);
				3171	if (c == 0)
				3172	return;
				3173
				3174	if (c < 0x80) { out[i++]= c; bits= -6; }
				3175	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3176	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3177	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3178
				3179	for ( ; bits >= 0; bits-= 6) {
				3180	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3181	}
				3182	out[i] = 0;
				3183
				3184	htmlCheckParagraph(ctxt);
				3185	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3186	ctxt->sax->characters(ctxt->userData, out, i);
				3187	} else {
				3188	ent = htmlParseEntityRef(ctxt, &name);
				3189	if (name == NULL) {
				3190	htmlCheckParagraph(ctxt);
				3191	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3192	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3193	return;
				3194	}
				3195	if ((ent == NULL) \|\| (ent->value <= 0)) {
				3196	htmlCheckParagraph(ctxt);
				3197	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3198	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3199	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3200	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3201	}
				3202	} else {
				3203	unsigned int c;
				3204	int bits, i = 0;
				3205
				3206	c = ent->value;
				3207	if (c < 0x80)
				3208	{ out[i++]= c; bits= -6; }
				3209	else if (c < 0x800)
				3210	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3211	else if (c < 0x10000)
				3212	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3213	else
				3214	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3215
				3216	for ( ; bits >= 0; bits-= 6) {
				3217	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3218	}
				3219	out[i] = 0;
				3220
				3221	htmlCheckParagraph(ctxt);
				3222	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3223	ctxt->sax->characters(ctxt->userData, out, i);
				3224	}
				3225	xmlFree(name);
				3226	}
				3227	}
				3228
				3229	/**
				3230	* htmlParseContent:
				3231	* @ctxt: an HTML parser context
				3232	* @name: the node name
				3233	*
				3234	* Parse a content: comment, sub-element, reference or text.
				3235	*
				3236	*/
				3237
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3238	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3239	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3240	xmlChar *currentNode;
				3241	int depth;
				3242
				3243	currentNode = xmlStrdup(ctxt->name);
				3244	depth = ctxt->nameNr;
				3245	while (1) {
				3246	long cons = ctxt->nbChars;
				3247
				3248	GROW;
				3249	/*
				3250	* Our tag or one of it's parent or children is ending.
				3251	*/
				3252	if ((CUR == '<') && (NXT(1) == '/')) {
				3253	htmlParseEndTag(ctxt);
				3254	if (currentNode != NULL) xmlFree(currentNode);
				3255	return;
				3256	}
				3257
				3258	/*
				3259	* Has this node been popped out during parsing of
				3260	* the next element
				3261	*/
				3262	if ((!xmlStrEqual(currentNode, ctxt->name)) &&
				3263	(depth >= ctxt->nameNr)) {
				3264	if (currentNode != NULL) xmlFree(currentNode);
				3265	return;
				3266	}
				3267
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3268	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3269	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3270	/*
				3271	* Handle SCRIPT/STYLE separately
				3272	*/
				3273	htmlParseScript(ctxt);
				3274	} else {
				3275	/*
				3276	* Sometimes DOCTYPE arrives in the middle of the document
				3277	*/
				3278	if ((CUR == '<') && (NXT(1) == '!') &&
				3279	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3280	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3281	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3282	(UPP(8) == 'E')) {
				3283	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3284	ctxt->sax->error(ctxt->userData,
				3285	"Misplaced DOCTYPE declaration\n");
				3286	ctxt->wellFormed = 0;
				3287	htmlParseDocTypeDecl(ctxt);
				3288	}
				3289
				3290	/*
				3291	* First case : a comment
				3292	*/
				3293	if ((CUR == '<') && (NXT(1) == '!') &&
				3294	(NXT(2) == '-') && (NXT(3) == '-')) {
				3295	htmlParseComment(ctxt);
				3296	}
				3297
				3298	/*
				3299	* Second case : a sub-element.
				3300	*/
				3301	else if (CUR == '<') {
				3302	htmlParseElement(ctxt);
				3303	}
				3304
				3305	/*
				3306	* Third case : a reference. If if has not been resolved,
				3307	* parsing returns it's Name, create the node
				3308	*/
				3309	else if (CUR == '&') {
				3310	htmlParseReference(ctxt);
				3311	}
				3312
				3313	/*
				3314	* Fourth : end of the resource
				3315	*/
				3316	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3317	htmlAutoCloseOnEnd(ctxt);
				3318	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3319	}
				3320
				3321	/*
				3322	* Last case, text. Note that References are handled directly.
				3323	*/
				3324	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3325	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3326	}
				3327
				3328	if (cons == ctxt->nbChars) {
				3329	if (ctxt->node != NULL) {
				3330	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3331	ctxt->sax->error(ctxt->userData,
				3332	"detected an error in element content\n");
				3333	ctxt->wellFormed = 0;
				3334	}
				3335	break;
				3336	}
				3337	}
				3338	GROW;
				3339	}
				3340	if (currentNode != NULL) xmlFree(currentNode);
				3341	}
				3342
				3343	/**
				3344	* htmlParseElement:
				3345	* @ctxt: an HTML parser context
				3346	*
				3347	* parse an HTML element, this is highly recursive
				3348	*
				3349	* [39] element ::= EmptyElemTag \| STag content ETag
				3350	*
				3351	* [41] Attribute ::= Name Eq AttValue
				3352	*/
				3353
				3354	void
				3355	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3356	xmlChar *name;
				3357	xmlChar *currentNode = NULL;
				3358	htmlElemDescPtr info;
				3359	htmlParserNodeInfo node_info;
				3360	xmlChar *oldname;
				3361	int depth = ctxt->nameNr;
				3362
				3363	/* Capture start position */
				3364	if (ctxt->record_info) {
				3365	node_info.begin_pos = ctxt->input->consumed +
				3366	(CUR_PTR - ctxt->input->base);
				3367	node_info.begin_line = ctxt->input->line;
				3368	}
				3369
				3370	oldname = xmlStrdup(ctxt->name);
				3371	htmlParseStartTag(ctxt);
				3372	name = ctxt->name;
				3373	#ifdef DEBUG
				3374	if (oldname == NULL)
				3375	xmlGenericError(xmlGenericErrorContext,
				3376	"Start of element %s\n", name);
				3377	else if (name == NULL)
				3378	xmlGenericError(xmlGenericErrorContext,
				3379	"Start of element failed, was %s\n", oldname);
				3380	else
				3381	xmlGenericError(xmlGenericErrorContext,
				3382	"Start of element %s, was %s\n", name, oldname);
				3383	#endif
				3384	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3385	(name == NULL)) {
				3386	if (CUR == '>')
				3387	NEXT;
				3388	if (oldname != NULL)
				3389	xmlFree(oldname);
				3390	return;
				3391	}
				3392	if (oldname != NULL)
				3393	xmlFree(oldname);
				3394
				3395	/*
				3396	* Lookup the info for that element.
				3397	*/
				3398	info = htmlTagLookup(name);
				3399	if (info == NULL) {
				3400	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3401	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3402	name);
				3403	ctxt->wellFormed = 0;
				3404	} else if (info->depr) {
				3405	/***************************
				3406	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3407	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3408	name);
				3409	***************************/
				3410	}
				3411
				3412	/*
				3413	* Check for an Empty Element labelled the XML/SGML way
				3414	*/
				3415	if ((CUR == '/') && (NXT(1) == '>')) {
				3416	SKIP(2);
				3417	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3418	ctxt->sax->endElement(ctxt->userData, name);
				3419	oldname = htmlnamePop(ctxt);
				3420	#ifdef DEBUG
				3421	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3422	#endif
				3423	if (oldname != NULL)
				3424	xmlFree(oldname);
				3425	return;
				3426	}
				3427
				3428	if (CUR == '>') {
				3429	NEXT;
				3430	} else {
				3431	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3432	ctxt->sax->error(ctxt->userData,
				3433	"Couldn't find end of Start Tag %s\n",
				3434	name);
				3435	ctxt->wellFormed = 0;
				3436
				3437	/*
				3438	* end of parsing of this node.
				3439	*/
				3440	if (xmlStrEqual(name, ctxt->name)) {
				3441	nodePop(ctxt);
				3442	oldname = htmlnamePop(ctxt);
				3443	#ifdef DEBUG
				3444	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3445	#endif
				3446	if (oldname != NULL)
				3447	xmlFree(oldname);
				3448	}
				3449
				3450	/*
				3451	* Capture end position and add node
				3452	*/
				3453	if ( currentNode != NULL && ctxt->record_info ) {
				3454	node_info.end_pos = ctxt->input->consumed +
				3455	(CUR_PTR - ctxt->input->base);
				3456	node_info.end_line = ctxt->input->line;
				3457	node_info.node = ctxt->node;
				3458	xmlParserAddNodeInfo(ctxt, &node_info);
				3459	}
				3460	return;
				3461	}
				3462
				3463	/*
				3464	* Check for an Empty Element from DTD definition
				3465	*/
				3466	if ((info != NULL) && (info->empty)) {
				3467	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3468	ctxt->sax->endElement(ctxt->userData, name);
				3469	oldname = htmlnamePop(ctxt);
				3470	#ifdef DEBUG
				3471	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3472	#endif
				3473	if (oldname != NULL)
				3474	xmlFree(oldname);
				3475	return;
				3476	}
				3477
				3478	/*
				3479	* Parse the content of the element:
				3480	*/
				3481	currentNode = xmlStrdup(ctxt->name);
				3482	depth = ctxt->nameNr;
				3483	while (IS_CHAR(CUR)) {
				3484	htmlParseContent(ctxt);
				3485	if (ctxt->nameNr < depth) break;
				3486	}
				3487
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3488	/*
				3489	* Capture end position and add node
				3490	*/
				3491	if ( currentNode != NULL && ctxt->record_info ) {
				3492	node_info.end_pos = ctxt->input->consumed +
				3493	(CUR_PTR - ctxt->input->base);
				3494	node_info.end_line = ctxt->input->line;
				3495	node_info.node = ctxt->node;
				3496	xmlParserAddNodeInfo(ctxt, &node_info);
				3497	}
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3498	if (!IS_CHAR(CUR)) {
				3499	htmlAutoCloseOnEnd(ctxt);
				3500	}
				3501
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3502	if (currentNode != NULL)
				3503	xmlFree(currentNode);
				3504	}
				3505
				3506	/**
				3507	* htmlParseDocument :
				3508	* @ctxt: an HTML parser context
				3509	*
				3510	* parse an HTML document (and build a tree if using the standard SAX
				3511	* interface).
				3512	*
				3513	* Returns 0, -1 in case of error. the parser context is augmented
				3514	* as a result of the parsing.
				3515	*/
				3516
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3517	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3518	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3519	xmlDtdPtr dtd;
				3520
				3521	htmlDefaultSAXHandlerInit();
				3522	ctxt->html = 1;
				3523
				3524	GROW;
				3525	/*
				3526	* SAX: beginning of the document processing.
				3527	*/
				3528	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3529	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3530
				3531	/*
				3532	* Wipe out everything which is before the first '<'
				3533	*/
				3534	SKIP_BLANKS;
				3535	if (CUR == 0) {
				3536	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3537	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3538	ctxt->wellFormed = 0;
				3539	}
				3540
				3541	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3542	ctxt->sax->startDocument(ctxt->userData);
				3543
				3544
				3545	/*
				3546	* Parse possible comments before any content
				3547	*/
				3548	while ((CUR == '<') && (NXT(1) == '!') &&
				3549	(NXT(2) == '-') && (NXT(3) == '-')) {
				3550	htmlParseComment(ctxt);
				3551	SKIP_BLANKS;
				3552	}
				3553
				3554
				3555	/*
				3556	* Then possibly doc type declaration(s) and more Misc
				3557	* (doctypedecl Misc*)?
				3558	*/
				3559	if ((CUR == '<') && (NXT(1) == '!') &&
				3560	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3561	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3562	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3563	(UPP(8) == 'E')) {
				3564	htmlParseDocTypeDecl(ctxt);
				3565	}
				3566	SKIP_BLANKS;
				3567
				3568	/*
				3569	* Parse possible comments before any content
				3570	*/
				3571	while ((CUR == '<') && (NXT(1) == '!') &&
				3572	(NXT(2) == '-') && (NXT(3) == '-')) {
				3573	htmlParseComment(ctxt);
				3574	SKIP_BLANKS;
				3575	}
				3576
				3577	/*
				3578	* Time to start parsing the tree itself
				3579	*/
				3580	htmlParseContent(ctxt);
				3581
				3582	/*
				3583	* autoclose
				3584	*/
				3585	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3586	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3587
				3588
				3589	/*
				3590	* SAX: end of the document processing.
				3591	*/
				3592	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3593	ctxt->sax->endDocument(ctxt->userData);
				3594
				3595	if (ctxt->myDoc != NULL) {
				3596	dtd = xmlGetIntSubset(ctxt->myDoc);
				3597	if (dtd == NULL)
				3598	ctxt->myDoc->intSubset =
				3599	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3600	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3601	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3602	}
				3603	if (! ctxt->wellFormed) return(-1);
				3604	return(0);
				3605	}
				3606
				3607
				3608	/************************************************************************
				3609	* *
				3610	* Parser contexts handling *
				3611	* *
				3612	************************************************************************/
				3613
				3614	/**
				3615	* xmlInitParserCtxt:
				3616	* @ctxt: an HTML parser context
				3617	*
				3618	* Initialize a parser context
				3619	*/
				3620
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3621	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3622	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3623	{
				3624	htmlSAXHandler *sax;
				3625
				3626	if (ctxt == NULL) return;
				3627	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3628
				3629	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3630	if (sax == NULL) {
				3631	xmlGenericError(xmlGenericErrorContext,
				3632	"htmlInitParserCtxt: out of memory\n");
				3633	}
				3634	else
				3635	memset(sax, 0, sizeof(htmlSAXHandler));
				3636
				3637	/* Allocate the Input stack */
				3638	ctxt->inputTab = (htmlParserInputPtr *)
				3639	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3640	if (ctxt->inputTab == NULL) {
				3641	xmlGenericError(xmlGenericErrorContext,
				3642	"htmlInitParserCtxt: out of memory\n");
				3643	ctxt->inputNr = 0;
				3644	ctxt->inputMax = 0;
				3645	ctxt->input = NULL;
				3646	return;
				3647	}
				3648	ctxt->inputNr = 0;
				3649	ctxt->inputMax = 5;
				3650	ctxt->input = NULL;
				3651	ctxt->version = NULL;
				3652	ctxt->encoding = NULL;
				3653	ctxt->standalone = -1;
				3654	ctxt->instate = XML_PARSER_START;
				3655
				3656	/* Allocate the Node stack */
				3657	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3658	if (ctxt->nodeTab == NULL) {
				3659	xmlGenericError(xmlGenericErrorContext,
				3660	"htmlInitParserCtxt: out of memory\n");
				3661	ctxt->nodeNr = 0;
				3662	ctxt->nodeMax = 0;
				3663	ctxt->node = NULL;
				3664	ctxt->inputNr = 0;
				3665	ctxt->inputMax = 0;
				3666	ctxt->input = NULL;
				3667	return;
				3668	}
				3669	ctxt->nodeNr = 0;
				3670	ctxt->nodeMax = 10;
				3671	ctxt->node = NULL;
				3672
				3673	/* Allocate the Name stack */
				3674	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3675	if (ctxt->nameTab == NULL) {
				3676	xmlGenericError(xmlGenericErrorContext,
				3677	"htmlInitParserCtxt: out of memory\n");
				3678	ctxt->nameNr = 0;
				3679	ctxt->nameMax = 10;
				3680	ctxt->name = NULL;
				3681	ctxt->nodeNr = 0;
				3682	ctxt->nodeMax = 0;
				3683	ctxt->node = NULL;
				3684	ctxt->inputNr = 0;
				3685	ctxt->inputMax = 0;
				3686	ctxt->input = NULL;
				3687	return;
				3688	}
				3689	ctxt->nameNr = 0;
				3690	ctxt->nameMax = 10;
				3691	ctxt->name = NULL;
				3692
				3693	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3694	else {
				3695	ctxt->sax = sax;
				3696	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3697	}
				3698	ctxt->userData = ctxt;
				3699	ctxt->myDoc = NULL;
				3700	ctxt->wellFormed = 1;
				3701	ctxt->replaceEntities = 0;
				3702	ctxt->html = 1;
				3703	ctxt->record_info = 0;
				3704	ctxt->validate = 0;
				3705	ctxt->nbChars = 0;
				3706	ctxt->checkIndex = 0;
				3707	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3708	}
				3709
				3710	/**
				3711	* htmlFreeParserCtxt:
				3712	* @ctxt: an HTML parser context
				3713	*
				3714	* Free all the memory used by a parser context. However the parsed
				3715	* document in ctxt->myDoc is not freed.
				3716	*/
				3717
				3718	void
				3719	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3720	{
				3721	xmlFreeParserCtxt(ctxt);
				3722	}
				3723
				3724	/**
				3725	* htmlCreateDocParserCtxt :
				3726	* @cur: a pointer to an array of xmlChar
				3727	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3728	*
				3729	* Create a parser context for an HTML document.
				3730	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3731	* TODO: check the need to add encoding handling there
				3732	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3733	* Returns the new parser context or NULL
				3734	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3735	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3736	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3737	htmlParserCtxtPtr ctxt;
				3738	htmlParserInputPtr input;
				3739	/* htmlCharEncoding enc; */
				3740
				3741	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				3742	if (ctxt == NULL) {
				3743	perror("malloc");
				3744	return(NULL);
				3745	}
				3746	htmlInitParserCtxt(ctxt);
				3747	input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				3748	if (input == NULL) {
				3749	perror("malloc");
				3750	xmlFree(ctxt);
				3751	return(NULL);
				3752	}
				3753	memset(input, 0, sizeof(htmlParserInput));
				3754
				3755	input->line = 1;
				3756	input->col = 1;
				3757	input->base = cur;
				3758	input->cur = cur;
				3759
				3760	inputPush(ctxt, input);
				3761	return(ctxt);
				3762	}
				3763
				3764	/************************************************************************
				3765	* *
				3766	* Progressive parsing interfaces *
				3767	* *
				3768	************************************************************************/
				3769
				3770	/**
				3771	* htmlParseLookupSequence:
				3772	* @ctxt: an HTML parser context
				3773	* @first: the first char to lookup
				3774	* @next: the next char to lookup or zero
				3775	* @third: the next char to lookup or zero
				3776	*
				3777	* Try to find if a sequence (first, next, third) or just (first next) or
				3778	* (first) is available in the input stream.
				3779	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3780	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3781	* parser, do not use liberally.
				3782	* This is basically similar to xmlParseLookupSequence()
				3783	*
				3784	* Returns the index to the current parsing point if the full sequence
				3785	* is available, -1 otherwise.
				3786	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3787	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3788	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3789	xmlChar next, xmlChar third) {
				3790	int base, len;
				3791	htmlParserInputPtr in;
				3792	const xmlChar *buf;
				3793
				3794	in = ctxt->input;
				3795	if (in == NULL) return(-1);
				3796	base = in->cur - in->base;
				3797	if (base < 0) return(-1);
				3798	if (ctxt->checkIndex > base)
				3799	base = ctxt->checkIndex;
				3800	if (in->buf == NULL) {
				3801	buf = in->base;
				3802	len = in->length;
				3803	} else {
				3804	buf = in->buf->buffer->content;
				3805	len = in->buf->buffer->use;
				3806	}
				3807	/* take into account the sequence length */
				3808	if (third) len -= 2;
				3809	else if (next) len --;
				3810	for (;base < len;base++) {
				3811	if (buf[base] == first) {
				3812	if (third != 0) {
				3813	if ((buf[base + 1] != next) \|\|
				3814	(buf[base + 2] != third)) continue;
				3815	} else if (next != 0) {
				3816	if (buf[base + 1] != next) continue;
				3817	}
				3818	ctxt->checkIndex = 0;
				3819	#ifdef DEBUG_PUSH
				3820	if (next == 0)
				3821	xmlGenericError(xmlGenericErrorContext,
				3822	"HPP: lookup '%c' found at %d\n",
				3823	first, base);
				3824	else if (third == 0)
				3825	xmlGenericError(xmlGenericErrorContext,
				3826	"HPP: lookup '%c%c' found at %d\n",
				3827	first, next, base);
				3828	else
				3829	xmlGenericError(xmlGenericErrorContext,
				3830	"HPP: lookup '%c%c%c' found at %d\n",
				3831	first, next, third, base);
				3832	#endif
				3833	return(base - (in->cur - in->base));
				3834	}
				3835	}
				3836	ctxt->checkIndex = base;
				3837	#ifdef DEBUG_PUSH
				3838	if (next == 0)
				3839	xmlGenericError(xmlGenericErrorContext,
				3840	"HPP: lookup '%c' failed\n", first);
				3841	else if (third == 0)
				3842	xmlGenericError(xmlGenericErrorContext,
				3843	"HPP: lookup '%c%c' failed\n", first, next);
				3844	else
				3845	xmlGenericError(xmlGenericErrorContext,
				3846	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				3847	#endif
				3848	return(-1);
				3849	}
				3850
				3851	/**
				3852	* htmlParseTryOrFinish:
				3853	* @ctxt: an HTML parser context
				3854	* @terminate: last chunk indicator
				3855	*
				3856	* Try to progress on parsing
				3857	*
				3858	* Returns zero if no parsing was possible
				3859	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3860	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3861	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				3862	int ret = 0;
				3863	htmlParserInputPtr in;
				3864	int avail = 0;
				3865	xmlChar cur, next;
				3866
				3867	#ifdef DEBUG_PUSH
				3868	switch (ctxt->instate) {
				3869	case XML_PARSER_EOF:
				3870	xmlGenericError(xmlGenericErrorContext,
				3871	"HPP: try EOF\n"); break;
				3872	case XML_PARSER_START:
				3873	xmlGenericError(xmlGenericErrorContext,
				3874	"HPP: try START\n"); break;
				3875	case XML_PARSER_MISC:
				3876	xmlGenericError(xmlGenericErrorContext,
				3877	"HPP: try MISC\n");break;
				3878	case XML_PARSER_COMMENT:
				3879	xmlGenericError(xmlGenericErrorContext,
				3880	"HPP: try COMMENT\n");break;
				3881	case XML_PARSER_PROLOG:
				3882	xmlGenericError(xmlGenericErrorContext,
				3883	"HPP: try PROLOG\n");break;
				3884	case XML_PARSER_START_TAG:
				3885	xmlGenericError(xmlGenericErrorContext,
				3886	"HPP: try START_TAG\n");break;
				3887	case XML_PARSER_CONTENT:
				3888	xmlGenericError(xmlGenericErrorContext,
				3889	"HPP: try CONTENT\n");break;
				3890	case XML_PARSER_CDATA_SECTION:
				3891	xmlGenericError(xmlGenericErrorContext,
				3892	"HPP: try CDATA_SECTION\n");break;
				3893	case XML_PARSER_END_TAG:
				3894	xmlGenericError(xmlGenericErrorContext,
				3895	"HPP: try END_TAG\n");break;
				3896	case XML_PARSER_ENTITY_DECL:
				3897	xmlGenericError(xmlGenericErrorContext,
				3898	"HPP: try ENTITY_DECL\n");break;
				3899	case XML_PARSER_ENTITY_VALUE:
				3900	xmlGenericError(xmlGenericErrorContext,
				3901	"HPP: try ENTITY_VALUE\n");break;
				3902	case XML_PARSER_ATTRIBUTE_VALUE:
				3903	xmlGenericError(xmlGenericErrorContext,
				3904	"HPP: try ATTRIBUTE_VALUE\n");break;
				3905	case XML_PARSER_DTD:
				3906	xmlGenericError(xmlGenericErrorContext,
				3907	"HPP: try DTD\n");break;
				3908	case XML_PARSER_EPILOG:
				3909	xmlGenericError(xmlGenericErrorContext,
				3910	"HPP: try EPILOG\n");break;
				3911	case XML_PARSER_PI:
				3912	xmlGenericError(xmlGenericErrorContext,
				3913	"HPP: try PI\n");break;
				3914	case XML_PARSER_SYSTEM_LITERAL:
				3915	xmlGenericError(xmlGenericErrorContext,
				3916	"HPP: try SYSTEM_LITERAL\n");break;
				3917	}
				3918	#endif
				3919
				3920	while (1) {
				3921
				3922	in = ctxt->input;
				3923	if (in == NULL) break;
				3924	if (in->buf == NULL)
				3925	avail = in->length - (in->cur - in->base);
				3926	else
				3927	avail = in->buf->buffer->use - (in->cur - in->base);
				3928	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3929	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3930	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				3931	/*
				3932	* SAX: end of the document processing.
				3933	*/
				3934	ctxt->instate = XML_PARSER_EOF;
				3935	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3936	ctxt->sax->endDocument(ctxt->userData);
				3937	}
				3938	}
				3939	if (avail < 1)
				3940	goto done;
				3941	switch (ctxt->instate) {
				3942	case XML_PARSER_EOF:
				3943	/*
				3944	* Document parsing is done !
				3945	*/
				3946	goto done;
				3947	case XML_PARSER_START:
				3948	/*
				3949	* Very first chars read from the document flow.
				3950	*/
				3951	cur = in->cur[0];
				3952	if (IS_BLANK(cur)) {
				3953	SKIP_BLANKS;
				3954	if (in->buf == NULL)
				3955	avail = in->length - (in->cur - in->base);
				3956	else
				3957	avail = in->buf->buffer->use - (in->cur - in->base);
				3958	}
				3959	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3960	ctxt->sax->setDocumentLocator(ctxt->userData,
				3961	&xmlDefaultSAXLocator);
				3962	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				3963	(!ctxt->disableSAX))
				3964	ctxt->sax->startDocument(ctxt->userData);
				3965
				3966	cur = in->cur[0];
				3967	next = in->cur[1];
				3968	if ((cur == '<') && (next == '!') &&
				3969	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3970	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3971	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3972	(UPP(8) == 'E')) {
				3973	if ((!terminate) &&
				3974	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				3975	goto done;
				3976	#ifdef DEBUG_PUSH
				3977	xmlGenericError(xmlGenericErrorContext,
				3978	"HPP: Parsing internal subset\n");
				3979	#endif
				3980	htmlParseDocTypeDecl(ctxt);
				3981	ctxt->instate = XML_PARSER_PROLOG;
				3982	#ifdef DEBUG_PUSH
				3983	xmlGenericError(xmlGenericErrorContext,
				3984	"HPP: entering PROLOG\n");
				3985	#endif
				3986	} else {
				3987	ctxt->instate = XML_PARSER_MISC;
				3988	}
				3989	#ifdef DEBUG_PUSH
				3990	xmlGenericError(xmlGenericErrorContext,
				3991	"HPP: entering MISC\n");
				3992	#endif
				3993	break;
				3994	case XML_PARSER_MISC:
				3995	SKIP_BLANKS;
				3996	if (in->buf == NULL)
				3997	avail = in->length - (in->cur - in->base);
				3998	else
				3999	avail = in->buf->buffer->use - (in->cur - in->base);
				4000	if (avail < 2)
				4001	goto done;
				4002	cur = in->cur[0];
				4003	next = in->cur[1];
				4004	if ((cur == '<') && (next == '!') &&
				4005	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4006	if ((!terminate) &&
				4007	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4008	goto done;
				4009	#ifdef DEBUG_PUSH
				4010	xmlGenericError(xmlGenericErrorContext,
				4011	"HPP: Parsing Comment\n");
				4012	#endif
				4013	htmlParseComment(ctxt);
				4014	ctxt->instate = XML_PARSER_MISC;
				4015	} else if ((cur == '<') && (next == '!') &&
				4016	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4017	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4018	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4019	(UPP(8) == 'E')) {
				4020	if ((!terminate) &&
				4021	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4022	goto done;
				4023	#ifdef DEBUG_PUSH
				4024	xmlGenericError(xmlGenericErrorContext,
				4025	"HPP: Parsing internal subset\n");
				4026	#endif
				4027	htmlParseDocTypeDecl(ctxt);
				4028	ctxt->instate = XML_PARSER_PROLOG;
				4029	#ifdef DEBUG_PUSH
				4030	xmlGenericError(xmlGenericErrorContext,
				4031	"HPP: entering PROLOG\n");
				4032	#endif
				4033	} else if ((cur == '<') && (next == '!') &&
				4034	(avail < 9)) {
				4035	goto done;
				4036	} else {
				4037	ctxt->instate = XML_PARSER_START_TAG;
				4038	#ifdef DEBUG_PUSH
				4039	xmlGenericError(xmlGenericErrorContext,
				4040	"HPP: entering START_TAG\n");
				4041	#endif
				4042	}
				4043	break;
				4044	case XML_PARSER_PROLOG:
				4045	SKIP_BLANKS;
				4046	if (in->buf == NULL)
				4047	avail = in->length - (in->cur - in->base);
				4048	else
				4049	avail = in->buf->buffer->use - (in->cur - in->base);
				4050	if (avail < 2)
				4051	goto done;
				4052	cur = in->cur[0];
				4053	next = in->cur[1];
				4054	if ((cur == '<') && (next == '!') &&
				4055	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4056	if ((!terminate) &&
				4057	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4058	goto done;
				4059	#ifdef DEBUG_PUSH
				4060	xmlGenericError(xmlGenericErrorContext,
				4061	"HPP: Parsing Comment\n");
				4062	#endif
				4063	htmlParseComment(ctxt);
				4064	ctxt->instate = XML_PARSER_PROLOG;
				4065	} else if ((cur == '<') && (next == '!') &&
				4066	(avail < 4)) {
				4067	goto done;
				4068	} else {
				4069	ctxt->instate = XML_PARSER_START_TAG;
				4070	#ifdef DEBUG_PUSH
				4071	xmlGenericError(xmlGenericErrorContext,
				4072	"HPP: entering START_TAG\n");
				4073	#endif
				4074	}
				4075	break;
				4076	case XML_PARSER_EPILOG:
				4077	if (in->buf == NULL)
				4078	avail = in->length - (in->cur - in->base);
				4079	else
				4080	avail = in->buf->buffer->use - (in->cur - in->base);
				4081	if (avail < 1)
				4082	goto done;
				4083	cur = in->cur[0];
				4084	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4085	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4086	goto done;
				4087	}
				4088	if (avail < 2)
				4089	goto done;
				4090	next = in->cur[1];
				4091	if ((cur == '<') && (next == '!') &&
				4092	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4093	if ((!terminate) &&
				4094	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4095	goto done;
				4096	#ifdef DEBUG_PUSH
				4097	xmlGenericError(xmlGenericErrorContext,
				4098	"HPP: Parsing Comment\n");
				4099	#endif
				4100	htmlParseComment(ctxt);
				4101	ctxt->instate = XML_PARSER_EPILOG;
				4102	} else if ((cur == '<') && (next == '!') &&
				4103	(avail < 4)) {
				4104	goto done;
				4105	} else {
				4106	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4107	ctxt->wellFormed = 0;
				4108	ctxt->instate = XML_PARSER_EOF;
				4109	#ifdef DEBUG_PUSH
				4110	xmlGenericError(xmlGenericErrorContext,
				4111	"HPP: entering EOF\n");
				4112	#endif
				4113	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4114	ctxt->sax->endDocument(ctxt->userData);
				4115	goto done;
				4116	}
				4117	break;
				4118	case XML_PARSER_START_TAG: {
				4119	xmlChar name, oldname;
				4120	int depth = ctxt->nameNr;
				4121	htmlElemDescPtr info;
				4122
				4123	if (avail < 2)
				4124	goto done;
				4125	cur = in->cur[0];
				4126	if (cur != '<') {
				4127	ctxt->instate = XML_PARSER_CONTENT;
				4128	#ifdef DEBUG_PUSH
				4129	xmlGenericError(xmlGenericErrorContext,
				4130	"HPP: entering CONTENT\n");
				4131	#endif
				4132	break;
				4133	}
				4134	if ((!terminate) &&
				4135	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4136	goto done;
				4137
				4138	oldname = xmlStrdup(ctxt->name);
				4139	htmlParseStartTag(ctxt);
				4140	name = ctxt->name;
				4141	#ifdef DEBUG
				4142	if (oldname == NULL)
				4143	xmlGenericError(xmlGenericErrorContext,
				4144	"Start of element %s\n", name);
				4145	else if (name == NULL)
				4146	xmlGenericError(xmlGenericErrorContext,
				4147	"Start of element failed, was %s\n",
				4148	oldname);
				4149	else
				4150	xmlGenericError(xmlGenericErrorContext,
				4151	"Start of element %s, was %s\n",
				4152	name, oldname);
				4153	#endif
				4154	if (((depth == ctxt->nameNr) &&
				4155	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4156	(name == NULL)) {
				4157	if (CUR == '>')
				4158	NEXT;
				4159	if (oldname != NULL)
				4160	xmlFree(oldname);
				4161	break;
				4162	}
				4163	if (oldname != NULL)
				4164	xmlFree(oldname);
				4165
				4166	/*
				4167	* Lookup the info for that element.
				4168	*/
				4169	info = htmlTagLookup(name);
				4170	if (info == NULL) {
				4171	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4172	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4173	name);
				4174	ctxt->wellFormed = 0;
				4175	} else if (info->depr) {
				4176	/***************************
				4177	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4178	ctxt->sax->warning(ctxt->userData,
				4179	"Tag %s is deprecated\n",
				4180	name);
				4181	***************************/
				4182	}
				4183
				4184	/*
				4185	* Check for an Empty Element labelled the XML/SGML way
				4186	*/
				4187	if ((CUR == '/') && (NXT(1) == '>')) {
				4188	SKIP(2);
				4189	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4190	ctxt->sax->endElement(ctxt->userData, name);
				4191	oldname = htmlnamePop(ctxt);
				4192	#ifdef DEBUG
				4193	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4194	oldname);
				4195	#endif
				4196	if (oldname != NULL)
				4197	xmlFree(oldname);
				4198	ctxt->instate = XML_PARSER_CONTENT;
				4199	#ifdef DEBUG_PUSH
				4200	xmlGenericError(xmlGenericErrorContext,
				4201	"HPP: entering CONTENT\n");
				4202	#endif
				4203	break;
				4204	}
				4205
				4206	if (CUR == '>') {
				4207	NEXT;
				4208	} else {
				4209	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4210	ctxt->sax->error(ctxt->userData,
				4211	"Couldn't find end of Start Tag %s\n",
				4212	name);
				4213	ctxt->wellFormed = 0;
				4214
				4215	/*
				4216	* end of parsing of this node.
				4217	*/
				4218	if (xmlStrEqual(name, ctxt->name)) {
				4219	nodePop(ctxt);
				4220	oldname = htmlnamePop(ctxt);
				4221	#ifdef DEBUG
				4222	xmlGenericError(xmlGenericErrorContext,
				4223	"End of start tag problem: popping out %s\n", oldname);
				4224	#endif
				4225	if (oldname != NULL)
				4226	xmlFree(oldname);
				4227	}
				4228
				4229	ctxt->instate = XML_PARSER_CONTENT;
				4230	#ifdef DEBUG_PUSH
				4231	xmlGenericError(xmlGenericErrorContext,
				4232	"HPP: entering CONTENT\n");
				4233	#endif
				4234	break;
				4235	}
				4236
				4237	/*
				4238	* Check for an Empty Element from DTD definition
				4239	*/
				4240	if ((info != NULL) && (info->empty)) {
				4241	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4242	ctxt->sax->endElement(ctxt->userData, name);
				4243	oldname = htmlnamePop(ctxt);
				4244	#ifdef DEBUG
				4245	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4246	#endif
				4247	if (oldname != NULL)
				4248	xmlFree(oldname);
				4249	}
				4250	ctxt->instate = XML_PARSER_CONTENT;
				4251	#ifdef DEBUG_PUSH
				4252	xmlGenericError(xmlGenericErrorContext,
				4253	"HPP: entering CONTENT\n");
				4254	#endif
				4255	break;
				4256	}
				4257	case XML_PARSER_CONTENT: {
				4258	long cons;
				4259	/*
				4260	* Handle preparsed entities and charRef
				4261	*/
				4262	if (ctxt->token != 0) {
				4263	xmlChar chr[2] = { 0 , 0 } ;
				4264
				4265	chr[0] = (xmlChar) ctxt->token;
				4266	htmlCheckParagraph(ctxt);
				4267	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4268	ctxt->sax->characters(ctxt->userData, chr, 1);
				4269	ctxt->token = 0;
				4270	ctxt->checkIndex = 0;
				4271	}
				4272	if ((avail == 1) && (terminate)) {
				4273	cur = in->cur[0];
				4274	if ((cur != '<') && (cur != '&')) {
				4275	if (ctxt->sax != NULL) {
				4276	if (IS_BLANK(cur)) {
				4277	if (ctxt->sax->ignorableWhitespace != NULL)
				4278	ctxt->sax->ignorableWhitespace(
				4279	ctxt->userData, &cur, 1);
				4280	} else {
				4281	htmlCheckParagraph(ctxt);
				4282	if (ctxt->sax->characters != NULL)
				4283	ctxt->sax->characters(
				4284	ctxt->userData, &cur, 1);
				4285	}
				4286	}
				4287	ctxt->token = 0;
				4288	ctxt->checkIndex = 0;
				4289	NEXT;
				4290	}
				4291	break;
				4292	}
				4293	if (avail < 2)
				4294	goto done;
				4295	cur = in->cur[0];
				4296	next = in->cur[1];
				4297	cons = ctxt->nbChars;
				4298	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4299	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4300	/*
				4301	* Handle SCRIPT/STYLE separately
				4302	*/
				4303	if ((!terminate) &&
				4304	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4305	goto done;
				4306	htmlParseScript(ctxt);
				4307	if ((cur == '<') && (next == '/')) {
				4308	ctxt->instate = XML_PARSER_END_TAG;
				4309	ctxt->checkIndex = 0;
				4310	#ifdef DEBUG_PUSH
				4311	xmlGenericError(xmlGenericErrorContext,
				4312	"HPP: entering END_TAG\n");
				4313	#endif
				4314	break;
				4315	}
				4316	} else {
				4317	/*
				4318	* Sometimes DOCTYPE arrives in the middle of the document
				4319	*/
				4320	if ((cur == '<') && (next == '!') &&
				4321	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4322	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4323	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4324	(UPP(8) == 'E')) {
				4325	if ((!terminate) &&
				4326	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4327	goto done;
				4328	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4329	ctxt->sax->error(ctxt->userData,
				4330	"Misplaced DOCTYPE declaration\n");
				4331	ctxt->wellFormed = 0;
				4332	htmlParseDocTypeDecl(ctxt);
				4333	} else if ((cur == '<') && (next == '!') &&
				4334	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4335	if ((!terminate) &&
				4336	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4337	goto done;
				4338	#ifdef DEBUG_PUSH
				4339	xmlGenericError(xmlGenericErrorContext,
				4340	"HPP: Parsing Comment\n");
				4341	#endif
				4342	htmlParseComment(ctxt);
				4343	ctxt->instate = XML_PARSER_CONTENT;
				4344	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4345	goto done;
				4346	} else if ((cur == '<') && (next == '/')) {
				4347	ctxt->instate = XML_PARSER_END_TAG;
				4348	ctxt->checkIndex = 0;
				4349	#ifdef DEBUG_PUSH
				4350	xmlGenericError(xmlGenericErrorContext,
				4351	"HPP: entering END_TAG\n");
				4352	#endif
				4353	break;
				4354	} else if (cur == '<') {
				4355	ctxt->instate = XML_PARSER_START_TAG;
				4356	ctxt->checkIndex = 0;
				4357	#ifdef DEBUG_PUSH
				4358	xmlGenericError(xmlGenericErrorContext,
				4359	"HPP: entering START_TAG\n");
				4360	#endif
				4361	break;
				4362	} else if (cur == '&') {
				4363	if ((!terminate) &&
				4364	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4365	goto done;
				4366	#ifdef DEBUG_PUSH
				4367	xmlGenericError(xmlGenericErrorContext,
				4368	"HPP: Parsing Reference\n");
				4369	#endif
				4370	/* TODO: check generation of subtrees if noent !!! */
				4371	htmlParseReference(ctxt);
				4372	} else {
				4373	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4374	/*
				4375	* Goal of the following test is :
				4376	* - minimize calls to the SAX 'character' callback
				4377	* when they are mergeable
				4378	*/
				4379	if ((ctxt->inputNr == 1) &&
				4380	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4381	if ((!terminate) &&
				4382	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4383	goto done;
				4384	}
				4385	ctxt->checkIndex = 0;
				4386	#ifdef DEBUG_PUSH
				4387	xmlGenericError(xmlGenericErrorContext,
				4388	"HPP: Parsing char data\n");
				4389	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4390	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4391	}
				4392	}
				4393	if (cons == ctxt->nbChars) {
				4394	if (ctxt->node != NULL) {
				4395	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4396	ctxt->sax->error(ctxt->userData,
				4397	"detected an error in element content\n");
				4398	ctxt->wellFormed = 0;
				4399	}
				4400	NEXT;
				4401	break;
				4402	}
				4403
				4404	break;
				4405	}
				4406	case XML_PARSER_END_TAG:
				4407	if (avail < 2)
				4408	goto done;
				4409	if ((!terminate) &&
				4410	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4411	goto done;
				4412	htmlParseEndTag(ctxt);
				4413	if (ctxt->nameNr == 0) {
				4414	ctxt->instate = XML_PARSER_EPILOG;
				4415	} else {
				4416	ctxt->instate = XML_PARSER_CONTENT;
				4417	}
				4418	ctxt->checkIndex = 0;
				4419	#ifdef DEBUG_PUSH
				4420	xmlGenericError(xmlGenericErrorContext,
				4421	"HPP: entering CONTENT\n");
				4422	#endif
				4423	break;
				4424	case XML_PARSER_CDATA_SECTION:
				4425	xmlGenericError(xmlGenericErrorContext,
				4426	"HPP: internal error, state == CDATA\n");
				4427	ctxt->instate = XML_PARSER_CONTENT;
				4428	ctxt->checkIndex = 0;
				4429	#ifdef DEBUG_PUSH
				4430	xmlGenericError(xmlGenericErrorContext,
				4431	"HPP: entering CONTENT\n");
				4432	#endif
				4433	break;
				4434	case XML_PARSER_DTD:
				4435	xmlGenericError(xmlGenericErrorContext,
				4436	"HPP: internal error, state == DTD\n");
				4437	ctxt->instate = XML_PARSER_CONTENT;
				4438	ctxt->checkIndex = 0;
				4439	#ifdef DEBUG_PUSH
				4440	xmlGenericError(xmlGenericErrorContext,
				4441	"HPP: entering CONTENT\n");
				4442	#endif
				4443	break;
				4444	case XML_PARSER_COMMENT:
				4445	xmlGenericError(xmlGenericErrorContext,
				4446	"HPP: internal error, state == COMMENT\n");
				4447	ctxt->instate = XML_PARSER_CONTENT;
				4448	ctxt->checkIndex = 0;
				4449	#ifdef DEBUG_PUSH
				4450	xmlGenericError(xmlGenericErrorContext,
				4451	"HPP: entering CONTENT\n");
				4452	#endif
				4453	break;
				4454	case XML_PARSER_PI:
				4455	xmlGenericError(xmlGenericErrorContext,
				4456	"HPP: internal error, state == PI\n");
				4457	ctxt->instate = XML_PARSER_CONTENT;
				4458	ctxt->checkIndex = 0;
				4459	#ifdef DEBUG_PUSH
				4460	xmlGenericError(xmlGenericErrorContext,
				4461	"HPP: entering CONTENT\n");
				4462	#endif
				4463	break;
				4464	case XML_PARSER_ENTITY_DECL:
				4465	xmlGenericError(xmlGenericErrorContext,
				4466	"HPP: internal error, state == ENTITY_DECL\n");
				4467	ctxt->instate = XML_PARSER_CONTENT;
				4468	ctxt->checkIndex = 0;
				4469	#ifdef DEBUG_PUSH
				4470	xmlGenericError(xmlGenericErrorContext,
				4471	"HPP: entering CONTENT\n");
				4472	#endif
				4473	break;
				4474	case XML_PARSER_ENTITY_VALUE:
				4475	xmlGenericError(xmlGenericErrorContext,
				4476	"HPP: internal error, state == ENTITY_VALUE\n");
				4477	ctxt->instate = XML_PARSER_CONTENT;
				4478	ctxt->checkIndex = 0;
				4479	#ifdef DEBUG_PUSH
				4480	xmlGenericError(xmlGenericErrorContext,
				4481	"HPP: entering DTD\n");
				4482	#endif
				4483	break;
				4484	case XML_PARSER_ATTRIBUTE_VALUE:
				4485	xmlGenericError(xmlGenericErrorContext,
				4486	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4487	ctxt->instate = XML_PARSER_START_TAG;
				4488	ctxt->checkIndex = 0;
				4489	#ifdef DEBUG_PUSH
				4490	xmlGenericError(xmlGenericErrorContext,
				4491	"HPP: entering START_TAG\n");
				4492	#endif
				4493	break;
				4494	case XML_PARSER_SYSTEM_LITERAL:
				4495	xmlGenericError(xmlGenericErrorContext,
				4496	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4497	ctxt->instate = XML_PARSER_CONTENT;
				4498	ctxt->checkIndex = 0;
				4499	#ifdef DEBUG_PUSH
				4500	xmlGenericError(xmlGenericErrorContext,
				4501	"HPP: entering CONTENT\n");
				4502	#endif
				4503	break;
				4504	case XML_PARSER_IGNORE:
				4505	xmlGenericError(xmlGenericErrorContext,
				4506	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4507	ctxt->instate = XML_PARSER_CONTENT;
				4508	ctxt->checkIndex = 0;
				4509	#ifdef DEBUG_PUSH
				4510	xmlGenericError(xmlGenericErrorContext,
				4511	"HPP: entering CONTENT\n");
				4512	#endif
				4513	break;
				4514	}
				4515	}
				4516	done:
				4517	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4518	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4519	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4520	/*
				4521	* SAX: end of the document processing.
				4522	*/
				4523	ctxt->instate = XML_PARSER_EOF;
				4524	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4525	ctxt->sax->endDocument(ctxt->userData);
				4526	}
				4527	}
				4528	if ((ctxt->myDoc != NULL) &&
				4529	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4530	(ctxt->instate == XML_PARSER_EPILOG))) {
				4531	xmlDtdPtr dtd;
				4532	dtd = xmlGetIntSubset(ctxt->myDoc);
				4533	if (dtd == NULL)
				4534	ctxt->myDoc->intSubset =
				4535	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4536	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4537	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4538	}
				4539	#ifdef DEBUG_PUSH
				4540	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4541	#endif
				4542	return(ret);
				4543	}
				4544
				4545	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4546	* htmlParseChunk:
				4547	* @ctxt: an XML parser context
				4548	* @chunk: an char array
				4549	* @size: the size in byte of the chunk
				4550	* @terminate: last chunk indicator
				4551	*
				4552	* Parse a Chunk of memory
				4553	*
				4554	* Returns zero if no error, the xmlParserErrors otherwise.
				4555	*/
				4556	int
				4557	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4558	int terminate) {
				4559	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4560	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4561	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4562	int cur = ctxt->input->cur - ctxt->input->base;
				4563
				4564	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4565	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4566	ctxt->input->cur = ctxt->input->base + cur;
				4567	#ifdef DEBUG_PUSH
				4568	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4569	#endif
				4570
				4571	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4572	htmlParseTryOrFinish(ctxt, terminate);
				4573	} else if (ctxt->instate != XML_PARSER_EOF) {
				4574	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4575	htmlParseTryOrFinish(ctxt, terminate);
				4576	}
				4577	if (terminate) {
				4578	if ((ctxt->instate != XML_PARSER_EOF) &&
				4579	(ctxt->instate != XML_PARSER_EPILOG) &&
				4580	(ctxt->instate != XML_PARSER_MISC)) {
				4581	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4582	ctxt->wellFormed = 0;
				4583	}
				4584	if (ctxt->instate != XML_PARSER_EOF) {
				4585	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4586	ctxt->sax->endDocument(ctxt->userData);
				4587	}
				4588	ctxt->instate = XML_PARSER_EOF;
				4589	}
				4590	return((xmlParserErrors) ctxt->errNo);
				4591	}
				4592
				4593	/************************************************************************
				4594	* *
				4595	* User entry points *
				4596	* *
				4597	************************************************************************/
				4598
				4599	/**
				4600	* htmlCreatePushParserCtxt :
				4601	* @sax: a SAX handler
				4602	* @user_data: The user data returned on SAX callbacks
				4603	* @chunk: a pointer to an array of chars
				4604	* @size: number of chars in the array
				4605	* @filename: an optional file name or URI
				4606	* @enc: an optional encoding
				4607	*
				4608	* Create a parser context for using the HTML parser in push mode
				4609	* To allow content encoding detection, @size should be >= 4
				4610	* The value of @filename is used for fetching external entities
				4611	* and error/warning reports.
				4612	*
				4613	* Returns the new parser context or NULL
				4614	*/
				4615	htmlParserCtxtPtr
				4616	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4617	const char chunk, int size, const char filename,
				4618	xmlCharEncoding enc) {
				4619	htmlParserCtxtPtr ctxt;
				4620	htmlParserInputPtr inputStream;
				4621	xmlParserInputBufferPtr buf;
				4622
				4623	buf = xmlAllocParserInputBuffer(enc);
				4624	if (buf == NULL) return(NULL);
				4625
				4626	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4627	if (ctxt == NULL) {
				4628	xmlFree(buf);
				4629	return(NULL);
				4630	}
				4631	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4632	htmlInitParserCtxt(ctxt);
				4633	if (sax != NULL) {
				4634	if (ctxt->sax != &htmlDefaultSAXHandler)
				4635	xmlFree(ctxt->sax);
				4636	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4637	if (ctxt->sax == NULL) {
				4638	xmlFree(buf);
				4639	xmlFree(ctxt);
				4640	return(NULL);
				4641	}
				4642	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4643	if (user_data != NULL)
				4644	ctxt->userData = user_data;
				4645	}
				4646	if (filename == NULL) {
				4647	ctxt->directory = NULL;
				4648	} else {
				4649	ctxt->directory = xmlParserGetDirectory(filename);
				4650	}
				4651
				4652	inputStream = htmlNewInputStream(ctxt);
				4653	if (inputStream == NULL) {
				4654	xmlFreeParserCtxt(ctxt);
				4655	return(NULL);
				4656	}
				4657
				4658	if (filename == NULL)
				4659	inputStream->filename = NULL;
				4660	else
				4661	inputStream->filename = xmlMemStrdup(filename);
				4662	inputStream->buf = buf;
				4663	inputStream->base = inputStream->buf->buffer->content;
				4664	inputStream->cur = inputStream->buf->buffer->content;
				4665
				4666	inputPush(ctxt, inputStream);
				4667
				4668	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4669	(ctxt->input->buf != NULL)) {
				4670	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4671	#ifdef DEBUG_PUSH
				4672	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4673	#endif
				4674	}
				4675
				4676	return(ctxt);
				4677	}
				4678
				4679	/**
				4680	* htmlSAXParseDoc :
				4681	* @cur: a pointer to an array of xmlChar
				4682	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4683	* @sax: the SAX handler block
				4684	* @userData: if using SAX, this pointer will be provided on callbacks.
				4685	*
				4686	* parse an HTML in-memory document and build a tree.
				4687	* It use the given SAX function block to handle the parsing callback.
				4688	* If sax is NULL, fallback to the default DOM tree building routines.
				4689	*
				4690	* Returns the resulting document tree
				4691	*/
				4692
				4693	htmlDocPtr
				4694	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4695	htmlDocPtr ret;
				4696	htmlParserCtxtPtr ctxt;
				4697
				4698	if (cur == NULL) return(NULL);
				4699
				4700
				4701	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4702	if (ctxt == NULL) return(NULL);
				4703	if (sax != NULL) {
				4704	ctxt->sax = sax;
				4705	ctxt->userData = userData;
				4706	}
				4707
				4708	htmlParseDocument(ctxt);
				4709	ret = ctxt->myDoc;
				4710	if (sax != NULL) {
				4711	ctxt->sax = NULL;
				4712	ctxt->userData = NULL;
				4713	}
				4714	htmlFreeParserCtxt(ctxt);
				4715
				4716	return(ret);
				4717	}
				4718
				4719	/**
				4720	* htmlParseDoc :
				4721	* @cur: a pointer to an array of xmlChar
				4722	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4723	*
				4724	* parse an HTML in-memory document and build a tree.
				4725	*
				4726	* Returns the resulting document tree
				4727	*/
				4728
				4729	htmlDocPtr
				4730	htmlParseDoc(xmlChar cur, const char encoding) {
				4731	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4732	}
				4733
				4734
				4735	/**
				4736	* htmlCreateFileParserCtxt :
				4737	* @filename: the filename
				4738	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4739	*
				4740	* Create a parser context for a file content.
				4741	* Automatic support for ZLIB/Compress compressed document is provided
				4742	* by default if found at compile-time.
				4743	*
				4744	* Returns the new parser context or NULL
				4745	*/
				4746	htmlParserCtxtPtr
				4747	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4748	{
				4749	htmlParserCtxtPtr ctxt;
				4750	htmlParserInputPtr inputStream;
				4751	xmlParserInputBufferPtr buf;
				4752	/* htmlCharEncoding enc; */
				4753	xmlChar content, content_line = (xmlChar *) "charset=";
				4754
				4755	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4756	if (buf == NULL) return(NULL);
				4757
				4758	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4759	if (ctxt == NULL) {
				4760	perror("malloc");
				4761	return(NULL);
				4762	}
				4763	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4764	htmlInitParserCtxt(ctxt);
				4765	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4766	if (inputStream == NULL) {
				4767	perror("malloc");
				4768	xmlFree(ctxt);
				4769	return(NULL);
				4770	}
				4771	memset(inputStream, 0, sizeof(htmlParserInput));
				4772
				4773	inputStream->filename = xmlMemStrdup(filename);
				4774	inputStream->line = 1;
				4775	inputStream->col = 1;
				4776	inputStream->buf = buf;
				4777	inputStream->directory = NULL;
				4778
				4779	inputStream->base = inputStream->buf->buffer->content;
				4780	inputStream->cur = inputStream->buf->buffer->content;
				4781	inputStream->free = NULL;
				4782
				4783	inputPush(ctxt, inputStream);
				4784
				4785	/* set encoding */
				4786	if (encoding) {
				4787	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4788	if (content) {
				4789	strcpy ((char )content, (char )content_line);
				4790	strcat ((char )content, (char )encoding);
				4791	htmlCheckEncoding (ctxt, content);
				4792	xmlFree (content);
				4793	}
				4794	}
				4795
				4796	return(ctxt);
				4797	}
				4798
				4799	/**
				4800	* htmlSAXParseFile :
				4801	* @filename: the filename
				4802	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4803	* @sax: the SAX handler block
				4804	* @userData: if using SAX, this pointer will be provided on callbacks.
				4805	*
				4806	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4807	* compressed document is provided by default if found at compile-time.
				4808	* It use the given SAX function block to handle the parsing callback.
				4809	* If sax is NULL, fallback to the default DOM tree building routines.
				4810	*
				4811	* Returns the resulting document tree
				4812	*/
				4813
				4814	htmlDocPtr
				4815	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				4816	void *userData) {
				4817	htmlDocPtr ret;
				4818	htmlParserCtxtPtr ctxt;
				4819	htmlSAXHandlerPtr oldsax = NULL;
				4820
				4821	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				4822	if (ctxt == NULL) return(NULL);
				4823	if (sax != NULL) {
				4824	oldsax = ctxt->sax;
				4825	ctxt->sax = sax;
				4826	ctxt->userData = userData;
				4827	}
				4828
				4829	htmlParseDocument(ctxt);
				4830
				4831	ret = ctxt->myDoc;
				4832	if (sax != NULL) {
				4833	ctxt->sax = oldsax;
				4834	ctxt->userData = NULL;
				4835	}
				4836	htmlFreeParserCtxt(ctxt);
				4837
				4838	return(ret);
				4839	}
				4840
				4841	/**
				4842	* htmlParseFile :
				4843	* @filename: the filename
				4844	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4845	*
				4846	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4847	* compressed document is provided by default if found at compile-time.
				4848	*
				4849	* Returns the resulting document tree
				4850	*/
				4851
				4852	htmlDocPtr
				4853	htmlParseFile(const char filename, const char encoding) {
				4854	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				4855	}
				4856
				4857	/**
				4858	* htmlHandleOmittedElem:
				4859	* @val: int 0 or 1
				4860	*
				4861	* Set and return the previous value for handling HTML omitted tags.
				4862	*
				4863	* Returns the last value for 0 for no handling, 1 for auto insertion.
				4864	*/
				4865
				4866	int
				4867	htmlHandleOmittedElem(int val) {
				4868	int old = htmlOmittedDefaultValue;
				4869
				4870	htmlOmittedDefaultValue = val;
				4871	return(old);
				4872	}
				4873
				4874	#endif /* LIBXML_HTML_ENABLED */