Blame - HTMLparser.c - fp2-dev/platform/external/libxml2

blob: af9281b5d84f67154180805823b54855b597b831 [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
Daniel Veillard	c5d6434	2001-06-24 12:13:24 +0000	[diff] [blame]	6	* daniel@veillard.com
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	7	*/
				8
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	9	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	10	#ifdef LIBXML_HTML_ENABLED
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	11
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	12	#include <string.h>
				13	#ifdef HAVE_CTYPE_H
				14	#include <ctype.h>
				15	#endif
				16	#ifdef HAVE_STDLIB_H
				17	#include <stdlib.h>
				18	#endif
				19	#ifdef HAVE_SYS_STAT_H
				20	#include <sys/stat.h>
				21	#endif
				22	#ifdef HAVE_FCNTL_H
				23	#include <fcntl.h>
				24	#endif
				25	#ifdef HAVE_UNISTD_H
				26	#include <unistd.h>
				27	#endif
				28	#ifdef HAVE_ZLIB_H
				29	#include <zlib.h>
				30	#endif
				31
				32	#include <libxml/xmlmemory.h>
				33	#include <libxml/tree.h>
				34	#include <libxml/parser.h>
				35	#include <libxml/parserInternals.h>
				36	#include <libxml/xmlerror.h>
				37	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	38	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	39	#include <libxml/entities.h>
				40	#include <libxml/encoding.h>
				41	#include <libxml/valid.h>
				42	#include <libxml/xmlIO.h>
				43
/*
 * Size constants.
 * NOTE(review): none of them is referenced in the part of the file reviewed
 * here; judging by the names they bound name lengths and scratch buffer
 * sizes -- confirm at the use sites.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/*
 * Compile-time trace switches, disabled by default. DEBUG guards the
 * xmlGenericError() traces of the auto-close functions in this file.
 * NOTE(review): the code guarded by DEBUG_PUSH is not visible here.
 */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * Global flag, initialised to 1.
 * NOTE(review): its readers are outside the reviewed part of the file.
 */
int htmlOmittedDefaultValue = 1;

/* Forward declaration; the definition is not in the reviewed part of the file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
			     xmlChar end, xmlChar end2, xmlChar end3);
				55
				56	/************************************************************************
				57	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	58	* Parser stacks related functions and macros *
				59	* *
				60	************************************************************************/
				61
				62	/*
				63	* Generic function for accessing stacks in the Parser Context
				64	*/
				65
				66	#define PUSH_AND_POP(scope, type, name) \
				67	scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				68	if (ctxt->name##Nr >= ctxt->name##Max) { \
				69	ctxt->name##Max *= 2; \
				70	ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
				71	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				72	if (ctxt->name##Tab == NULL) { \
				73	xmlGenericError(xmlGenericErrorContext, \
				74	"realloc failed !\n"); \
				75	return(0); \
				76	} \
				77	} \
				78	ctxt->name##Tab[ctxt->name##Nr] = value; \
				79	ctxt->name = value; \
				80	return(ctxt->name##Nr++); \
				81	} \
				82	scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				83	type ret; \
				84	if (ctxt->name##Nr < 0) return(0); \
				85	ctxt->name##Nr--; \
				86	if (ctxt->name##Nr < 0) return(0); \
				87	if (ctxt->name##Nr > 0) \
				88	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				89	else \
				90	ctxt->name = NULL; \
				91	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				92	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				93	return(ret); \
				94	} \
				95
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	96	/* PUSH_AND_POP(static, xmlNodePtr, node) */
				97	PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	98
				99	/*
				100	* Macros for accessing the content. Those should be used only by the parser,
				101	* and not exported.
				102	*
				103	* Dirty macros, i.e. one need to make assumption on the context to use them
				104	*
				105	* CUR_PTR return the current pointer to the xmlChar to be parsed.
				106	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
				107	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				108	* in UNICODE mode. This should be used internally by the parser
				109	* only to compare to ASCII values otherwise it would break when
				110	* running with UTF-8 encoding.
				111	* NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only
				112	* to compare on ASCII based substring.
				113	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
				114	* it should be used only to compare on ASCII based substring.
				115	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				116	* strings within the parser.
				117	*
				118	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				119	*
				120	* CURRENT Returns the current char value, with the full decoding of
				121	* UTF-8 if we are using this mode. It returns an int.
				122	* NEXT Skip to the next character, this does the proper decoding
				123	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
				124	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				125	*/
				126
/*
 * All macros below expand to expressions on a variable named ctxt that must
 * be in scope at the point of use.
 */

/* Current input byte passed through toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input by val bytes and add val to ctxt->nbChars. */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)

/* Input byte located val positions after the current one. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), passed through toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Pointer to the current position in the input. */
#define CUR_PTR ctxt->input->cur

/* Calls xmlParserInputShrink() on the current input. */
#define SHRINK xmlParserInputShrink(ctxt->input)

/* Calls xmlParserInputGrow() on the current input, asking for INPUT_CHUNK. */
#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int; no multi-byte decoding is done here. */
#define CURRENT ((int) (*ctxt->input->cur))

/* Skips blanks through htmlSkipBlankChars(); yields the number skipped. */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				144
/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
/* Current input byte as an int (ctxt->token is not consulted). */
#define CUR ((int) (*ctxt->input->cur))
/* Advance one character through xmlNextChar() and count it in nbChars. */
#define NEXT xmlNextChar(ctxt),ctxt->nbChars++

/* Current input byte, or -1 while ctxt->token is non-zero. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
/* Identical re-definitions of the NXT and CUR_PTR macros defined earlier. */
#define NXT(val) ctxt->input->cur[(val)]
#define CUR_PTR ctxt->input->cur


/*
 * Advance the input by l bytes, counted as a single character in nbChars.
 * The line/column bookkeeping looks at the byte being left: a '\n' starts
 * a new line at column 1, anything else moves one column right.
 * ctxt->token is cleared as well.
 */
#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
	ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
  } while (0)
				162
				163	/************
				164	\
				165	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				166	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				167	************/
				168
/* Current character via htmlCurrentChar(); its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
/* Current character of string s via xmlStringCurrentChar(); length in l. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append character v (l bytes long) to buffer b at index i and advance i:
 * a single byte is stored directly, longer ones go through xmlCopyChar().
 * The expansion is a bare if/else statement, so do not use it as the
 * unbraced body of another if.
 */
#define COPY_BUF(l,b,i,v) \
    if (l == 1) b[i++] = (xmlChar) v; \
    else i += xmlCopyChar(l,&b[i],v)
				175
				176	/**
				177	* htmlCurrentChar:
				178	* @ctxt: the HTML parser context
				179	* @len: pointer to the length of the char read
				180	*
				181	* The current char value, if using UTF-8 this may actaully span multiple
				182	* bytes in the input buffer. Implement the end of line normalization:
				183	* 2.11 End-of-Line Handling
				184	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				185	* char, then the encoding converter is plugged in automatically.
				186	*
				187	* Returns the current char value and its lenght
				188	*/
				189
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	190	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	191	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				192	if (ctxt->instate == XML_PARSER_EOF)
				193	return(0);
				194
				195	if (ctxt->token != 0) {
				196	*len = 0;
				197	return(ctxt->token);
				198	}
				199	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				200	/*
				201	* We are supposed to handle UTF8, check it's valid
				202	* From rfc2044: encoding of the Unicode values on UTF-8:
				203	*
				204	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				205	* 0000 0000-0000 007F 0xxxxxxx
				206	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				207	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				208	*
				209	* Check for the 0x110000 limit too
				210	*/
				211	const unsigned char *cur = ctxt->input->cur;
				212	unsigned char c;
				213	unsigned int val;
				214
				215	c = *cur;
				216	if (c & 0x80) {
				217	if (cur[1] == 0)
				218	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				219	if ((cur[1] & 0xc0) != 0x80)
				220	goto encoding_error;
				221	if ((c & 0xe0) == 0xe0) {
				222
				223	if (cur[2] == 0)
				224	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				225	if ((cur[2] & 0xc0) != 0x80)
				226	goto encoding_error;
				227	if ((c & 0xf0) == 0xf0) {
				228	if (cur[3] == 0)
				229	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				230	if (((c & 0xf8) != 0xf0) \|\|
				231	((cur[3] & 0xc0) != 0x80))
				232	goto encoding_error;
				233	/* 4-byte code */
				234	*len = 4;
				235	val = (cur[0] & 0x7) << 18;
				236	val \|= (cur[1] & 0x3f) << 12;
				237	val \|= (cur[2] & 0x3f) << 6;
				238	val \|= cur[3] & 0x3f;
				239	} else {
				240	/* 3-byte code */
				241	*len = 3;
				242	val = (cur[0] & 0xf) << 12;
				243	val \|= (cur[1] & 0x3f) << 6;
				244	val \|= cur[2] & 0x3f;
				245	}
				246	} else {
				247	/* 2-byte code */
				248	*len = 2;
				249	val = (cur[0] & 0x1f) << 6;
				250	val \|= cur[1] & 0x3f;
				251	}
				252	if (!IS_CHAR(val)) {
				253	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				254	if ((ctxt->sax != NULL) &&
				255	(ctxt->sax->error != NULL))
				256	ctxt->sax->error(ctxt->userData,
				257	"Char 0x%X out of allowed range\n", val);
				258	ctxt->wellFormed = 0;
				259	ctxt->disableSAX = 1;
				260	}
				261	return(val);
				262	} else {
				263	/* 1-byte code */
				264	*len = 1;
				265	return((int) *ctxt->input->cur);
				266	}
				267	}
				268	/*
				269	* Assume it's a fixed lenght encoding (1) with
				270	* a compatibke encoding for the ASCII set, since
				271	* XML constructs only use < 128 chars
				272	*/
				273	*len = 1;
				274	if ((int) *ctxt->input->cur < 0x80)
				275	return((int) *ctxt->input->cur);
				276
				277	/*
				278	* Humm this is bad, do an automatic flow conversion
				279	*/
				280	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				281	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				282	return(xmlCurrentChar(ctxt, len));
				283
				284	encoding_error:
				285	/*
				286	* If we detect an UTF8 error that probably mean that the
				287	* input encoding didn't get properly advertized in the
				288	* declaration header. Report the error and switch the encoding
				289	* to ISO-Latin-1 (if you don't like this policy, just declare the
				290	* encoding !)
				291	*/
				292	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				293	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				294	ctxt->sax->error(ctxt->userData,
				295	"Input is not proper UTF-8, indicate encoding !\n");
				296	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				297	ctxt->input->cur[0], ctxt->input->cur[1],
				298	ctxt->input->cur[2], ctxt->input->cur[3]);
				299	}
				300
				301	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				302	*len = 1;
				303	return((int) *ctxt->input->cur);
				304	}
				305
				306	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	307	* htmlSkipBlankChars:
				308	* @ctxt: the HTML parser context
				309	*
				310	* skip all blanks character found at that point in the input streams.
				311	*
				312	* Returns the number of space chars skipped
				313	*/
				314
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	315	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	316	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				317	int res = 0;
				318
				319	while (IS_BLANK(*(ctxt->input->cur))) {
				320	if ((*ctxt->input->cur == 0) &&
				321	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				322	xmlPopInput(ctxt);
				323	} else {
				324	if (*(ctxt->input->cur) == '\n') {
				325	ctxt->input->line++; ctxt->input->col = 1;
				326	} else ctxt->input->col++;
				327	ctxt->input->cur++;
				328	ctxt->nbChars++;
				329	if (*ctxt->input->cur == 0)
				330	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				331	}
				332	res++;
				333	}
				334	return(res);
				335	}
				336
				337
				338
				339	/************************************************************************
				340	* *
				341	* The list of HTML elements and their properties *
				342	* *
				343	************************************************************************/
				344
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *
 * The table is searched linearly, and case-insensitively, by
 * htmlTagLookup(); its size is taken with sizeof, so no terminating
 * entry is needed.
 */
htmlElemDesc  html40ElementTable[] = {
{ "a",		0, 0, 0, 0, 0, 0, 1, "anchor " },
{ "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
{ "acronym",	0, 0, 0, 0, 0, 0, 1, "" },
{ "address",	0, 0, 0, 0, 0, 0, 0, "information on author " },
{ "applet",	0, 0, 0, 0, 1, 1, 2, "java applet " },
{ "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
{ "b",		0, 3, 0, 0, 0, 0, 1, "bold text style" },
{ "base",	0, 2, 2, 1, 0, 0, 0, "document base uri " },
{ "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " },
{ "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
{ "big",	0, 3, 0, 0, 0, 0, 1, "large text style" },
{ "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation " },
{ "body",	1, 1, 0, 0, 0, 0, 0, "document body " },
{ "br",		0, 2, 2, 1, 0, 0, 1, "forced line break " },
{ "button",	0, 0, 0, 0, 0, 0, 2, "push button " },
{ "caption",	0, 0, 0, 0, 0, 0, 0, "table caption " },
{ "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
{ "cite",	0, 0, 0, 0, 0, 0, 1, "citation" },
{ "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
{ "col",	0, 2, 2, 1, 0, 0, 0, "table column " },
{ "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group " },
{ "dd",		0, 1, 0, 0, 0, 0, 0, "definition description " },
{ "del",	0, 0, 0, 0, 0, 0, 2, "deleted text " },
{ "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition" },
{ "dir",	0, 0, 0, 0, 1, 1, 0, "directory list" },
{ "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
{ "dl",		0, 0, 0, 0, 0, 0, 0, "definition list " },
{ "dt",		0, 1, 0, 0, 0, 0, 0, "definition term " },
{ "em",		0, 3, 0, 0, 0, 0, 1, "emphasis" },
{ "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group " },
{ "font",	0, 3, 0, 0, 1, 1, 1, "local change to font " },
{ "form",	0, 0, 0, 0, 0, 0, 0, "interactive form " },
{ "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " },
{ "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" },
{ "h1",		0, 0, 0, 0, 0, 0, 0, "heading " },
{ "h2",		0, 0, 0, 0, 0, 0, 0, "heading " },
{ "h3",		0, 0, 0, 0, 0, 0, 0, "heading " },
{ "h4",		0, 0, 0, 0, 0, 0, 0, "heading " },
{ "h5",		0, 0, 0, 0, 0, 0, 0, "heading " },
{ "h6",		0, 0, 0, 0, 0, 0, 0, "heading " },
{ "head",	1, 1, 0, 0, 0, 0, 0, "document head " },
{ "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
{ "html",	1, 1, 0, 0, 0, 0, 0, "document root element " },
{ "i",		0, 3, 0, 0, 0, 0, 1, "italic text style" },
{ "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
{ "img",	0, 2, 2, 1, 0, 0, 1, "embedded image " },
{ "input",	0, 2, 2, 1, 0, 0, 1, "form control " },
{ "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text" },
{ "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt " },
{ "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
{ "label",	0, 0, 0, 0, 0, 0, 1, "form field label text " },
{ "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
{ "li",		0, 1, 1, 0, 0, 0, 0, "list item " },
{ "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
{ "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map " },
{ "menu",	0, 0, 0, 0, 1, 1, 0, "menu list " },
{ "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
{ "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
{ "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
{ "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
{ "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list " },
{ "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group " },
{ "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " },
{ "p",		0, 1, 1, 0, 0, 0, 0, "paragraph " },
{ "param",	0, 2, 2, 1, 0, 0, 0, "named property value " },
{ "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text " },
{ "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
{ "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
{ "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
{ "script",	0, 0, 0, 0, 0, 0, 2, "script statements " },
{ "select",	0, 0, 0, 0, 0, 0, 1, "option selector " },
{ "small",	0, 3, 0, 0, 0, 0, 1, "small text style" },
{ "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
{ "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text" },
{ "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
{ "style",	0, 0, 0, 0, 0, 0, 0, "style info " },
{ "sub",	0, 3, 0, 0, 0, 0, 1, "subscript" },
{ "sup",	0, 3, 0, 0, 0, 0, 1, "superscript " },
{ "table",	0, 0, 0, 0, 0, 0, 0, " " },
{ "tbody",	1, 0, 0, 0, 0, 0, 0, "table body " },
{ "td",		0, 0, 0, 0, 0, 0, 0, "table data cell" },
{ "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
{ "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer " },
{ "th",		0, 1, 0, 0, 0, 0, 0, "table header cell" },
{ "thead",	0, 1, 0, 0, 0, 0, 0, "table header " },
{ "title",	0, 0, 0, 0, 0, 0, 0, "document title " },
{ "tr",		0, 0, 0, 0, 0, 0, 0, "table row " },
{ "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
{ "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style" },
{ "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list " },
{ "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
};
				449
/*
 * start tags that imply the end of a current element
 * any tag of each line implies the end of the current element if the type of
 * that element is in the same line
 *
 * Layout: each line is a NULL-terminated group of tag names and an extra
 * NULL closes the whole table.
 * NOTE(review): the table is not referenced in the part of the file
 * reviewed here.
 */
const char *htmlEquEnd[] = {
"dt", "dd", "li", "option", NULL,
"h1", "h2", "h3", "h4", "h5", "h6", NULL,
"ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
NULL
};
/*
 * according to the HTML DTD, HR should be added to the 2nd line above, as it
 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
				466
/*
 * start tags that imply the end of current element
 *
 * Layout: a sequence of NULL-terminated groups closed by an extra NULL.
 * The first name of a group is the start tag being opened, the remaining
 * names are the currently open elements that it closes (this is how
 * htmlCheckAutoClose() reads the table). htmlInitAutoClose() records the
 * beginning of each group in htmlStartCloseIndex, which has room for 99
 * groups; groups beyond that would not be indexed.
 */
const char *htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",		"p", "head", NULL,
"noscript",	"p", "head", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"option",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
NULL
};
				526
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 *
 * The list is NULL-terminated.
 * NOTE(review): its users are outside the reviewed part of the file.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html",
    "head",
    "body",
    NULL
};
				540
				541	/*
				542	* The list of HTML attributes which are of content %Script;
				543	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				544	* it assumes the name starts with 'on'
				545	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	546	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	547	"onclick",
				548	"ondblclick",
				549	"onmousedown",
				550	"onmouseup",
				551	"onmouseover",
				552	"onmousemove",
				553	"onmouseout",
				554	"onkeypress",
				555	"onkeydown",
				556	"onkeyup",
				557	"onload",
				558	"onunload",
				559	"onfocus",
				560	"onblur",
				561	"onsubmit",
				562	"onrest",
				563	"onchange",
				564	"onselect"
				565	};
				566
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

/* One entry of the priority table: an element name and its priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Scanned in order by htmlGetEndPriority(). The final entry, whose name is
 * NULL, both terminates the table and carries the priority given to every
 * element that is not listed.
 */
const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	594
/*
 * Index over htmlStartClose: each used slot points at the first name of one
 * group, unused slots are NULL. Filled once by htmlInitAutoClose(), which
 * then sets the flag below so that later calls return immediately.
 */
static const char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
				597
				598	/************************************************************************
				599	* *
				600	* functions to handle HTML specific data *
				601	* *
				602	************************************************************************/
				603
				604	/**
				605	* htmlInitAutoClose:
				606	*
				607	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				608	* This is not reentrant. Call xmlInitParser() once before processing in
				609	* case of use in multithreaded programs.
				610	*/
				611	void
				612	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	613	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	614
				615	if (htmlStartCloseIndexinitialized) return;
				616
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	617	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				618	indx = 0;
				619	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				620	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	621	while (htmlStartClose[i] != NULL) i++;
				622	i++;
				623	}
				624	htmlStartCloseIndexinitialized = 1;
				625	}
				626
				627	/**
				628	* htmlTagLookup:
				629	* @tag: The tag name in lowercase
				630	*
				631	* Lookup the HTML tag in the ElementTable
				632	*
				633	* Returns the related htmlElemDescPtr or NULL if not found.
				634	*/
				635	htmlElemDescPtr
				636	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	637	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	638
				639	for (i = 0; i < (sizeof(html40ElementTable) /
				640	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	1ed3f88	2001-04-18 09:45:35 +0000	[diff] [blame]	641	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	642	return(&html40ElementTable[i]);
				643	}
				644	return(NULL);
				645	}
				646
				647	/**
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	648	* htmlGetEndPriority:
				649	* @name: The name of the element to look up the priority for.
				650	*
				651	* Return value: The "endtag" priority.
				652	**/
				653	static int
				654	htmlGetEndPriority (const xmlChar *name) {
				655	int i = 0;
				656
				657	while ((htmlEndPriority[i].name != NULL) &&
				658	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				659	i++;
				660
				661	return(htmlEndPriority[i].priority);
				662	}
				663
				664	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	665	* htmlCheckAutoClose:
				666	* @newtag: The new tag name
				667	* @oldtag: The old tag name
				668	*
				669	* Checks wether the new tag is one of the registered valid tags for closing old.
				670	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				671	*
				672	* Returns 0 if no, 1 if yes.
				673	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	674	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	675	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	676	int i, indx;
				677	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	678
				679	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				680
				681	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	682	for (indx = 0; indx < 100;indx++) {
				683	closed = htmlStartCloseIndex[indx];
				684	if (closed == NULL) return(0);
				685	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	686	}
				687
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	688	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	689	i++;
				690	while (htmlStartClose[i] != NULL) {
				691	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				692	return(1);
				693	}
				694	i++;
				695	}
				696	return(0);
				697	}
				698
				699	/**
				700	* htmlAutoCloseOnClose:
				701	* @ctxt: an HTML parser context
				702	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	703	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	704	*
				705	* The HTmL DtD allows an ending tag to implicitely close other tags.
				706	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	707	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	708	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				709	htmlElemDescPtr info;
				710	xmlChar *oldname;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	711	int i, priority;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	712
				713	#ifdef DEBUG
				714	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				715	for (i = 0;i < ctxt->nameNr;i++)
				716	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				717	#endif
				718
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	719	priority = htmlGetEndPriority (newtag);
				720
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	721	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	722
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	723	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	724	/*
				725	* A missplaced endtagad can only close elements with lower
				726	* or equal priority, so if we find an element with higher
				727	* priority before we find an element with
				728	* matching name, we just ignore this endtag
				729	*/
				730	if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	731	}
				732	if (i < 0) return;
				733
				734	while (!xmlStrEqual(newtag, ctxt->name)) {
				735	info = htmlTagLookup(ctxt->name);
				736	if ((info == NULL) \|\| (info->endTag == 1)) {
				737	#ifdef DEBUG
				738	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				739	#endif
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	740	} else if (info->endTag == 3) {
				741	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	742	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	743	#endif
				744	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				745	ctxt->sax->error(ctxt->userData,
				746	"Opening and ending tag mismatch: %s and %s\n",
				747	newtag, ctxt->name);
				748	ctxt->wellFormed = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	749	}
				750	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				751	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				752	oldname = htmlnamePop(ctxt);
				753	if (oldname != NULL) {
				754	#ifdef DEBUG
				755	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				756	#endif
				757	xmlFree(oldname);
				758	}
				759	}
				760	}
				761
				762	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	763	* htmlAutoCloseOnEnd:
				764	* @ctxt: an HTML parser context
				765	*
				766	* Close all remaining tags at the end of the stream
				767	*/
				768	static void
				769	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				770	xmlChar *oldname;
				771	int i;
				772
				773	if (ctxt->nameNr == 0)
				774	return;
				775	#ifdef DEBUG
				776	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				777	#endif
				778
				779	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				780	#ifdef DEBUG
				781	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				782	#endif
				783	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				784	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				785	oldname = htmlnamePop(ctxt);
				786	if (oldname != NULL) {
				787	#ifdef DEBUG
				788	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				789	#endif
				790	xmlFree(oldname);
				791	}
				792	}
				793	}
				794
				795	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	796	* htmlAutoClose:
				797	* @ctxt: an HTML parser context
				798	* @newtag: The new tag name or NULL
				799	*
				800	* The HTmL DtD allows a tag to implicitely close other tags.
				801	* The list is kept in htmlStartClose array. This function is
				802	* called when a new tag has been detected and generates the
				803	* appropriates closes if possible/needed.
				804	* If newtag is NULL this mean we are at the end of the resource
				805	* and we should check
				806	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	807	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	808	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				809	xmlChar *oldname;
				810	while ((newtag != NULL) && (ctxt->name != NULL) &&
				811	(htmlCheckAutoClose(newtag, ctxt->name))) {
				812	#ifdef DEBUG
				813	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				814	#endif
				815	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				816	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				817	oldname = htmlnamePop(ctxt);
				818	if (oldname != NULL) {
				819	#ifdef DEBUG
				820	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				821	#endif
				822	xmlFree(oldname);
				823	}
				824	}
				825	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	826	htmlAutoCloseOnEnd(ctxt);
				827	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	828	}
				829	while ((newtag == NULL) && (ctxt->name != NULL) &&
				830	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				831	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				832	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				833	#ifdef DEBUG
				834	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				835	#endif
				836	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				837	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				838	oldname = htmlnamePop(ctxt);
				839	if (oldname != NULL) {
				840	#ifdef DEBUG
				841	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				842	#endif
				843	xmlFree(oldname);
				844	}
				845	}
				846
				847	}
				848
				849	/**
				850	* htmlAutoCloseTag:
				851	* @doc: the HTML document
				852	* @name: The tag name
				853	* @elem: the HTML element
				854	*
				855	* The HTmL DtD allows a tag to implicitely close other tags.
				856	* The list is kept in htmlStartClose array. This function checks
				857	* if the element or one of it's children would autoclose the
				858	* given tag.
				859	*
				860	* Returns 1 if autoclose, 0 otherwise
				861	*/
				862	int
				863	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				864	htmlNodePtr child;
				865
				866	if (elem == NULL) return(1);
				867	if (xmlStrEqual(name, elem->name)) return(0);
				868	if (htmlCheckAutoClose(elem->name, name)) return(1);
				869	child = elem->children;
				870	while (child != NULL) {
				871	if (htmlAutoCloseTag(doc, name, child)) return(1);
				872	child = child->next;
				873	}
				874	return(0);
				875	}
				876
				877	/**
				878	* htmlIsAutoClosed:
				879	* @doc: the HTML document
				880	* @elem: the HTML element
				881	*
				882	* The HTmL DtD allows a tag to implicitely close other tags.
				883	* The list is kept in htmlStartClose array. This function checks
				884	* if a tag is autoclosed by one of it's child
				885	*
				886	* Returns 1 if autoclosed, 0 otherwise
				887	*/
				888	int
				889	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				890	htmlNodePtr child;
				891
				892	if (elem == NULL) return(1);
				893	child = elem->children;
				894	while (child != NULL) {
				895	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				896	child = child->next;
				897	}
				898	return(0);
				899	}
				900
				901	/**
				902	* htmlCheckImplied:
				903	* @ctxt: an HTML parser context
				904	* @newtag: The new tag name
				905	*
				906	* The HTML DtD allows a tag to exists only implicitely
				907	* called when a new tag has been detected and generates the
				908	* appropriates implicit tags if missing
				909	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	910	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	911	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				912	if (!htmlOmittedDefaultValue)
				913	return;
				914	if (xmlStrEqual(newtag, BAD_CAST"html"))
				915	return;
				916	if (ctxt->nameNr <= 0) {
				917	#ifdef DEBUG
				918	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				919	#endif
				920	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				921	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				922	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				923	}
				924	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				925	return;
				926	if ((ctxt->nameNr <= 1) &&
				927	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				928	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				929	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				930	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				931	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				932	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				933	/*
				934	* dropped OBJECT ... i you put it first BODY will be
				935	* assumed !
				936	*/
				937	#ifdef DEBUG
				938	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				939	#endif
				940	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				941	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				942	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				943	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				944	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				945	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				946	int i;
				947	for (i = 0;i < ctxt->nameNr;i++) {
				948	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				949	return;
				950	}
				951	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				952	return;
				953	}
				954	}
				955
				956	#ifdef DEBUG
				957	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				958	#endif
				959	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				960	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				961	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				962	}
				963	}
				964
				965	/**
				966	* htmlCheckParagraph
				967	* @ctxt: an HTML parser context
				968	*
				969	* Check whether a p element need to be implied before inserting
				970	* characters in the current element.
				971	*
				972	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				973	* in case of error.
				974	*/
				975
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	976	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	977	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				978	const xmlChar *tag;
				979	int i;
				980
				981	if (ctxt == NULL)
				982	return(-1);
				983	tag = ctxt->name;
				984	if (tag == NULL) {
				985	htmlAutoClose(ctxt, BAD_CAST"p");
				986	htmlCheckImplied(ctxt, BAD_CAST"p");
				987	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				988	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				989	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				990	return(1);
				991	}
				992	if (!htmlOmittedDefaultValue)
				993	return(0);
				994	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				995	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				996	#ifdef DEBUG
				997	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				998	#endif
				999	htmlAutoClose(ctxt, BAD_CAST"p");
				1000	htmlCheckImplied(ctxt, BAD_CAST"p");
				1001	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				1002	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1003	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				1004	return(1);
				1005	}
				1006	}
				1007	return(0);
				1008	}
				1009
				1010	/**
				1011	* htmlIsScriptAttribute:
				1012	* @name: an attribute name
				1013	*
				1014	* Check if an attribute is of content type Script
				1015	*
				1016	* Returns 1 is the attribute is a script 0 otherwise
				1017	*/
				1018	int
				1019	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1020	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1021
				1022	if (name == NULL)
				1023	return(0);
				1024	/*
				1025	* all script attributes start with 'on'
				1026	*/
				1027	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1028	return(0);
				1029	for (i = 0;
				1030	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1031	i++) {
				1032	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1033	return(1);
				1034	}
				1035	return(0);
				1036	}
				1037
				1038	/************************************************************************
				1039	* *
				1040	* The list of HTML predefined entities *
				1041	* *
				1042	************************************************************************/
				1043
				1044
				1045	htmlEntityDesc html40EntitiesTable[] = {
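/*
 * The 4 absolute ones, plus apostrophe.
 *
 * NOTE: the entries of this table must be kept sorted by ascending
 * value: htmlEntityValueLookup() stops scanning at the first entry
 * whose value is greater than the one searched for.
 */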
				1049	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1050	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1051	{ 39, "apos", "single quote" },
				1052	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1053	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1054
				1055	/*
				1056	* A bunch still in the 128-255 range
				1057	* Replacing them depend really on the charset used.
				1058	*/
				1059	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1060	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1061	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1062	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1063	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1064	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1065	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1066	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1067	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1068	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1069	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1070	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1071	{ 172, "not", "not sign, U+00AC ISOnum" },
				1072	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1073	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1074	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1075	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1076	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1077	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1078	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1079	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1080	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1081	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1082	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1083	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1084	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1085	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1086	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1087	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1088	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1089	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1090	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1091	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1092	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1093	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1094	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1095	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1096	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1097	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1098	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1099	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1100	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1101	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1102	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1103	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1104	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1105	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1106	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1107	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1108	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1109	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1110	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1111	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1112	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1113	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1114	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1115	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1116	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1117	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1118	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1119	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1120	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1121	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1122	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1123	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1124	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1125	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1126	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1127	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1128	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1129	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1130	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1131	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1132	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1133	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1134	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1135	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1136	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1137	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1138	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1139	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1140	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1141	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1142	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1143	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1144	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1145	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1146	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1147	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1148	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1149	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1150	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1151	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1152	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1153	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1154	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1155
				1156	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1157	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1158	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1159	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1160	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1161
				1162	/*
				1163	* Anything below should really be kept as entities references
				1164	*/
				1165	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1166
				1167	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1168	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1169
				1170	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1171	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1172	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1173	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1174	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1175	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1176	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1177	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1178	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1179	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1180	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1181	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1182	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1183	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1184	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1185	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1186	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1187	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1188	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1189	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1190	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1191	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1192	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1193	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1194
				1195	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1196	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1197	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1198	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1199	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1200	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1201	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1202	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1203	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1204	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1205	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1206	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1207	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1208	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1209	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1210	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1211	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1212	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1213	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1214	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1215	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1216	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1217	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1218	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1219	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1220	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1221	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1222	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1223
				1224	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1225	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1226	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1227	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1228	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1229	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1230	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1231	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1232	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1233	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1234	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1235	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1236	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1237	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1238	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1239	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1240	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1241
				1242	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1243	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1244
				1245	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1246
				1247	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1248	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1249
				1250	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1251	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1252
				1253	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1254	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1255
				1256	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1257
				1258	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1259	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1260	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1261	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1262	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1263	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1264	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1265	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1266	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1267	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1268	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1269	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1270	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1271	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1272	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1273	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1274
				1275	{ 8704, "forall","for all, U+2200 ISOtech" },
				1276	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1277	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1278	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1279	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1280	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1281	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1282	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1283	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1284	{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
				1285	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1286	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1287	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1288	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1289	{ 8734, "infin","infinity, U+221E ISOtech" },
				1290	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1291	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1292	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1293	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1294	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1295	{ 8747, "int", "integral, U+222B ISOtech" },
				1296	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1297	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1298	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1299	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1300	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1301	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1302	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1303	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1304	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1305	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1306	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1307	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1308	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1309	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1310	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1311	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1312	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1313	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1314	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1315	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1316	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1317	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1318	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1319	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1320
				1321	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1322	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1323	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1324	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1325
				1326	};
				1327
				1328	/************************************************************************
				1329	* *
				1330	* Commodity functions to handle entities *
				1331	* *
				1332	************************************************************************/
				1333
				1334	/*
				1335	* Macro used to grow the current buffer.
				1336	*/
				1337	#define growBuffer(buffer) { \
				1338	buffer##_size *= 2; \
				1339	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1340	if (buffer == NULL) { \
				1341	perror("realloc failed"); \
				1342	return(NULL); \
				1343	} \
				1344	}
				1345
				1346	/**
				1347	* htmlEntityLookup:
				1348	* @name: the entity name
				1349	*
				1350	* Lookup the given entity in EntitiesTable
				1351	*
				1352	* TODO: the linear scan is really ugly, an hash table is really needed.
				1353	*
				1354	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1355	*/
				1356	htmlEntityDescPtr
				1357	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1358	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1359
				1360	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1361	sizeof(html40EntitiesTable[0]));i++) {
				1362	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1363	#ifdef DEBUG
				1364	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1365	#endif
				1366	return(&html40EntitiesTable[i]);
				1367	}
				1368	}
				1369	return(NULL);
				1370	}
				1371
				1372	/**
				1373	* htmlEntityValueLookup:
				1374	* @value: the entity's unicode value
				1375	*
				1376	* Lookup the given entity in EntitiesTable
				1377	*
				1378	* TODO: the linear scan is really ugly, an hash table is really needed.
				1379	*
				1380	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1381	*/
				1382	htmlEntityDescPtr
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1383	htmlEntityValueLookup(unsigned int value) {
				1384	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1385	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	1386	unsigned int lv = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1387	#endif
				1388
				1389	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1390	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1391	if (html40EntitiesTable[i].value >= value) {
				1392	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1393	break;
				1394	#ifdef DEBUG
				1395	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1396	#endif
				1397	return(&html40EntitiesTable[i]);
				1398	}
				1399	#ifdef DEBUG
				1400	if (lv > html40EntitiesTable[i].value) {
				1401	xmlGenericError(xmlGenericErrorContext,
				1402	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1403	lv, html40EntitiesTable[i].value);
				1404	}
				1405	lv = html40EntitiesTable[i].value;
				1406	#endif
				1407	}
				1408	return(NULL);
				1409	}
				1410
				1411	/**
				1412	* UTF8ToHtml:
				1413	* @out: a pointer to an array of bytes to store the result
				1414	* @outlen: the length of @out
				1415	* @in: a pointer to an array of UTF-8 chars
				1416	* @inlen: the length of @in
				1417	*
				1418	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1419	* plus HTML entities block of chars out.
				1420	*
				1421	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1422	* The value of @inlen after return is the number of octets consumed
				1423	* as the return value is positive, else unpredictiable.
				1424	* The value of @outlen after return is the number of octets consumed.
				1425	*/
				1426	int
				1427	UTF8ToHtml(unsigned char* out, int *outlen,
				1428	const unsigned char* in, int *inlen) {
				1429	const unsigned char* processed = in;
				1430	const unsigned char* outend;
				1431	const unsigned char* outstart = out;
				1432	const unsigned char* instart = in;
				1433	const unsigned char* inend;
				1434	unsigned int c, d;
				1435	int trailing;
				1436
				1437	if (in == NULL) {
				1438	/*
				1439	* initialization nothing to do
				1440	*/
				1441	*outlen = 0;
				1442	*inlen = 0;
				1443	return(0);
				1444	}
				1445	inend = in + (*inlen);
				1446	outend = out + (*outlen);
				1447	while (in < inend) {
				1448	d = *in++;
				1449	if (d < 0x80) { c= d; trailing= 0; }
				1450	else if (d < 0xC0) {
				1451	/* trailing byte in leading position */
				1452	*outlen = out - outstart;
				1453	*inlen = processed - instart;
				1454	return(-2);
				1455	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1456	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1457	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1458	else {
				1459	/* no chance for this in Ascii */
				1460	*outlen = out - outstart;
				1461	*inlen = processed - instart;
				1462	return(-2);
				1463	}
				1464
				1465	if (inend - in < trailing) {
				1466	break;
				1467	}
				1468
				1469	for ( ; trailing; trailing--) {
				1470	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1471	break;
				1472	c <<= 6;
				1473	c \|= d & 0x3F;
				1474	}
				1475
				1476	/* assertion: c is a single UTF-4 value */
				1477	if (c < 0x80) {
				1478	if (out + 1 >= outend)
				1479	break;
				1480	*out++ = c;
				1481	} else {
				1482	int len;
				1483	htmlEntityDescPtr ent;
				1484
				1485	/*
				1486	* Try to lookup a predefined HTML entity for it
				1487	*/
				1488
				1489	ent = htmlEntityValueLookup(c);
				1490	if (ent == NULL) {
				1491	/* no chance for this in Ascii */
				1492	*outlen = out - outstart;
				1493	*inlen = processed - instart;
				1494	return(-2);
				1495	}
				1496	len = strlen(ent->name);
				1497	if (out + 2 + len >= outend)
				1498	break;
				1499	*out++ = '&';
				1500	memcpy(out, ent->name, len);
				1501	out += len;
				1502	*out++ = ';';
				1503	}
				1504	processed = in;
				1505	}
				1506	*outlen = out - outstart;
				1507	*inlen = processed - instart;
				1508	return(0);
				1509	}
				1510
				1511	/**
				1512	* htmlEncodeEntities:
				1513	* @out: a pointer to an array of bytes to store the result
				1514	* @outlen: the length of @out
				1515	* @in: a pointer to an array of UTF-8 chars
				1516	* @inlen: the length of @in
				1517	* @quoteChar: the quote character to escape (' or ") or zero.
				1518	*
				1519	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1520	* plus HTML entities block of chars out.
				1521	*
				1522	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1523	* The value of @inlen after return is the number of octets consumed
				1524	* as the return value is positive, else unpredictiable.
				1525	* The value of @outlen after return is the number of octets consumed.
				1526	*/
				1527	int
				1528	htmlEncodeEntities(unsigned char* out, int *outlen,
				1529	const unsigned char* in, int *inlen, int quoteChar) {
				1530	const unsigned char* processed = in;
				1531	const unsigned char* outend = out + (*outlen);
				1532	const unsigned char* outstart = out;
				1533	const unsigned char* instart = in;
				1534	const unsigned char* inend = in + (*inlen);
				1535	unsigned int c, d;
				1536	int trailing;
				1537
				1538	while (in < inend) {
				1539	d = *in++;
				1540	if (d < 0x80) { c= d; trailing= 0; }
				1541	else if (d < 0xC0) {
				1542	/* trailing byte in leading position */
				1543	*outlen = out - outstart;
				1544	*inlen = processed - instart;
				1545	return(-2);
				1546	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1547	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1548	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1549	else {
				1550	/* no chance for this in Ascii */
				1551	*outlen = out - outstart;
				1552	*inlen = processed - instart;
				1553	return(-2);
				1554	}
				1555
				1556	if (inend - in < trailing)
				1557	break;
				1558
				1559	while (trailing--) {
				1560	if (((d= *in++) & 0xC0) != 0x80) {
				1561	*outlen = out - outstart;
				1562	*inlen = processed - instart;
				1563	return(-2);
				1564	}
				1565	c <<= 6;
				1566	c \|= d & 0x3F;
				1567	}
				1568
				1569	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1570	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1571	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1572	if (out >= outend)
				1573	break;
				1574	*out++ = c;
				1575	} else {
				1576	htmlEntityDescPtr ent;
				1577	const char *cp;
				1578	char nbuf[16];
				1579	int len;
				1580
				1581	/*
				1582	* Try to lookup a predefined HTML entity for it
				1583	*/
				1584	ent = htmlEntityValueLookup(c);
				1585	if (ent == NULL) {
				1586	sprintf(nbuf, "#%u", c);
				1587	cp = nbuf;
				1588	}
				1589	else
				1590	cp = ent->name;
				1591	len = strlen(cp);
				1592	if (out + 2 + len > outend)
				1593	break;
				1594	*out++ = '&';
				1595	memcpy(out, cp, len);
				1596	out += len;
				1597	*out++ = ';';
				1598	}
				1599	processed = in;
				1600	}
				1601	*outlen = out - outstart;
				1602	*inlen = processed - instart;
				1603	return(0);
				1604	}
				1605
				1606	/**
				1607	* htmlDecodeEntities:
				1608	* @ctxt: the parser context
				1609	* @len: the len to decode (in bytes !), -1 for no size limit
				1610	* @end: an end marker xmlChar, 0 if none
				1611	* @end2: an end marker xmlChar, 0 if none
				1612	* @end3: an end marker xmlChar, 0 if none
				1613	*
				1614	* Subtitute the HTML entities by their value
				1615	*
				1616	* DEPRECATED !!!!
				1617	*
				1618	* Returns A newly allocated string with the substitution done. The caller
				1619	* must deallocate it !
				1620	*/
				1621	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1622	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1623	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1624	static int deprecated = 0;
				1625	if (!deprecated) {
				1626	xmlGenericError(xmlGenericErrorContext,
				1627	"htmlDecodeEntities() deprecated function reached\n");
				1628	deprecated = 1;
				1629	}
				1630	return(NULL);
				1631	#if 0
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1632	xmlChar *name = NULL;
				1633	xmlChar *buffer = NULL;
				1634	unsigned int buffer_size = 0;
				1635	unsigned int nbchars = 0;
				1636	htmlEntityDescPtr ent;
				1637	unsigned int max = (unsigned int) len;
				1638	int c,l;
				1639
				1640	if (ctxt->depth > 40) {
				1641	ctxt->errNo = XML_ERR_ENTITY_LOOP;
				1642	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1643	ctxt->sax->error(ctxt->userData,
				1644	"Detected entity reference loop\n");
				1645	ctxt->wellFormed = 0;
				1646	ctxt->disableSAX = 1;
				1647	return(NULL);
				1648	}
				1649
				1650	/*
				1651	* allocate a translation buffer.
				1652	*/
				1653	buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
				1654	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1655	if (buffer == NULL) {
				1656	perror("xmlDecodeEntities: malloc failed");
				1657	return(NULL);
				1658	}
				1659
				1660	/*
				1661	* Ok loop until we reach one of the ending char or a size limit.
				1662	*/
				1663	c = CUR_CHAR(l);
				1664	while ((nbchars < max) && (c != end) &&
				1665	(c != end2) && (c != end3)) {
				1666
				1667	if (c == 0) break;
				1668	if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
				1669	int val = htmlParseCharRef(ctxt);
				1670	COPY_BUF(0,buffer,nbchars,val);
				1671	NEXTL(l);
				1672	} else if ((c == '&') && (ctxt->token != '&')) {
				1673	ent = htmlParseEntityRef(ctxt, &name);
				1674	if (name != NULL) {
				1675	if (ent != NULL) {
				1676	int val = ent->value;
				1677	COPY_BUF(0,buffer,nbchars,val);
				1678	NEXTL(l);
				1679	} else {
				1680	const xmlChar *cur = name;
				1681
				1682	buffer[nbchars++] = '&';
				1683	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1684	growBuffer(buffer);
				1685	}
				1686	while (*cur != 0) {
				1687	buffer[nbchars++] = *cur++;
				1688	}
				1689	buffer[nbchars++] = ';';
				1690	}
				1691	}
				1692	} else {
				1693	COPY_BUF(l,buffer,nbchars,c);
				1694	NEXTL(l);
				1695	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1696	growBuffer(buffer);
				1697	}
				1698	}
				1699	c = CUR_CHAR(l);
				1700	}
				1701	buffer[nbchars++] = 0;
				1702	return(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1703	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1704	}
				1705
				1706	/************************************************************************
				1707	* *
				1708	* Commodity functions to handle streams *
				1709	* *
				1710	************************************************************************/
				1711
				1712	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1713	* htmlNewInputStream:
				1714	* @ctxt: an HTML parser context
				1715	*
				1716	* Create a new input stream structure
				1717	* Returns the new input stream or NULL
				1718	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1719	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1720	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1721	htmlParserInputPtr input;
				1722
				1723	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1724	if (input == NULL) {
				1725	ctxt->errNo = XML_ERR_NO_MEMORY;
				1726	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1727	ctxt->sax->error(ctxt->userData,
				1728	"malloc: couldn't allocate a new input stream\n");
				1729	return(NULL);
				1730	}
				1731	memset(input, 0, sizeof(htmlParserInput));
				1732	input->filename = NULL;
				1733	input->directory = NULL;
				1734	input->base = NULL;
				1735	input->cur = NULL;
				1736	input->buf = NULL;
				1737	input->line = 1;
				1738	input->col = 1;
				1739	input->buf = NULL;
				1740	input->free = NULL;
				1741	input->version = NULL;
				1742	input->consumed = 0;
				1743	input->length = 0;
				1744	return(input);
				1745	}
				1746
				1747
				1748	/************************************************************************
				1749	* *
				1750	* Commodity functions, cleanup needed ? *
				1751	* *
				1752	************************************************************************/
				1753
				1754	/**
				1755	* areBlanks:
				1756	* @ctxt: an HTML parser context
				1757	* @str: a xmlChar *
				1758	* @len: the size of @str
				1759	*
				1760	* Is this a sequence of blank chars that one can ignore ?
				1761	*
				1762	* Returns 1 if ignorable 0 otherwise.
				1763	*/
				1764
				1765	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				1766	int i;
				1767	xmlNodePtr lastChild;
				1768
				1769	for (i = 0;i < len;i++)
				1770	if (!(IS_BLANK(str[i]))) return(0);
				1771
				1772	if (CUR == 0) return(1);
				1773	if (CUR != '<') return(0);
				1774	if (ctxt->name == NULL)
				1775	return(1);
				1776	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1777	return(1);
				1778	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1779	return(1);
				1780	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1781	return(1);
				1782	if (ctxt->node == NULL) return(0);
				1783	lastChild = xmlGetLastChild(ctxt->node);
				1784	if (lastChild == NULL) {
				1785	if (ctxt->node->content != NULL) return(0);
				1786	} else if (xmlNodeIsText(lastChild)) {
				1787	return(0);
				1788	} else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
				1789	return(0);
				1790	} else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
				1791	return(0);
				1792	} else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
				1793	return(0);
				1794	}
				1795	return(1);
				1796	}
				1797
				1798	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1799	* htmlNewDocNoDtD:
				1800	* @URI: URI for the dtd, or NULL
				1801	* @ExternalID: the external ID of the DTD, or NULL
				1802	*
				1803	* Returns a new document, do not intialize the DTD if not provided
				1804	*/
				1805	htmlDocPtr
				1806	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1807	xmlDocPtr cur;
				1808
				1809	/*
				1810	* Allocate a new document and fill the fields.
				1811	*/
				1812	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1813	if (cur == NULL) {
				1814	xmlGenericError(xmlGenericErrorContext,
				1815	"xmlNewDoc : malloc failed\n");
				1816	return(NULL);
				1817	}
				1818	memset(cur, 0, sizeof(xmlDoc));
				1819
				1820	cur->type = XML_HTML_DOCUMENT_NODE;
				1821	cur->version = NULL;
				1822	cur->intSubset = NULL;
				1823	if ((ExternalID != NULL) \|\|
				1824	(URI != NULL))
				1825	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
				1826	cur->doc = cur;
				1827	cur->name = NULL;
				1828	cur->children = NULL;
				1829	cur->extSubset = NULL;
				1830	cur->oldNs = NULL;
				1831	cur->encoding = NULL;
				1832	cur->standalone = 1;
				1833	cur->compression = 0;
				1834	cur->ids = NULL;
				1835	cur->refs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1836	cur->_private = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1837	return(cur);
				1838	}
				1839
				1840	/**
				1841	* htmlNewDoc:
				1842	* @URI: URI for the dtd, or NULL
				1843	* @ExternalID: the external ID of the DTD, or NULL
				1844	*
				1845	* Returns a new document
				1846	*/
				1847	htmlDocPtr
				1848	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1849	if ((URI == NULL) && (ExternalID == NULL))
				1850	return(htmlNewDocNoDtD(
Daniel Veillard	6426935	2001-05-04 17:52:34 +0000	[diff] [blame]	1851	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				1852	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1853
				1854	return(htmlNewDocNoDtD(URI, ExternalID));
				1855	}
				1856
				1857
				1858	/************************************************************************
				1859	* *
				1860	* The parser itself *
				1861	* Relates to http://www.w3.org/TR/html40 *
				1862	* *
				1863	************************************************************************/
				1864
				1865	/************************************************************************
				1866	* *
				1867	* The parser itself *
				1868	* *
				1869	************************************************************************/
				1870
				1871	/**
				1872	* htmlParseHTMLName:
				1873	* @ctxt: an HTML parser context
				1874	*
				1875	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1876	* since HTML names are not case-sensitive.
				1877	*
				1878	* Returns the Tag Name parsed or NULL
				1879	*/
				1880
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1881	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1882	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1883	xmlChar *ret = NULL;
				1884	int i = 0;
				1885	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1886
				1887	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1888	(CUR != ':')) return(NULL);
				1889
				1890	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1891	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1892	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1893	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1894	else loc[i] = CUR;
				1895	i++;
				1896
				1897	NEXT;
				1898	}
				1899
				1900	ret = xmlStrndup(loc, i);
				1901
				1902	return(ret);
				1903	}
				1904
				1905	/**
				1906	* htmlParseName:
				1907	* @ctxt: an HTML parser context
				1908	*
				1909	* parse an HTML name, this routine is case sensistive.
				1910	*
				1911	* Returns the Name parsed or NULL
				1912	*/
				1913
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1914	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1915	htmlParseName(htmlParserCtxtPtr ctxt) {
				1916	xmlChar buf[HTML_MAX_NAMELEN];
				1917	int len = 0;
				1918
				1919	GROW;
				1920	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1921	return(NULL);
				1922	}
				1923
				1924	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1925	(CUR == '.') \|\| (CUR == '-') \|\|
				1926	(CUR == '_') \|\| (CUR == ':') \|\|
				1927	(IS_COMBINING(CUR)) \|\|
				1928	(IS_EXTENDER(CUR))) {
				1929	buf[len++] = CUR;
				1930	NEXT;
				1931	if (len >= HTML_MAX_NAMELEN) {
				1932	xmlGenericError(xmlGenericErrorContext,
				1933	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1934	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1935	(CUR == '.') \|\| (CUR == '-') \|\|
				1936	(CUR == '_') \|\| (CUR == ':') \|\|
				1937	(IS_COMBINING(CUR)) \|\|
				1938	(IS_EXTENDER(CUR)))
				1939	NEXT;
				1940	break;
				1941	}
				1942	}
				1943	return(xmlStrndup(buf, len));
				1944	}
				1945
				1946	/**
				1947	* htmlParseHTMLAttribute:
				1948	* @ctxt: an HTML parser context
				1949	* @stop: a char stop value
				1950	*
				1951	* parse an HTML attribute value till the stop (quote), if
				1952	* stop is 0 then it stops at the first space
				1953	*
				1954	* Returns the attribute parsed or NULL
				1955	*/
				1956
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1957	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1958	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1959	xmlChar *buffer = NULL;
				1960	int buffer_size = 0;
				1961	xmlChar *out = NULL;
				1962	xmlChar *name = NULL;
				1963
				1964	xmlChar *cur = NULL;
				1965	htmlEntityDescPtr ent;
				1966
				1967	/*
				1968	* allocate a translation buffer.
				1969	*/
				1970	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1971	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1972	if (buffer == NULL) {
				1973	perror("htmlParseHTMLAttribute: malloc failed");
				1974	return(NULL);
				1975	}
				1976	out = buffer;
				1977
				1978	/*
				1979	* Ok loop until we reach one of the ending chars
				1980	*/
				1981	while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
				1982	if ((stop == 0) && (IS_BLANK(CUR))) break;
				1983	if (CUR == '&') {
				1984	if (NXT(1) == '#') {
				1985	unsigned int c;
				1986	int bits;
				1987
				1988	c = htmlParseCharRef(ctxt);
				1989	if (c < 0x80)
				1990	{ *out++ = c; bits= -6; }
				1991	else if (c < 0x800)
				1992	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1993	else if (c < 0x10000)
				1994	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1995	else
				1996	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1997
				1998	for ( ; bits >= 0; bits-= 6) {
				1999	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2000	}
				2001	} else {
				2002	ent = htmlParseEntityRef(ctxt, &name);
				2003	if (name == NULL) {
				2004	*out++ = '&';
				2005	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2006	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2007
				2008	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2009	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2010	}
				2011	} else if (ent == NULL) {
				2012	*out++ = '&';
				2013	cur = name;
				2014	while (*cur != 0) {
				2015	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2016	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2017
				2018	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2019	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2020	}
				2021	out++ = cur++;
				2022	}
				2023	xmlFree(name);
				2024	} else {
				2025	unsigned int c;
				2026	int bits;
				2027
				2028	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2029	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2030
				2031	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2032	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2033	}
				2034	c = (xmlChar)ent->value;
				2035	if (c < 0x80)
				2036	{ *out++ = c; bits= -6; }
				2037	else if (c < 0x800)
				2038	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2039	else if (c < 0x10000)
				2040	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2041	else
				2042	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2043
				2044	for ( ; bits >= 0; bits-= 6) {
				2045	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2046	}
				2047	xmlFree(name);
				2048	}
				2049	}
				2050	} else {
				2051	unsigned int c;
				2052	int bits, l;
				2053
				2054	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2055	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2056
				2057	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2058	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2059	}
				2060	c = CUR_CHAR(l);
				2061	if (c < 0x80)
				2062	{ *out++ = c; bits= -6; }
				2063	else if (c < 0x800)
				2064	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2065	else if (c < 0x10000)
				2066	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2067	else
				2068	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2069
				2070	for ( ; bits >= 0; bits-= 6) {
				2071	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2072	}
				2073	NEXT;
				2074	}
				2075	}
				2076	*out++ = 0;
				2077	return(buffer);
				2078	}
				2079
				2080	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2081	* htmlParseEntityRef:
				2082	* @ctxt: an HTML parser context
				2083	* @str: location to store the entity name
				2084	*
				2085	* parse an HTML ENTITY references
				2086	*
				2087	* [68] EntityRef ::= '&' Name ';'
				2088	*
				2089	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2090	* if non-NULL *str will have to be freed by the caller.
				2091	*/
				2092	htmlEntityDescPtr
				2093	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2094	xmlChar *name;
				2095	htmlEntityDescPtr ent = NULL;
				2096	*str = NULL;
				2097
				2098	if (CUR == '&') {
				2099	NEXT;
				2100	name = htmlParseName(ctxt);
				2101	if (name == NULL) {
				2102	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2103	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2104	ctxt->wellFormed = 0;
				2105	} else {
				2106	GROW;
				2107	if (CUR == ';') {
				2108	*str = name;
				2109
				2110	/*
				2111	* Lookup the entity in the table.
				2112	*/
				2113	ent = htmlEntityLookup(name);
				2114	if (ent != NULL) /* OK that's ugly !!! */
				2115	NEXT;
				2116	} else {
				2117	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2118	ctxt->sax->error(ctxt->userData,
				2119	"htmlParseEntityRef: expecting ';'\n");
				2120	*str = name;
				2121	}
				2122	}
				2123	}
				2124	return(ent);
				2125	}
				2126
				2127	/**
				2128	* htmlParseAttValue:
				2129	* @ctxt: an HTML parser context
				2130	*
				2131	* parse a value for an attribute
				2132	* Note: the parser won't do substitution of entities here, this
				2133	* will be handled later in xmlStringGetNodeList, unless it was
				2134	* asked for ctxt->replaceEntities != 0
				2135	*
				2136	* Returns the AttValue parsed or NULL.
				2137	*/
				2138
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2139	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2140	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2141	xmlChar *ret = NULL;
				2142
				2143	if (CUR == '"') {
				2144	NEXT;
				2145	ret = htmlParseHTMLAttribute(ctxt, '"');
				2146	if (CUR != '"') {
				2147	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2148	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2149	ctxt->wellFormed = 0;
				2150	} else
				2151	NEXT;
				2152	} else if (CUR == '\'') {
				2153	NEXT;
				2154	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2155	if (CUR != '\'') {
				2156	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2157	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2158	ctxt->wellFormed = 0;
				2159	} else
				2160	NEXT;
				2161	} else {
				2162	/*
				2163	* That's an HTMLism, the attribute value may not be quoted
				2164	*/
				2165	ret = htmlParseHTMLAttribute(ctxt, 0);
				2166	if (ret == NULL) {
				2167	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2168	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2169	ctxt->wellFormed = 0;
				2170	}
				2171	}
				2172	return(ret);
				2173	}
				2174
				2175	/**
				2176	* htmlParseSystemLiteral:
				2177	* @ctxt: an HTML parser context
				2178	*
				2179	* parse an HTML Literal
				2180	*
				2181	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2182	*
				2183	* Returns the SystemLiteral parsed or NULL
				2184	*/
				2185
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2186	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2187	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2188	const xmlChar *q;
				2189	xmlChar *ret = NULL;
				2190
				2191	if (CUR == '"') {
				2192	NEXT;
				2193	q = CUR_PTR;
				2194	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2195	NEXT;
				2196	if (!IS_CHAR(CUR)) {
				2197	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2198	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2199	ctxt->wellFormed = 0;
				2200	} else {
				2201	ret = xmlStrndup(q, CUR_PTR - q);
				2202	NEXT;
				2203	}
				2204	} else if (CUR == '\'') {
				2205	NEXT;
				2206	q = CUR_PTR;
				2207	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2208	NEXT;
				2209	if (!IS_CHAR(CUR)) {
				2210	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2211	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2212	ctxt->wellFormed = 0;
				2213	} else {
				2214	ret = xmlStrndup(q, CUR_PTR - q);
				2215	NEXT;
				2216	}
				2217	} else {
				2218	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2219	ctxt->sax->error(ctxt->userData,
				2220	"SystemLiteral \" or ' expected\n");
				2221	ctxt->wellFormed = 0;
				2222	}
				2223
				2224	return(ret);
				2225	}
				2226
				2227	/**
				2228	* htmlParsePubidLiteral:
				2229	* @ctxt: an HTML parser context
				2230	*
				2231	* parse an HTML public literal
				2232	*
				2233	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2234	*
				2235	* Returns the PubidLiteral parsed or NULL.
				2236	*/
				2237
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2238	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2239	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2240	const xmlChar *q;
				2241	xmlChar *ret = NULL;
				2242	/*
				2243	* Name ::= (Letter \| '_') (NameChar)*
				2244	*/
				2245	if (CUR == '"') {
				2246	NEXT;
				2247	q = CUR_PTR;
				2248	while (IS_PUBIDCHAR(CUR)) NEXT;
				2249	if (CUR != '"') {
				2250	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2251	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2252	ctxt->wellFormed = 0;
				2253	} else {
				2254	ret = xmlStrndup(q, CUR_PTR - q);
				2255	NEXT;
				2256	}
				2257	} else if (CUR == '\'') {
				2258	NEXT;
				2259	q = CUR_PTR;
				2260	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2261	NEXT;
				2262	if (!IS_LETTER(CUR)) {
				2263	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2264	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2265	ctxt->wellFormed = 0;
				2266	} else {
				2267	ret = xmlStrndup(q, CUR_PTR - q);
				2268	NEXT;
				2269	}
				2270	} else {
				2271	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2272	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2273	ctxt->wellFormed = 0;
				2274	}
				2275
				2276	return(ret);
				2277	}
				2278
				2279	/**
				2280	* htmlParseScript:
				2281	* @ctxt: an HTML parser context
				2282	*
				2283	* parse the content of an HTML SCRIPT or STYLE element
				2284	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2285	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2286	* http://www.w3.org/TR/html4/types.html#type-script
				2287	* http://www.w3.org/TR/html4/types.html#h-6.15
				2288	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2289	*
				2290	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2291	* element and the value of intrinsic event attributes. User agents must
				2292	* not evaluate script data as HTML markup but instead must pass it on as
				2293	* data to a script engine.
				2294	* NOTES:
				2295	* - The content is passed like CDATA
				2296	* - the attributes for style and scripting "onXXX" are also described
				2297	* as CDATA but SGML allows entities references in attributes so their
				2298	* processing is identical as other attributes
				2299	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2300	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2301	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2302	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2303	int nbchar = 0;
				2304	xmlChar cur;
				2305
				2306	SHRINK;
				2307	cur = CUR;
				2308	while (IS_CHAR(cur)) {
				2309	if ((cur == '<') && (NXT(1) == '/')) {
				2310	/*
				2311	* One should break here, the specification is clear:
				2312	* Authors should therefore escape "</" within the content.
				2313	* Escape mechanisms are specific to each scripting or
				2314	* style sheet language.
				2315	*/
				2316	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2317	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2318	break; /* while */
				2319	}
				2320	buf[nbchar++] = cur;
				2321	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2322	if (ctxt->sax->cdataBlock!= NULL) {
				2323	/*
				2324	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2325	*/
				2326	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2327	}
				2328	nbchar = 0;
				2329	}
				2330	NEXT;
				2331	cur = CUR;
				2332	}
				2333	if (!(IS_CHAR(cur))) {
				2334	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2335	ctxt->sax->error(ctxt->userData,
				2336	"Invalid char in CDATA 0x%X\n", cur);
				2337	ctxt->wellFormed = 0;
				2338	NEXT;
				2339	}
				2340
				2341	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2342	if (ctxt->sax->cdataBlock!= NULL) {
				2343	/*
				2344	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2345	*/
				2346	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2347	}
				2348	}
				2349	}
				2350
				2351
				2352	/**
				2353	* htmlParseCharData:
				2354	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2355	*
				2356	* parse a CharData section.
				2357	* if we are within a CDATA section ']]>' marks an end of section.
				2358	*
				2359	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2360	*/
				2361
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2362	static void
				2363	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2364	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2365	int nbchar = 0;
				2366	int cur, l;
				2367
				2368	SHRINK;
				2369	cur = CUR_CHAR(l);
				2370	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2371	((cur != '&') \|\| (ctxt->token == '&')) &&
				2372	(IS_CHAR(cur))) {
				2373	COPY_BUF(l,buf,nbchar,cur);
				2374	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2375	/*
				2376	* Ok the segment is to be consumed as chars.
				2377	*/
				2378	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2379	if (areBlanks(ctxt, buf, nbchar)) {
				2380	if (ctxt->sax->ignorableWhitespace != NULL)
				2381	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2382	buf, nbchar);
				2383	} else {
				2384	htmlCheckParagraph(ctxt);
				2385	if (ctxt->sax->characters != NULL)
				2386	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2387	}
				2388	}
				2389	nbchar = 0;
				2390	}
				2391	NEXTL(l);
				2392	cur = CUR_CHAR(l);
				2393	}
				2394	if (nbchar != 0) {
				2395	/*
				2396	* Ok the segment is to be consumed as chars.
				2397	*/
				2398	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2399	if (areBlanks(ctxt, buf, nbchar)) {
				2400	if (ctxt->sax->ignorableWhitespace != NULL)
				2401	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2402	} else {
				2403	htmlCheckParagraph(ctxt);
				2404	if (ctxt->sax->characters != NULL)
				2405	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2406	}
				2407	}
				2408	}
				2409	}
				2410
				2411	/**
				2412	* htmlParseExternalID:
				2413	* @ctxt: an HTML parser context
				2414	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2415	*
				2416	* Parse an External ID or a Public ID
				2417	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2418	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2419	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2420	*
				2421	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2422	*
				2423	* Returns the function returns SystemLiteral and in the second
				2424	* case publicID receives PubidLiteral, is strict is off
				2425	* it is possible to return NULL and have publicID set.
				2426	*/
				2427
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2428	static xmlChar *
				2429	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2430	xmlChar *URI = NULL;
				2431
				2432	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2433	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2434	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2435	SKIP(6);
				2436	if (!IS_BLANK(CUR)) {
				2437	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2438	ctxt->sax->error(ctxt->userData,
				2439	"Space required after 'SYSTEM'\n");
				2440	ctxt->wellFormed = 0;
				2441	}
				2442	SKIP_BLANKS;
				2443	URI = htmlParseSystemLiteral(ctxt);
				2444	if (URI == NULL) {
				2445	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2446	ctxt->sax->error(ctxt->userData,
				2447	"htmlParseExternalID: SYSTEM, no URI\n");
				2448	ctxt->wellFormed = 0;
				2449	}
				2450	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2451	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2452	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2453	SKIP(6);
				2454	if (!IS_BLANK(CUR)) {
				2455	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2456	ctxt->sax->error(ctxt->userData,
				2457	"Space required after 'PUBLIC'\n");
				2458	ctxt->wellFormed = 0;
				2459	}
				2460	SKIP_BLANKS;
				2461	*publicID = htmlParsePubidLiteral(ctxt);
				2462	if (*publicID == NULL) {
				2463	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2464	ctxt->sax->error(ctxt->userData,
				2465	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2466	ctxt->wellFormed = 0;
				2467	}
				2468	SKIP_BLANKS;
				2469	if ((CUR == '"') \|\| (CUR == '\'')) {
				2470	URI = htmlParseSystemLiteral(ctxt);
				2471	}
				2472	}
				2473	return(URI);
				2474	}
				2475
				2476	/**
				2477	* htmlParseComment:
				2478	* @ctxt: an HTML parser context
				2479	*
				2480	* Parse an XML (SGML) comment <!-- .... -->
				2481	*
				2482	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2483	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2484	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2485	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2486	xmlChar *buf = NULL;
				2487	int len;
				2488	int size = HTML_PARSER_BUFFER_SIZE;
				2489	int q, ql;
				2490	int r, rl;
				2491	int cur, l;
				2492	xmlParserInputState state;
				2493
				2494	/*
				2495	* Check that there is a comment right here.
				2496	*/
				2497	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2498	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2499
				2500	state = ctxt->instate;
				2501	ctxt->instate = XML_PARSER_COMMENT;
				2502	SHRINK;
				2503	SKIP(4);
				2504	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2505	if (buf == NULL) {
				2506	xmlGenericError(xmlGenericErrorContext,
				2507	"malloc of %d byte failed\n", size);
				2508	ctxt->instate = state;
				2509	return;
				2510	}
				2511	q = CUR_CHAR(ql);
				2512	NEXTL(ql);
				2513	r = CUR_CHAR(rl);
				2514	NEXTL(rl);
				2515	cur = CUR_CHAR(l);
				2516	len = 0;
				2517	while (IS_CHAR(cur) &&
				2518	((cur != '>') \|\|
				2519	(r != '-') \|\| (q != '-'))) {
				2520	if (len + 5 >= size) {
				2521	size *= 2;
				2522	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2523	if (buf == NULL) {
				2524	xmlGenericError(xmlGenericErrorContext,
				2525	"realloc of %d byte failed\n", size);
				2526	ctxt->instate = state;
				2527	return;
				2528	}
				2529	}
				2530	COPY_BUF(ql,buf,len,q);
				2531	q = r;
				2532	ql = rl;
				2533	r = cur;
				2534	rl = l;
				2535	NEXTL(l);
				2536	cur = CUR_CHAR(l);
				2537	if (cur == 0) {
				2538	SHRINK;
				2539	GROW;
				2540	cur = CUR_CHAR(l);
				2541	}
				2542	}
				2543	buf[len] = 0;
				2544	if (!IS_CHAR(cur)) {
				2545	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2546	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2547	ctxt->sax->error(ctxt->userData,
				2548	"Comment not terminated \n<!--%.50s\n", buf);
				2549	ctxt->wellFormed = 0;
				2550	xmlFree(buf);
				2551	} else {
				2552	NEXT;
				2553	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2554	(!ctxt->disableSAX))
				2555	ctxt->sax->comment(ctxt->userData, buf);
				2556	xmlFree(buf);
				2557	}
				2558	ctxt->instate = state;
				2559	}
				2560
				2561	/**
				2562	* htmlParseCharRef:
				2563	* @ctxt: an HTML parser context
				2564	*
				2565	* parse Reference declarations
				2566	*
				2567	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2568	* '&#x' [0-9a-fA-F]+ ';'
				2569	*
				2570	* Returns the value parsed (as an int)
				2571	*/
				2572	int
				2573	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2574	int val = 0;
				2575
				2576	if ((CUR == '&') && (NXT(1) == '#') &&
				2577	(NXT(2) == 'x')) {
				2578	SKIP(3);
				2579	while (CUR != ';') {
				2580	if ((CUR >= '0') && (CUR <= '9'))
				2581	val = val * 16 + (CUR - '0');
				2582	else if ((CUR >= 'a') && (CUR <= 'f'))
				2583	val = val * 16 + (CUR - 'a') + 10;
				2584	else if ((CUR >= 'A') && (CUR <= 'F'))
				2585	val = val * 16 + (CUR - 'A') + 10;
				2586	else {
				2587	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2588	ctxt->sax->error(ctxt->userData,
				2589	"htmlParseCharRef: invalid hexadecimal value\n");
				2590	ctxt->wellFormed = 0;
				2591	return(0);
				2592	}
				2593	NEXT;
				2594	}
				2595	if (CUR == ';')
				2596	NEXT;
				2597	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2598	SKIP(2);
				2599	while (CUR != ';') {
				2600	if ((CUR >= '0') && (CUR <= '9'))
				2601	val = val * 10 + (CUR - '0');
				2602	else {
				2603	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2604	ctxt->sax->error(ctxt->userData,
				2605	"htmlParseCharRef: invalid decimal value\n");
				2606	ctxt->wellFormed = 0;
				2607	return(0);
				2608	}
				2609	NEXT;
				2610	}
				2611	if (CUR == ';')
				2612	NEXT;
				2613	} else {
				2614	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2615	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2616	ctxt->wellFormed = 0;
				2617	}
				2618	/*
				2619	* Check the value IS_CHAR ...
				2620	*/
				2621	if (IS_CHAR(val)) {
				2622	return(val);
				2623	} else {
				2624	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2625	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2626	val);
				2627	ctxt->wellFormed = 0;
				2628	}
				2629	return(0);
				2630	}
				2631
				2632
				2633	/**
				2634	* htmlParseDocTypeDecl :
				2635	* @ctxt: an HTML parser context
				2636	*
				2637	* parse a DOCTYPE declaration
				2638	*
				2639	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2640	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2641	*/
				2642
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2643	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2644	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2645	xmlChar *name;
				2646	xmlChar *ExternalID = NULL;
				2647	xmlChar *URI = NULL;
				2648
				2649	/*
				2650	* We know that '<!DOCTYPE' has been detected.
				2651	*/
				2652	SKIP(9);
				2653
				2654	SKIP_BLANKS;
				2655
				2656	/*
				2657	* Parse the DOCTYPE name.
				2658	*/
				2659	name = htmlParseName(ctxt);
				2660	if (name == NULL) {
				2661	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2662	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2663	ctxt->wellFormed = 0;
				2664	}
				2665	/*
				2666	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2667	*/
				2668
				2669	SKIP_BLANKS;
				2670
				2671	/*
				2672	* Check for SystemID and ExternalID
				2673	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2674	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2675	SKIP_BLANKS;
				2676
				2677	/*
				2678	* We should be at the end of the DOCTYPE declaration.
				2679	*/
				2680	if (CUR != '>') {
				2681	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2682	ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
				2683	ctxt->wellFormed = 0;
				2684	/* We shouldn't try to resynchronize ... */
				2685	}
				2686	NEXT;
				2687
				2688	/*
				2689	* Create or update the document accordingly to the DOCTYPE
				2690	*/
				2691	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2692	(!ctxt->disableSAX))
				2693	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2694
				2695	/*
				2696	* Cleanup, since we don't use all those identifiers
				2697	*/
				2698	if (URI != NULL) xmlFree(URI);
				2699	if (ExternalID != NULL) xmlFree(ExternalID);
				2700	if (name != NULL) xmlFree(name);
				2701	}
				2702
				2703	/**
				2704	* htmlParseAttribute:
				2705	* @ctxt: an HTML parser context
				2706	* @value: a xmlChar ** used to store the value of the attribute
				2707	*
				2708	* parse an attribute
				2709	*
				2710	* [41] Attribute ::= Name Eq AttValue
				2711	*
				2712	* [25] Eq ::= S? '=' S?
				2713	*
				2714	* With namespace:
				2715	*
				2716	* [NS 11] Attribute ::= QName Eq AttValue
				2717	*
				2718	* Also the case QName == xmlns:??? is handled independently as a namespace
				2719	* definition.
				2720	*
				2721	* Returns the attribute name, and the value in *value.
				2722	*/
				2723
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2724	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2725	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2726	xmlChar name, val = NULL;
				2727
				2728	*value = NULL;
				2729	name = htmlParseHTMLName(ctxt);
				2730	if (name == NULL) {
				2731	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2732	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2733	ctxt->wellFormed = 0;
				2734	return(NULL);
				2735	}
				2736
				2737	/*
				2738	* read the value
				2739	*/
				2740	SKIP_BLANKS;
				2741	if (CUR == '=') {
				2742	NEXT;
				2743	SKIP_BLANKS;
				2744	val = htmlParseAttValue(ctxt);
				2745	/******
				2746	} else {
				2747	* TODO : some attribute must have values, some may not
				2748	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2749	ctxt->sax->warning(ctxt->userData,
				2750	"No value for attribute %s\n", name); */
				2751	}
				2752
				2753	*value = val;
				2754	return(name);
				2755	}
				2756
				2757	/**
				2758	* htmlCheckEncoding:
				2759	* @ctxt: an HTML parser context
				2760	* @attvalue: the attribute value
				2761	*
				2762	* Checks an http-equiv attribute from a Meta tag to detect
				2763	* the encoding
				2764	* If a new encoding is detected the parser is switched to decode
				2765	* it and pass UTF8
				2766	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2767	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2768	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2769	const xmlChar *encoding;
				2770
				2771	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2772	return;
				2773
				2774	/* do not change encoding */
				2775	if (ctxt->input->encoding != NULL)
				2776	return;
				2777
				2778	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2779	if (encoding != NULL) {
				2780	encoding += 8;
				2781	} else {
				2782	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2783	if (encoding != NULL)
				2784	encoding += 9;
				2785	}
				2786	if (encoding != NULL) {
				2787	xmlCharEncoding enc;
				2788	xmlCharEncodingHandlerPtr handler;
				2789
				2790	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2791
				2792	if (ctxt->input->encoding != NULL)
				2793	xmlFree((xmlChar *) ctxt->input->encoding);
				2794	ctxt->input->encoding = xmlStrdup(encoding);
				2795
				2796	enc = xmlParseCharEncoding((const char *) encoding);
				2797	/*
				2798	* registered set of known encodings
				2799	*/
				2800	if (enc != XML_CHAR_ENCODING_ERROR) {
				2801	xmlSwitchEncoding(ctxt, enc);
				2802	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2803	} else {
				2804	/*
				2805	* fallback for unknown encodings
				2806	*/
				2807	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2808	if (handler != NULL) {
				2809	xmlSwitchToEncoding(ctxt, handler);
				2810	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2811	} else {
				2812	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2813	}
				2814	}
				2815
				2816	if ((ctxt->input->buf != NULL) &&
				2817	(ctxt->input->buf->encoder != NULL) &&
				2818	(ctxt->input->buf->raw != NULL) &&
				2819	(ctxt->input->buf->buffer != NULL)) {
				2820	int nbchars;
				2821	int processed;
				2822
				2823	/*
				2824	* convert as much as possible to the parser reading buffer.
				2825	*/
				2826	processed = ctxt->input->cur - ctxt->input->base;
				2827	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2828	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2829	ctxt->input->buf->buffer,
				2830	ctxt->input->buf->raw);
				2831	if (nbchars < 0) {
				2832	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2833	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2834	ctxt->sax->error(ctxt->userData,
				2835	"htmlCheckEncoding: encoder error\n");
				2836	}
				2837	ctxt->input->base =
				2838	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2839	}
				2840	}
				2841	}
				2842
				2843	/**
				2844	* htmlCheckMeta:
				2845	* @ctxt: an HTML parser context
				2846	* @atts: the attributes values
				2847	*
				2848	* Checks an attributes from a Meta tag
				2849	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2850	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2851	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2852	int i;
				2853	const xmlChar att, value;
				2854	int http = 0;
				2855	const xmlChar *content = NULL;
				2856
				2857	if ((ctxt == NULL) \|\| (atts == NULL))
				2858	return;
				2859
				2860	i = 0;
				2861	att = atts[i++];
				2862	while (att != NULL) {
				2863	value = atts[i++];
				2864	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2865	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2866	http = 1;
				2867	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2868	content = value;
				2869	att = atts[i++];
				2870	}
				2871	if ((http) && (content != NULL))
				2872	htmlCheckEncoding(ctxt, content);
				2873
				2874	}
				2875
				2876	/**
				2877	* htmlParseStartTag:
				2878	* @ctxt: an HTML parser context
				2879	*
				2880	* parse a start of tag either for rule element or
				2881	* EmptyElement. In both case we don't parse the tag closing chars.
				2882	*
				2883	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2884	*
				2885	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2886	*
				2887	* With namespace:
				2888	*
				2889	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2890	*
				2891	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2892	*
				2893	*/
				2894
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2895	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2896	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2897	xmlChar *name;
				2898	xmlChar *attname;
				2899	xmlChar *attvalue;
				2900	const xmlChar **atts = NULL;
				2901	int nbatts = 0;
				2902	int maxatts = 0;
				2903	int meta = 0;
				2904	int i;
				2905
				2906	if (CUR != '<') return;
				2907	NEXT;
				2908
				2909	GROW;
				2910	name = htmlParseHTMLName(ctxt);
				2911	if (name == NULL) {
				2912	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2913	ctxt->sax->error(ctxt->userData,
				2914	"htmlParseStartTag: invalid element name\n");
				2915	ctxt->wellFormed = 0;
				2916	/* Dump the bogus tag like browsers do */
				2917	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2918	NEXT;
				2919	return;
				2920	}
				2921	if (xmlStrEqual(name, BAD_CAST"meta"))
				2922	meta = 1;
				2923
				2924	/*
				2925	* Check for auto-closure of HTML elements.
				2926	*/
				2927	htmlAutoClose(ctxt, name);
				2928
				2929	/*
				2930	* Check for implied HTML elements.
				2931	*/
				2932	htmlCheckImplied(ctxt, name);
				2933
				2934	/*
				2935	* Avoid html at any level > 0, head at any level != 1
				2936	* or any attempt to recurse body
				2937	*/
				2938	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2939	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2940	ctxt->sax->error(ctxt->userData,
				2941	"htmlParseStartTag: misplaced <html> tag\n");
				2942	ctxt->wellFormed = 0;
				2943	xmlFree(name);
				2944	return;
				2945	}
				2946	if ((ctxt->nameNr != 1) &&
				2947	(xmlStrEqual(name, BAD_CAST"head"))) {
				2948	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2949	ctxt->sax->error(ctxt->userData,
				2950	"htmlParseStartTag: misplaced <head> tag\n");
				2951	ctxt->wellFormed = 0;
				2952	xmlFree(name);
				2953	return;
				2954	}
				2955	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2956	int indx;
				2957	for (indx = 0;indx < ctxt->nameNr;indx++) {
				2958	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2959	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2960	ctxt->sax->error(ctxt->userData,
				2961	"htmlParseStartTag: misplaced <body> tag\n");
				2962	ctxt->wellFormed = 0;
				2963	xmlFree(name);
				2964	return;
				2965	}
				2966	}
				2967	}
				2968
				2969	/*
				2970	* Now parse the attributes, it ends up with the ending
				2971	*
				2972	* (S Attribute)* S?
				2973	*/
				2974	SKIP_BLANKS;
				2975	while ((IS_CHAR(CUR)) &&
				2976	(CUR != '>') &&
				2977	((CUR != '/') \|\| (NXT(1) != '>'))) {
				2978	long cons = ctxt->nbChars;
				2979
				2980	GROW;
				2981	attname = htmlParseAttribute(ctxt, &attvalue);
				2982	if (attname != NULL) {
				2983
				2984	/*
				2985	* Well formedness requires at most one declaration of an attribute
				2986	*/
				2987	for (i = 0; i < nbatts;i += 2) {
				2988	if (xmlStrEqual(atts[i], attname)) {
				2989	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2990	ctxt->sax->error(ctxt->userData,
				2991	"Attribute %s redefined\n",
				2992	attname);
				2993	ctxt->wellFormed = 0;
				2994	xmlFree(attname);
				2995	if (attvalue != NULL)
				2996	xmlFree(attvalue);
				2997	goto failed;
				2998	}
				2999	}
				3000
				3001	/*
				3002	* Add the pair to atts
				3003	*/
				3004	if (atts == NULL) {
				3005	maxatts = 10;
				3006	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				3007	if (atts == NULL) {
				3008	xmlGenericError(xmlGenericErrorContext,
				3009	"malloc of %ld byte failed\n",
				3010	maxatts * (long)sizeof(xmlChar *));
				3011	if (name != NULL) xmlFree(name);
				3012	return;
				3013	}
				3014	} else if (nbatts + 4 > maxatts) {
				3015	maxatts *= 2;
				3016	atts = (const xmlChar *) xmlRealloc((void ) atts,
				3017	maxatts * sizeof(xmlChar *));
				3018	if (atts == NULL) {
				3019	xmlGenericError(xmlGenericErrorContext,
				3020	"realloc of %ld byte failed\n",
				3021	maxatts * (long)sizeof(xmlChar *));
				3022	if (name != NULL) xmlFree(name);
				3023	return;
				3024	}
				3025	}
				3026	atts[nbatts++] = attname;
				3027	atts[nbatts++] = attvalue;
				3028	atts[nbatts] = NULL;
				3029	atts[nbatts + 1] = NULL;
				3030	}
				3031	else {
				3032	/* Dump the bogus attribute string up to the next blank or
				3033	* the end of the tag. */
				3034	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				3035	&& ((CUR != '/') \|\| (NXT(1) != '>')))
				3036	NEXT;
				3037	}
				3038
				3039	failed:
				3040	SKIP_BLANKS;
				3041	if (cons == ctxt->nbChars) {
				3042	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3043	ctxt->sax->error(ctxt->userData,
				3044	"htmlParseStartTag: problem parsing attributes\n");
				3045	ctxt->wellFormed = 0;
				3046	break;
				3047	}
				3048	}
				3049
				3050	/*
				3051	* Handle specific association to the META tag
				3052	*/
				3053	if (meta)
				3054	htmlCheckMeta(ctxt, atts);
				3055
				3056	/*
				3057	* SAX: Start of Element !
				3058	*/
				3059	htmlnamePush(ctxt, xmlStrdup(name));
				3060	#ifdef DEBUG
				3061	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3062	#endif
				3063	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3064	ctxt->sax->startElement(ctxt->userData, name, atts);
				3065
				3066	if (atts != NULL) {
				3067	for (i = 0;i < nbatts;i++) {
				3068	if (atts[i] != NULL)
				3069	xmlFree((xmlChar *) atts[i]);
				3070	}
				3071	xmlFree((void *) atts);
				3072	}
				3073	if (name != NULL) xmlFree(name);
				3074	}
				3075
				3076	/**
				3077	* htmlParseEndTag:
				3078	* @ctxt: an HTML parser context
				3079	*
				3080	* parse an end of tag
				3081	*
				3082	* [42] ETag ::= '</' Name S? '>'
				3083	*
				3084	* With namespace
				3085	*
				3086	* [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame^]	3087	*
				3088	* Returns 1 if the current level should be closed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3089	*/
				3090
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame^]	3091	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3092	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3093	xmlChar *name;
				3094	xmlChar *oldname;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame^]	3095	int i, ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3096
				3097	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3098	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3099	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3100	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame^]	3101	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3102	}
				3103	SKIP(2);
				3104
				3105	name = htmlParseHTMLName(ctxt);
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame^]	3106	if (name == NULL) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3107
				3108	/*
				3109	* We should definitely be at the ending "S? '>'" part
				3110	*/
				3111	SKIP_BLANKS;
				3112	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3113	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3114	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3115	ctxt->wellFormed = 0;
				3116	} else
				3117	NEXT;
				3118
				3119	/*
				3120	* If the name read is not one of the element in the parsing stack
				3121	* then return, it's just an error.
				3122	*/
				3123	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3124	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3125	}
				3126	if (i < 0) {
				3127	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3128	ctxt->sax->error(ctxt->userData,
				3129	"Unexpected end tag : %s\n", name);
				3130	xmlFree(name);
				3131	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame^]	3132	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3133	}
				3134
				3135
				3136	/*
				3137	* Check for auto-closure of HTML elements.
				3138	*/
				3139
				3140	htmlAutoCloseOnClose(ctxt, name);
				3141
				3142	/*
				3143	* Well formedness constraints, opening and closing must match.
				3144	* With the exception that the autoclose may have popped stuff out
				3145	* of the stack.
				3146	*/
				3147	if (!xmlStrEqual(name, ctxt->name)) {
				3148	#ifdef DEBUG
				3149	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3150	#endif
				3151	if ((ctxt->name != NULL) &&
				3152	(!xmlStrEqual(ctxt->name, name))) {
				3153	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3154	ctxt->sax->error(ctxt->userData,
				3155	"Opening and ending tag mismatch: %s and %s\n",
				3156	name, ctxt->name);
				3157	ctxt->wellFormed = 0;
				3158	}
				3159	}
				3160
				3161	/*
				3162	* SAX: End of Tag
				3163	*/
				3164	oldname = ctxt->name;
				3165	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3166	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3167	ctxt->sax->endElement(ctxt->userData, name);
				3168	oldname = htmlnamePop(ctxt);
				3169	if (oldname != NULL) {
				3170	#ifdef DEBUG
				3171	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3172	#endif
				3173	xmlFree(oldname);
				3174	#ifdef DEBUG
				3175	} else {
				3176	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3177	#endif
				3178	}
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame^]	3179	ret = 1;
				3180	} else {
				3181	ret = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3182	}
				3183
				3184	if (name != NULL)
				3185	xmlFree(name);
				3186
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame^]	3187	return(ret);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3188	}
				3189
				3190
				3191	/**
				3192	* htmlParseReference:
				3193	* @ctxt: an HTML parser context
				3194	*
				3195	* parse and handle entity references in content,
				3196	* this will end-up in a call to character() since this is either a
				3197	* CharRef, or a predefined entity.
				3198	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3199	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3200	htmlParseReference(htmlParserCtxtPtr ctxt) {
				3201	htmlEntityDescPtr ent;
				3202	xmlChar out[6];
				3203	xmlChar *name;
				3204	if (CUR != '&') return;
				3205
				3206	if (NXT(1) == '#') {
				3207	unsigned int c;
				3208	int bits, i = 0;
				3209
				3210	c = htmlParseCharRef(ctxt);
				3211	if (c == 0)
				3212	return;
				3213
				3214	if (c < 0x80) { out[i++]= c; bits= -6; }
				3215	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3216	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3217	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3218
				3219	for ( ; bits >= 0; bits-= 6) {
				3220	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3221	}
				3222	out[i] = 0;
				3223
				3224	htmlCheckParagraph(ctxt);
				3225	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3226	ctxt->sax->characters(ctxt->userData, out, i);
				3227	} else {
				3228	ent = htmlParseEntityRef(ctxt, &name);
				3229	if (name == NULL) {
				3230	htmlCheckParagraph(ctxt);
				3231	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3232	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3233	return;
				3234	}
				3235	if ((ent == NULL) \|\| (ent->value <= 0)) {
				3236	htmlCheckParagraph(ctxt);
				3237	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3238	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3239	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3240	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3241	}
				3242	} else {
				3243	unsigned int c;
				3244	int bits, i = 0;
				3245
				3246	c = ent->value;
				3247	if (c < 0x80)
				3248	{ out[i++]= c; bits= -6; }
				3249	else if (c < 0x800)
				3250	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3251	else if (c < 0x10000)
				3252	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3253	else
				3254	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3255
				3256	for ( ; bits >= 0; bits-= 6) {
				3257	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3258	}
				3259	out[i] = 0;
				3260
				3261	htmlCheckParagraph(ctxt);
				3262	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3263	ctxt->sax->characters(ctxt->userData, out, i);
				3264	}
				3265	xmlFree(name);
				3266	}
				3267	}
				3268
				3269	/**
				3270	* htmlParseContent:
				3271	* @ctxt: an HTML parser context
				3272	* @name: the node name
				3273	*
				3274	* Parse a content: comment, sub-element, reference or text.
				3275	*
				3276	*/
				3277
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3278	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3279	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3280	xmlChar *currentNode;
				3281	int depth;
				3282
				3283	currentNode = xmlStrdup(ctxt->name);
				3284	depth = ctxt->nameNr;
				3285	while (1) {
				3286	long cons = ctxt->nbChars;
				3287
				3288	GROW;
				3289	/*
				3290	* Our tag or one of it's parent or children is ending.
				3291	*/
				3292	if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame^]	3293	if (htmlParseEndTag(ctxt) &&
				3294	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				3295	if (currentNode != NULL)
				3296	xmlFree(currentNode);
				3297	return;
				3298	}
				3299	continue; /* while */
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3300	}
				3301
				3302	/*
				3303	* Has this node been popped out during parsing of
				3304	* the next element
				3305	*/
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame^]	3306	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				3307	(!xmlStrEqual(currentNode, ctxt->name)))
				3308	{
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3309	if (currentNode != NULL) xmlFree(currentNode);
				3310	return;
				3311	}
				3312
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3313	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3314	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3315	/*
				3316	* Handle SCRIPT/STYLE separately
				3317	*/
				3318	htmlParseScript(ctxt);
				3319	} else {
				3320	/*
				3321	* Sometimes DOCTYPE arrives in the middle of the document
				3322	*/
				3323	if ((CUR == '<') && (NXT(1) == '!') &&
				3324	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3325	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3326	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3327	(UPP(8) == 'E')) {
				3328	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3329	ctxt->sax->error(ctxt->userData,
				3330	"Misplaced DOCTYPE declaration\n");
				3331	ctxt->wellFormed = 0;
				3332	htmlParseDocTypeDecl(ctxt);
				3333	}
				3334
				3335	/*
				3336	* First case : a comment
				3337	*/
				3338	if ((CUR == '<') && (NXT(1) == '!') &&
				3339	(NXT(2) == '-') && (NXT(3) == '-')) {
				3340	htmlParseComment(ctxt);
				3341	}
				3342
				3343	/*
				3344	* Second case : a sub-element.
				3345	*/
				3346	else if (CUR == '<') {
				3347	htmlParseElement(ctxt);
				3348	}
				3349
				3350	/*
				3351	* Third case : a reference. If if has not been resolved,
				3352	* parsing returns it's Name, create the node
				3353	*/
				3354	else if (CUR == '&') {
				3355	htmlParseReference(ctxt);
				3356	}
				3357
				3358	/*
				3359	* Fourth : end of the resource
				3360	*/
				3361	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3362	htmlAutoCloseOnEnd(ctxt);
				3363	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3364	}
				3365
				3366	/*
				3367	* Last case, text. Note that References are handled directly.
				3368	*/
				3369	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3370	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3371	}
				3372
				3373	if (cons == ctxt->nbChars) {
				3374	if (ctxt->node != NULL) {
				3375	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3376	ctxt->sax->error(ctxt->userData,
				3377	"detected an error in element content\n");
				3378	ctxt->wellFormed = 0;
				3379	}
				3380	break;
				3381	}
				3382	}
				3383	GROW;
				3384	}
				3385	if (currentNode != NULL) xmlFree(currentNode);
				3386	}
				3387
				3388	/**
				3389	* htmlParseElement:
				3390	* @ctxt: an HTML parser context
				3391	*
				3392	* parse an HTML element, this is highly recursive
				3393	*
				3394	* [39] element ::= EmptyElemTag \| STag content ETag
				3395	*
				3396	* [41] Attribute ::= Name Eq AttValue
				3397	*/
				3398
				3399	void
				3400	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3401	xmlChar *name;
				3402	xmlChar *currentNode = NULL;
				3403	htmlElemDescPtr info;
				3404	htmlParserNodeInfo node_info;
				3405	xmlChar *oldname;
				3406	int depth = ctxt->nameNr;
				3407
				3408	/* Capture start position */
				3409	if (ctxt->record_info) {
				3410	node_info.begin_pos = ctxt->input->consumed +
				3411	(CUR_PTR - ctxt->input->base);
				3412	node_info.begin_line = ctxt->input->line;
				3413	}
				3414
				3415	oldname = xmlStrdup(ctxt->name);
				3416	htmlParseStartTag(ctxt);
				3417	name = ctxt->name;
				3418	#ifdef DEBUG
				3419	if (oldname == NULL)
				3420	xmlGenericError(xmlGenericErrorContext,
				3421	"Start of element %s\n", name);
				3422	else if (name == NULL)
				3423	xmlGenericError(xmlGenericErrorContext,
				3424	"Start of element failed, was %s\n", oldname);
				3425	else
				3426	xmlGenericError(xmlGenericErrorContext,
				3427	"Start of element %s, was %s\n", name, oldname);
				3428	#endif
				3429	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3430	(name == NULL)) {
				3431	if (CUR == '>')
				3432	NEXT;
				3433	if (oldname != NULL)
				3434	xmlFree(oldname);
				3435	return;
				3436	}
				3437	if (oldname != NULL)
				3438	xmlFree(oldname);
				3439
				3440	/*
				3441	* Lookup the info for that element.
				3442	*/
				3443	info = htmlTagLookup(name);
				3444	if (info == NULL) {
				3445	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3446	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3447	name);
				3448	ctxt->wellFormed = 0;
				3449	} else if (info->depr) {
				3450	/***************************
				3451	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3452	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3453	name);
				3454	***************************/
				3455	}
				3456
				3457	/*
				3458	* Check for an Empty Element labelled the XML/SGML way
				3459	*/
				3460	if ((CUR == '/') && (NXT(1) == '>')) {
				3461	SKIP(2);
				3462	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3463	ctxt->sax->endElement(ctxt->userData, name);
				3464	oldname = htmlnamePop(ctxt);
				3465	#ifdef DEBUG
				3466	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3467	#endif
				3468	if (oldname != NULL)
				3469	xmlFree(oldname);
				3470	return;
				3471	}
				3472
				3473	if (CUR == '>') {
				3474	NEXT;
				3475	} else {
				3476	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3477	ctxt->sax->error(ctxt->userData,
				3478	"Couldn't find end of Start Tag %s\n",
				3479	name);
				3480	ctxt->wellFormed = 0;
				3481
				3482	/*
				3483	* end of parsing of this node.
				3484	*/
				3485	if (xmlStrEqual(name, ctxt->name)) {
				3486	nodePop(ctxt);
				3487	oldname = htmlnamePop(ctxt);
				3488	#ifdef DEBUG
				3489	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3490	#endif
				3491	if (oldname != NULL)
				3492	xmlFree(oldname);
				3493	}
				3494
				3495	/*
				3496	* Capture end position and add node
				3497	*/
				3498	if ( currentNode != NULL && ctxt->record_info ) {
				3499	node_info.end_pos = ctxt->input->consumed +
				3500	(CUR_PTR - ctxt->input->base);
				3501	node_info.end_line = ctxt->input->line;
				3502	node_info.node = ctxt->node;
				3503	xmlParserAddNodeInfo(ctxt, &node_info);
				3504	}
				3505	return;
				3506	}
				3507
				3508	/*
				3509	* Check for an Empty Element from DTD definition
				3510	*/
				3511	if ((info != NULL) && (info->empty)) {
				3512	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3513	ctxt->sax->endElement(ctxt->userData, name);
				3514	oldname = htmlnamePop(ctxt);
				3515	#ifdef DEBUG
				3516	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3517	#endif
				3518	if (oldname != NULL)
				3519	xmlFree(oldname);
				3520	return;
				3521	}
				3522
				3523	/*
				3524	* Parse the content of the element:
				3525	*/
				3526	currentNode = xmlStrdup(ctxt->name);
				3527	depth = ctxt->nameNr;
				3528	while (IS_CHAR(CUR)) {
				3529	htmlParseContent(ctxt);
				3530	if (ctxt->nameNr < depth) break;
				3531	}
				3532
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3533	/*
				3534	* Capture end position and add node
				3535	*/
				3536	if ( currentNode != NULL && ctxt->record_info ) {
				3537	node_info.end_pos = ctxt->input->consumed +
				3538	(CUR_PTR - ctxt->input->base);
				3539	node_info.end_line = ctxt->input->line;
				3540	node_info.node = ctxt->node;
				3541	xmlParserAddNodeInfo(ctxt, &node_info);
				3542	}
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3543	if (!IS_CHAR(CUR)) {
				3544	htmlAutoCloseOnEnd(ctxt);
				3545	}
				3546
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3547	if (currentNode != NULL)
				3548	xmlFree(currentNode);
				3549	}
				3550
				3551	/**
				3552	* htmlParseDocument :
				3553	* @ctxt: an HTML parser context
				3554	*
				3555	* parse an HTML document (and build a tree if using the standard SAX
				3556	* interface).
				3557	*
				3558	* Returns 0, -1 in case of error. the parser context is augmented
				3559	* as a result of the parsing.
				3560	*/
				3561
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3562	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3563	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3564	xmlDtdPtr dtd;
				3565
				3566	htmlDefaultSAXHandlerInit();
				3567	ctxt->html = 1;
				3568
				3569	GROW;
				3570	/*
				3571	* SAX: beginning of the document processing.
				3572	*/
				3573	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3574	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3575
				3576	/*
				3577	* Wipe out everything which is before the first '<'
				3578	*/
				3579	SKIP_BLANKS;
				3580	if (CUR == 0) {
				3581	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3582	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3583	ctxt->wellFormed = 0;
				3584	}
				3585
				3586	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3587	ctxt->sax->startDocument(ctxt->userData);
				3588
				3589
				3590	/*
				3591	* Parse possible comments before any content
				3592	*/
				3593	while ((CUR == '<') && (NXT(1) == '!') &&
				3594	(NXT(2) == '-') && (NXT(3) == '-')) {
				3595	htmlParseComment(ctxt);
				3596	SKIP_BLANKS;
				3597	}
				3598
				3599
				3600	/*
				3601	* Then possibly doc type declaration(s) and more Misc
				3602	* (doctypedecl Misc*)?
				3603	*/
				3604	if ((CUR == '<') && (NXT(1) == '!') &&
				3605	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3606	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3607	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3608	(UPP(8) == 'E')) {
				3609	htmlParseDocTypeDecl(ctxt);
				3610	}
				3611	SKIP_BLANKS;
				3612
				3613	/*
				3614	* Parse possible comments before any content
				3615	*/
				3616	while ((CUR == '<') && (NXT(1) == '!') &&
				3617	(NXT(2) == '-') && (NXT(3) == '-')) {
				3618	htmlParseComment(ctxt);
				3619	SKIP_BLANKS;
				3620	}
				3621
				3622	/*
				3623	* Time to start parsing the tree itself
				3624	*/
				3625	htmlParseContent(ctxt);
				3626
				3627	/*
				3628	* autoclose
				3629	*/
				3630	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3631	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3632
				3633
				3634	/*
				3635	* SAX: end of the document processing.
				3636	*/
				3637	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3638	ctxt->sax->endDocument(ctxt->userData);
				3639
				3640	if (ctxt->myDoc != NULL) {
				3641	dtd = xmlGetIntSubset(ctxt->myDoc);
				3642	if (dtd == NULL)
				3643	ctxt->myDoc->intSubset =
				3644	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3645	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3646	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3647	}
				3648	if (! ctxt->wellFormed) return(-1);
				3649	return(0);
				3650	}
				3651
				3652
				3653	/************************************************************************
				3654	* *
				3655	* Parser contexts handling *
				3656	* *
				3657	************************************************************************/
				3658
				3659	/**
				3660	* xmlInitParserCtxt:
				3661	* @ctxt: an HTML parser context
				3662	*
				3663	* Initialize a parser context
				3664	*/
				3665
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3666	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3667	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3668	{
				3669	htmlSAXHandler *sax;
				3670
				3671	if (ctxt == NULL) return;
				3672	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3673
				3674	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3675	if (sax == NULL) {
				3676	xmlGenericError(xmlGenericErrorContext,
				3677	"htmlInitParserCtxt: out of memory\n");
				3678	}
				3679	else
				3680	memset(sax, 0, sizeof(htmlSAXHandler));
				3681
				3682	/* Allocate the Input stack */
				3683	ctxt->inputTab = (htmlParserInputPtr *)
				3684	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3685	if (ctxt->inputTab == NULL) {
				3686	xmlGenericError(xmlGenericErrorContext,
				3687	"htmlInitParserCtxt: out of memory\n");
				3688	ctxt->inputNr = 0;
				3689	ctxt->inputMax = 0;
				3690	ctxt->input = NULL;
				3691	return;
				3692	}
				3693	ctxt->inputNr = 0;
				3694	ctxt->inputMax = 5;
				3695	ctxt->input = NULL;
				3696	ctxt->version = NULL;
				3697	ctxt->encoding = NULL;
				3698	ctxt->standalone = -1;
				3699	ctxt->instate = XML_PARSER_START;
				3700
				3701	/* Allocate the Node stack */
				3702	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3703	if (ctxt->nodeTab == NULL) {
				3704	xmlGenericError(xmlGenericErrorContext,
				3705	"htmlInitParserCtxt: out of memory\n");
				3706	ctxt->nodeNr = 0;
				3707	ctxt->nodeMax = 0;
				3708	ctxt->node = NULL;
				3709	ctxt->inputNr = 0;
				3710	ctxt->inputMax = 0;
				3711	ctxt->input = NULL;
				3712	return;
				3713	}
				3714	ctxt->nodeNr = 0;
				3715	ctxt->nodeMax = 10;
				3716	ctxt->node = NULL;
				3717
				3718	/* Allocate the Name stack */
				3719	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3720	if (ctxt->nameTab == NULL) {
				3721	xmlGenericError(xmlGenericErrorContext,
				3722	"htmlInitParserCtxt: out of memory\n");
				3723	ctxt->nameNr = 0;
				3724	ctxt->nameMax = 10;
				3725	ctxt->name = NULL;
				3726	ctxt->nodeNr = 0;
				3727	ctxt->nodeMax = 0;
				3728	ctxt->node = NULL;
				3729	ctxt->inputNr = 0;
				3730	ctxt->inputMax = 0;
				3731	ctxt->input = NULL;
				3732	return;
				3733	}
				3734	ctxt->nameNr = 0;
				3735	ctxt->nameMax = 10;
				3736	ctxt->name = NULL;
				3737
				3738	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3739	else {
				3740	ctxt->sax = sax;
				3741	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3742	}
				3743	ctxt->userData = ctxt;
				3744	ctxt->myDoc = NULL;
				3745	ctxt->wellFormed = 1;
				3746	ctxt->replaceEntities = 0;
				3747	ctxt->html = 1;
				3748	ctxt->record_info = 0;
				3749	ctxt->validate = 0;
				3750	ctxt->nbChars = 0;
				3751	ctxt->checkIndex = 0;
				3752	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3753	}
				3754
				3755	/**
				3756	* htmlFreeParserCtxt:
				3757	* @ctxt: an HTML parser context
				3758	*
				3759	* Free all the memory used by a parser context. However the parsed
				3760	* document in ctxt->myDoc is not freed.
				3761	*/
				3762
				3763	void
				3764	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3765	{
				3766	xmlFreeParserCtxt(ctxt);
				3767	}
				3768
				3769	/**
				3770	* htmlCreateDocParserCtxt :
				3771	* @cur: a pointer to an array of xmlChar
				3772	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3773	*
				3774	* Create a parser context for an HTML document.
				3775	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3776	* TODO: check the need to add encoding handling there
				3777	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3778	* Returns the new parser context or NULL
				3779	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3780	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3781	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3782	htmlParserCtxtPtr ctxt;
				3783	htmlParserInputPtr input;
				3784	/* htmlCharEncoding enc; */
				3785
				3786	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				3787	if (ctxt == NULL) {
				3788	perror("malloc");
				3789	return(NULL);
				3790	}
				3791	htmlInitParserCtxt(ctxt);
				3792	input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				3793	if (input == NULL) {
				3794	perror("malloc");
				3795	xmlFree(ctxt);
				3796	return(NULL);
				3797	}
				3798	memset(input, 0, sizeof(htmlParserInput));
				3799
				3800	input->line = 1;
				3801	input->col = 1;
				3802	input->base = cur;
				3803	input->cur = cur;
				3804
				3805	inputPush(ctxt, input);
				3806	return(ctxt);
				3807	}
				3808
				3809	/************************************************************************
				3810	* *
				3811	* Progressive parsing interfaces *
				3812	* *
				3813	************************************************************************/
				3814
				3815	/**
				3816	* htmlParseLookupSequence:
				3817	* @ctxt: an HTML parser context
				3818	* @first: the first char to lookup
				3819	* @next: the next char to lookup or zero
				3820	* @third: the next char to lookup or zero
				3821	*
				3822	* Try to find if a sequence (first, next, third) or just (first next) or
				3823	* (first) is available in the input stream.
				3824	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3825	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3826	* parser, do not use liberally.
				3827	* This is basically similar to xmlParseLookupSequence()
				3828	*
				3829	* Returns the index to the current parsing point if the full sequence
				3830	* is available, -1 otherwise.
				3831	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3832	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3833	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3834	xmlChar next, xmlChar third) {
				3835	int base, len;
				3836	htmlParserInputPtr in;
				3837	const xmlChar *buf;
				3838
				3839	in = ctxt->input;
				3840	if (in == NULL) return(-1);
				3841	base = in->cur - in->base;
				3842	if (base < 0) return(-1);
				3843	if (ctxt->checkIndex > base)
				3844	base = ctxt->checkIndex;
				3845	if (in->buf == NULL) {
				3846	buf = in->base;
				3847	len = in->length;
				3848	} else {
				3849	buf = in->buf->buffer->content;
				3850	len = in->buf->buffer->use;
				3851	}
				3852	/* take into account the sequence length */
				3853	if (third) len -= 2;
				3854	else if (next) len --;
				3855	for (;base < len;base++) {
				3856	if (buf[base] == first) {
				3857	if (third != 0) {
				3858	if ((buf[base + 1] != next) \|\|
				3859	(buf[base + 2] != third)) continue;
				3860	} else if (next != 0) {
				3861	if (buf[base + 1] != next) continue;
				3862	}
				3863	ctxt->checkIndex = 0;
				3864	#ifdef DEBUG_PUSH
				3865	if (next == 0)
				3866	xmlGenericError(xmlGenericErrorContext,
				3867	"HPP: lookup '%c' found at %d\n",
				3868	first, base);
				3869	else if (third == 0)
				3870	xmlGenericError(xmlGenericErrorContext,
				3871	"HPP: lookup '%c%c' found at %d\n",
				3872	first, next, base);
				3873	else
				3874	xmlGenericError(xmlGenericErrorContext,
				3875	"HPP: lookup '%c%c%c' found at %d\n",
				3876	first, next, third, base);
				3877	#endif
				3878	return(base - (in->cur - in->base));
				3879	}
				3880	}
				3881	ctxt->checkIndex = base;
				3882	#ifdef DEBUG_PUSH
				3883	if (next == 0)
				3884	xmlGenericError(xmlGenericErrorContext,
				3885	"HPP: lookup '%c' failed\n", first);
				3886	else if (third == 0)
				3887	xmlGenericError(xmlGenericErrorContext,
				3888	"HPP: lookup '%c%c' failed\n", first, next);
				3889	else
				3890	xmlGenericError(xmlGenericErrorContext,
				3891	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				3892	#endif
				3893	return(-1);
				3894	}
				3895
				3896	/**
				3897	* htmlParseTryOrFinish:
				3898	* @ctxt: an HTML parser context
				3899	* @terminate: last chunk indicator
				3900	*
				3901	* Try to progress on parsing
				3902	*
				3903	* Returns zero if no parsing was possible
				3904	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3905	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3906	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				3907	int ret = 0;
				3908	htmlParserInputPtr in;
				3909	int avail = 0;
				3910	xmlChar cur, next;
				3911
				3912	#ifdef DEBUG_PUSH
				3913	switch (ctxt->instate) {
				3914	case XML_PARSER_EOF:
				3915	xmlGenericError(xmlGenericErrorContext,
				3916	"HPP: try EOF\n"); break;
				3917	case XML_PARSER_START:
				3918	xmlGenericError(xmlGenericErrorContext,
				3919	"HPP: try START\n"); break;
				3920	case XML_PARSER_MISC:
				3921	xmlGenericError(xmlGenericErrorContext,
				3922	"HPP: try MISC\n");break;
				3923	case XML_PARSER_COMMENT:
				3924	xmlGenericError(xmlGenericErrorContext,
				3925	"HPP: try COMMENT\n");break;
				3926	case XML_PARSER_PROLOG:
				3927	xmlGenericError(xmlGenericErrorContext,
				3928	"HPP: try PROLOG\n");break;
				3929	case XML_PARSER_START_TAG:
				3930	xmlGenericError(xmlGenericErrorContext,
				3931	"HPP: try START_TAG\n");break;
				3932	case XML_PARSER_CONTENT:
				3933	xmlGenericError(xmlGenericErrorContext,
				3934	"HPP: try CONTENT\n");break;
				3935	case XML_PARSER_CDATA_SECTION:
				3936	xmlGenericError(xmlGenericErrorContext,
				3937	"HPP: try CDATA_SECTION\n");break;
				3938	case XML_PARSER_END_TAG:
				3939	xmlGenericError(xmlGenericErrorContext,
				3940	"HPP: try END_TAG\n");break;
				3941	case XML_PARSER_ENTITY_DECL:
				3942	xmlGenericError(xmlGenericErrorContext,
				3943	"HPP: try ENTITY_DECL\n");break;
				3944	case XML_PARSER_ENTITY_VALUE:
				3945	xmlGenericError(xmlGenericErrorContext,
				3946	"HPP: try ENTITY_VALUE\n");break;
				3947	case XML_PARSER_ATTRIBUTE_VALUE:
				3948	xmlGenericError(xmlGenericErrorContext,
				3949	"HPP: try ATTRIBUTE_VALUE\n");break;
				3950	case XML_PARSER_DTD:
				3951	xmlGenericError(xmlGenericErrorContext,
				3952	"HPP: try DTD\n");break;
				3953	case XML_PARSER_EPILOG:
				3954	xmlGenericError(xmlGenericErrorContext,
				3955	"HPP: try EPILOG\n");break;
				3956	case XML_PARSER_PI:
				3957	xmlGenericError(xmlGenericErrorContext,
				3958	"HPP: try PI\n");break;
				3959	case XML_PARSER_SYSTEM_LITERAL:
				3960	xmlGenericError(xmlGenericErrorContext,
				3961	"HPP: try SYSTEM_LITERAL\n");break;
				3962	}
				3963	#endif
				3964
				3965	while (1) {
				3966
				3967	in = ctxt->input;
				3968	if (in == NULL) break;
				3969	if (in->buf == NULL)
				3970	avail = in->length - (in->cur - in->base);
				3971	else
				3972	avail = in->buf->buffer->use - (in->cur - in->base);
				3973	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3974	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3975	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				3976	/*
				3977	* SAX: end of the document processing.
				3978	*/
				3979	ctxt->instate = XML_PARSER_EOF;
				3980	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3981	ctxt->sax->endDocument(ctxt->userData);
				3982	}
				3983	}
				3984	if (avail < 1)
				3985	goto done;
				3986	switch (ctxt->instate) {
				3987	case XML_PARSER_EOF:
				3988	/*
				3989	* Document parsing is done !
				3990	*/
				3991	goto done;
				3992	case XML_PARSER_START:
				3993	/*
				3994	* Very first chars read from the document flow.
				3995	*/
				3996	cur = in->cur[0];
				3997	if (IS_BLANK(cur)) {
				3998	SKIP_BLANKS;
				3999	if (in->buf == NULL)
				4000	avail = in->length - (in->cur - in->base);
				4001	else
				4002	avail = in->buf->buffer->use - (in->cur - in->base);
				4003	}
				4004	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				4005	ctxt->sax->setDocumentLocator(ctxt->userData,
				4006	&xmlDefaultSAXLocator);
				4007	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				4008	(!ctxt->disableSAX))
				4009	ctxt->sax->startDocument(ctxt->userData);
				4010
				4011	cur = in->cur[0];
				4012	next = in->cur[1];
				4013	if ((cur == '<') && (next == '!') &&
				4014	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4015	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4016	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4017	(UPP(8) == 'E')) {
				4018	if ((!terminate) &&
				4019	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4020	goto done;
				4021	#ifdef DEBUG_PUSH
				4022	xmlGenericError(xmlGenericErrorContext,
				4023	"HPP: Parsing internal subset\n");
				4024	#endif
				4025	htmlParseDocTypeDecl(ctxt);
				4026	ctxt->instate = XML_PARSER_PROLOG;
				4027	#ifdef DEBUG_PUSH
				4028	xmlGenericError(xmlGenericErrorContext,
				4029	"HPP: entering PROLOG\n");
				4030	#endif
				4031	} else {
				4032	ctxt->instate = XML_PARSER_MISC;
				4033	}
				4034	#ifdef DEBUG_PUSH
				4035	xmlGenericError(xmlGenericErrorContext,
				4036	"HPP: entering MISC\n");
				4037	#endif
				4038	break;
				4039	case XML_PARSER_MISC:
				4040	SKIP_BLANKS;
				4041	if (in->buf == NULL)
				4042	avail = in->length - (in->cur - in->base);
				4043	else
				4044	avail = in->buf->buffer->use - (in->cur - in->base);
				4045	if (avail < 2)
				4046	goto done;
				4047	cur = in->cur[0];
				4048	next = in->cur[1];
				4049	if ((cur == '<') && (next == '!') &&
				4050	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4051	if ((!terminate) &&
				4052	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4053	goto done;
				4054	#ifdef DEBUG_PUSH
				4055	xmlGenericError(xmlGenericErrorContext,
				4056	"HPP: Parsing Comment\n");
				4057	#endif
				4058	htmlParseComment(ctxt);
				4059	ctxt->instate = XML_PARSER_MISC;
				4060	} else if ((cur == '<') && (next == '!') &&
				4061	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4062	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4063	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4064	(UPP(8) == 'E')) {
				4065	if ((!terminate) &&
				4066	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4067	goto done;
				4068	#ifdef DEBUG_PUSH
				4069	xmlGenericError(xmlGenericErrorContext,
				4070	"HPP: Parsing internal subset\n");
				4071	#endif
				4072	htmlParseDocTypeDecl(ctxt);
				4073	ctxt->instate = XML_PARSER_PROLOG;
				4074	#ifdef DEBUG_PUSH
				4075	xmlGenericError(xmlGenericErrorContext,
				4076	"HPP: entering PROLOG\n");
				4077	#endif
				4078	} else if ((cur == '<') && (next == '!') &&
				4079	(avail < 9)) {
				4080	goto done;
				4081	} else {
				4082	ctxt->instate = XML_PARSER_START_TAG;
				4083	#ifdef DEBUG_PUSH
				4084	xmlGenericError(xmlGenericErrorContext,
				4085	"HPP: entering START_TAG\n");
				4086	#endif
				4087	}
				4088	break;
				4089	case XML_PARSER_PROLOG:
				4090	SKIP_BLANKS;
				4091	if (in->buf == NULL)
				4092	avail = in->length - (in->cur - in->base);
				4093	else
				4094	avail = in->buf->buffer->use - (in->cur - in->base);
				4095	if (avail < 2)
				4096	goto done;
				4097	cur = in->cur[0];
				4098	next = in->cur[1];
				4099	if ((cur == '<') && (next == '!') &&
				4100	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4101	if ((!terminate) &&
				4102	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4103	goto done;
				4104	#ifdef DEBUG_PUSH
				4105	xmlGenericError(xmlGenericErrorContext,
				4106	"HPP: Parsing Comment\n");
				4107	#endif
				4108	htmlParseComment(ctxt);
				4109	ctxt->instate = XML_PARSER_PROLOG;
				4110	} else if ((cur == '<') && (next == '!') &&
				4111	(avail < 4)) {
				4112	goto done;
				4113	} else {
				4114	ctxt->instate = XML_PARSER_START_TAG;
				4115	#ifdef DEBUG_PUSH
				4116	xmlGenericError(xmlGenericErrorContext,
				4117	"HPP: entering START_TAG\n");
				4118	#endif
				4119	}
				4120	break;
				4121	case XML_PARSER_EPILOG:
				4122	if (in->buf == NULL)
				4123	avail = in->length - (in->cur - in->base);
				4124	else
				4125	avail = in->buf->buffer->use - (in->cur - in->base);
				4126	if (avail < 1)
				4127	goto done;
				4128	cur = in->cur[0];
				4129	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4130	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4131	goto done;
				4132	}
				4133	if (avail < 2)
				4134	goto done;
				4135	next = in->cur[1];
				4136	if ((cur == '<') && (next == '!') &&
				4137	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4138	if ((!terminate) &&
				4139	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4140	goto done;
				4141	#ifdef DEBUG_PUSH
				4142	xmlGenericError(xmlGenericErrorContext,
				4143	"HPP: Parsing Comment\n");
				4144	#endif
				4145	htmlParseComment(ctxt);
				4146	ctxt->instate = XML_PARSER_EPILOG;
				4147	} else if ((cur == '<') && (next == '!') &&
				4148	(avail < 4)) {
				4149	goto done;
				4150	} else {
				4151	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4152	ctxt->wellFormed = 0;
				4153	ctxt->instate = XML_PARSER_EOF;
				4154	#ifdef DEBUG_PUSH
				4155	xmlGenericError(xmlGenericErrorContext,
				4156	"HPP: entering EOF\n");
				4157	#endif
				4158	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4159	ctxt->sax->endDocument(ctxt->userData);
				4160	goto done;
				4161	}
				4162	break;
				4163	case XML_PARSER_START_TAG: {
				4164	xmlChar name, oldname;
				4165	int depth = ctxt->nameNr;
				4166	htmlElemDescPtr info;
				4167
				4168	if (avail < 2)
				4169	goto done;
				4170	cur = in->cur[0];
				4171	if (cur != '<') {
				4172	ctxt->instate = XML_PARSER_CONTENT;
				4173	#ifdef DEBUG_PUSH
				4174	xmlGenericError(xmlGenericErrorContext,
				4175	"HPP: entering CONTENT\n");
				4176	#endif
				4177	break;
				4178	}
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	4179	if (in->cur[1] == '/') {
				4180	ctxt->instate = XML_PARSER_END_TAG;
				4181	ctxt->checkIndex = 0;
				4182	#ifdef DEBUG_PUSH
				4183	xmlGenericError(xmlGenericErrorContext,
				4184	"HPP: entering END_TAG\n");
				4185	#endif
				4186	break;
				4187	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4188	if ((!terminate) &&
				4189	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4190	goto done;
				4191
				4192	oldname = xmlStrdup(ctxt->name);
				4193	htmlParseStartTag(ctxt);
				4194	name = ctxt->name;
				4195	#ifdef DEBUG
				4196	if (oldname == NULL)
				4197	xmlGenericError(xmlGenericErrorContext,
				4198	"Start of element %s\n", name);
				4199	else if (name == NULL)
				4200	xmlGenericError(xmlGenericErrorContext,
				4201	"Start of element failed, was %s\n",
				4202	oldname);
				4203	else
				4204	xmlGenericError(xmlGenericErrorContext,
				4205	"Start of element %s, was %s\n",
				4206	name, oldname);
				4207	#endif
				4208	if (((depth == ctxt->nameNr) &&
				4209	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4210	(name == NULL)) {
				4211	if (CUR == '>')
				4212	NEXT;
				4213	if (oldname != NULL)
				4214	xmlFree(oldname);
				4215	break;
				4216	}
				4217	if (oldname != NULL)
				4218	xmlFree(oldname);
				4219
				4220	/*
				4221	* Lookup the info for that element.
				4222	*/
				4223	info = htmlTagLookup(name);
				4224	if (info == NULL) {
				4225	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4226	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4227	name);
				4228	ctxt->wellFormed = 0;
				4229	} else if (info->depr) {
				4230	/***************************
				4231	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4232	ctxt->sax->warning(ctxt->userData,
				4233	"Tag %s is deprecated\n",
				4234	name);
				4235	***************************/
				4236	}
				4237
				4238	/*
				4239	* Check for an Empty Element labelled the XML/SGML way
				4240	*/
				4241	if ((CUR == '/') && (NXT(1) == '>')) {
				4242	SKIP(2);
				4243	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4244	ctxt->sax->endElement(ctxt->userData, name);
				4245	oldname = htmlnamePop(ctxt);
				4246	#ifdef DEBUG
				4247	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4248	oldname);
				4249	#endif
				4250	if (oldname != NULL)
				4251	xmlFree(oldname);
				4252	ctxt->instate = XML_PARSER_CONTENT;
				4253	#ifdef DEBUG_PUSH
				4254	xmlGenericError(xmlGenericErrorContext,
				4255	"HPP: entering CONTENT\n");
				4256	#endif
				4257	break;
				4258	}
				4259
				4260	if (CUR == '>') {
				4261	NEXT;
				4262	} else {
				4263	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4264	ctxt->sax->error(ctxt->userData,
				4265	"Couldn't find end of Start Tag %s\n",
				4266	name);
				4267	ctxt->wellFormed = 0;
				4268
				4269	/*
				4270	* end of parsing of this node.
				4271	*/
				4272	if (xmlStrEqual(name, ctxt->name)) {
				4273	nodePop(ctxt);
				4274	oldname = htmlnamePop(ctxt);
				4275	#ifdef DEBUG
				4276	xmlGenericError(xmlGenericErrorContext,
				4277	"End of start tag problem: popping out %s\n", oldname);
				4278	#endif
				4279	if (oldname != NULL)
				4280	xmlFree(oldname);
				4281	}
				4282
				4283	ctxt->instate = XML_PARSER_CONTENT;
				4284	#ifdef DEBUG_PUSH
				4285	xmlGenericError(xmlGenericErrorContext,
				4286	"HPP: entering CONTENT\n");
				4287	#endif
				4288	break;
				4289	}
				4290
				4291	/*
				4292	* Check for an Empty Element from DTD definition
				4293	*/
				4294	if ((info != NULL) && (info->empty)) {
				4295	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4296	ctxt->sax->endElement(ctxt->userData, name);
				4297	oldname = htmlnamePop(ctxt);
				4298	#ifdef DEBUG
				4299	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4300	#endif
				4301	if (oldname != NULL)
				4302	xmlFree(oldname);
				4303	}
				4304	ctxt->instate = XML_PARSER_CONTENT;
				4305	#ifdef DEBUG_PUSH
				4306	xmlGenericError(xmlGenericErrorContext,
				4307	"HPP: entering CONTENT\n");
				4308	#endif
				4309	break;
				4310	}
				4311	case XML_PARSER_CONTENT: {
				4312	long cons;
				4313	/*
				4314	* Handle preparsed entities and charRef
				4315	*/
				4316	if (ctxt->token != 0) {
				4317	xmlChar chr[2] = { 0 , 0 } ;
				4318
				4319	chr[0] = (xmlChar) ctxt->token;
				4320	htmlCheckParagraph(ctxt);
				4321	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4322	ctxt->sax->characters(ctxt->userData, chr, 1);
				4323	ctxt->token = 0;
				4324	ctxt->checkIndex = 0;
				4325	}
				4326	if ((avail == 1) && (terminate)) {
				4327	cur = in->cur[0];
				4328	if ((cur != '<') && (cur != '&')) {
				4329	if (ctxt->sax != NULL) {
				4330	if (IS_BLANK(cur)) {
				4331	if (ctxt->sax->ignorableWhitespace != NULL)
				4332	ctxt->sax->ignorableWhitespace(
				4333	ctxt->userData, &cur, 1);
				4334	} else {
				4335	htmlCheckParagraph(ctxt);
				4336	if (ctxt->sax->characters != NULL)
				4337	ctxt->sax->characters(
				4338	ctxt->userData, &cur, 1);
				4339	}
				4340	}
				4341	ctxt->token = 0;
				4342	ctxt->checkIndex = 0;
				4343	NEXT;
				4344	}
				4345	break;
				4346	}
				4347	if (avail < 2)
				4348	goto done;
				4349	cur = in->cur[0];
				4350	next = in->cur[1];
				4351	cons = ctxt->nbChars;
				4352	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4353	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4354	/*
				4355	* Handle SCRIPT/STYLE separately
				4356	*/
				4357	if ((!terminate) &&
				4358	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4359	goto done;
				4360	htmlParseScript(ctxt);
				4361	if ((cur == '<') && (next == '/')) {
				4362	ctxt->instate = XML_PARSER_END_TAG;
				4363	ctxt->checkIndex = 0;
				4364	#ifdef DEBUG_PUSH
				4365	xmlGenericError(xmlGenericErrorContext,
				4366	"HPP: entering END_TAG\n");
				4367	#endif
				4368	break;
				4369	}
				4370	} else {
				4371	/*
				4372	* Sometimes DOCTYPE arrives in the middle of the document
				4373	*/
				4374	if ((cur == '<') && (next == '!') &&
				4375	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4376	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4377	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4378	(UPP(8) == 'E')) {
				4379	if ((!terminate) &&
				4380	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4381	goto done;
				4382	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4383	ctxt->sax->error(ctxt->userData,
				4384	"Misplaced DOCTYPE declaration\n");
				4385	ctxt->wellFormed = 0;
				4386	htmlParseDocTypeDecl(ctxt);
				4387	} else if ((cur == '<') && (next == '!') &&
				4388	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4389	if ((!terminate) &&
				4390	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4391	goto done;
				4392	#ifdef DEBUG_PUSH
				4393	xmlGenericError(xmlGenericErrorContext,
				4394	"HPP: Parsing Comment\n");
				4395	#endif
				4396	htmlParseComment(ctxt);
				4397	ctxt->instate = XML_PARSER_CONTENT;
				4398	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4399	goto done;
				4400	} else if ((cur == '<') && (next == '/')) {
				4401	ctxt->instate = XML_PARSER_END_TAG;
				4402	ctxt->checkIndex = 0;
				4403	#ifdef DEBUG_PUSH
				4404	xmlGenericError(xmlGenericErrorContext,
				4405	"HPP: entering END_TAG\n");
				4406	#endif
				4407	break;
				4408	} else if (cur == '<') {
				4409	ctxt->instate = XML_PARSER_START_TAG;
				4410	ctxt->checkIndex = 0;
				4411	#ifdef DEBUG_PUSH
				4412	xmlGenericError(xmlGenericErrorContext,
				4413	"HPP: entering START_TAG\n");
				4414	#endif
				4415	break;
				4416	} else if (cur == '&') {
				4417	if ((!terminate) &&
				4418	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4419	goto done;
				4420	#ifdef DEBUG_PUSH
				4421	xmlGenericError(xmlGenericErrorContext,
				4422	"HPP: Parsing Reference\n");
				4423	#endif
				4424	/* TODO: check generation of subtrees if noent !!! */
				4425	htmlParseReference(ctxt);
				4426	} else {
				4427	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4428	/*
				4429	* Goal of the following test is :
				4430	* - minimize calls to the SAX 'character' callback
				4431	* when they are mergeable
				4432	*/
				4433	if ((ctxt->inputNr == 1) &&
				4434	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4435	if ((!terminate) &&
				4436	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4437	goto done;
				4438	}
				4439	ctxt->checkIndex = 0;
				4440	#ifdef DEBUG_PUSH
				4441	xmlGenericError(xmlGenericErrorContext,
				4442	"HPP: Parsing char data\n");
				4443	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4444	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4445	}
				4446	}
				4447	if (cons == ctxt->nbChars) {
				4448	if (ctxt->node != NULL) {
				4449	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4450	ctxt->sax->error(ctxt->userData,
				4451	"detected an error in element content\n");
				4452	ctxt->wellFormed = 0;
				4453	}
				4454	NEXT;
				4455	break;
				4456	}
				4457
				4458	break;
				4459	}
				4460	case XML_PARSER_END_TAG:
				4461	if (avail < 2)
				4462	goto done;
				4463	if ((!terminate) &&
				4464	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4465	goto done;
				4466	htmlParseEndTag(ctxt);
				4467	if (ctxt->nameNr == 0) {
				4468	ctxt->instate = XML_PARSER_EPILOG;
				4469	} else {
				4470	ctxt->instate = XML_PARSER_CONTENT;
				4471	}
				4472	ctxt->checkIndex = 0;
				4473	#ifdef DEBUG_PUSH
				4474	xmlGenericError(xmlGenericErrorContext,
				4475	"HPP: entering CONTENT\n");
				4476	#endif
				4477	break;
				4478	case XML_PARSER_CDATA_SECTION:
				4479	xmlGenericError(xmlGenericErrorContext,
				4480	"HPP: internal error, state == CDATA\n");
				4481	ctxt->instate = XML_PARSER_CONTENT;
				4482	ctxt->checkIndex = 0;
				4483	#ifdef DEBUG_PUSH
				4484	xmlGenericError(xmlGenericErrorContext,
				4485	"HPP: entering CONTENT\n");
				4486	#endif
				4487	break;
				4488	case XML_PARSER_DTD:
				4489	xmlGenericError(xmlGenericErrorContext,
				4490	"HPP: internal error, state == DTD\n");
				4491	ctxt->instate = XML_PARSER_CONTENT;
				4492	ctxt->checkIndex = 0;
				4493	#ifdef DEBUG_PUSH
				4494	xmlGenericError(xmlGenericErrorContext,
				4495	"HPP: entering CONTENT\n");
				4496	#endif
				4497	break;
				4498	case XML_PARSER_COMMENT:
				4499	xmlGenericError(xmlGenericErrorContext,
				4500	"HPP: internal error, state == COMMENT\n");
				4501	ctxt->instate = XML_PARSER_CONTENT;
				4502	ctxt->checkIndex = 0;
				4503	#ifdef DEBUG_PUSH
				4504	xmlGenericError(xmlGenericErrorContext,
				4505	"HPP: entering CONTENT\n");
				4506	#endif
				4507	break;
				4508	case XML_PARSER_PI:
				4509	xmlGenericError(xmlGenericErrorContext,
				4510	"HPP: internal error, state == PI\n");
				4511	ctxt->instate = XML_PARSER_CONTENT;
				4512	ctxt->checkIndex = 0;
				4513	#ifdef DEBUG_PUSH
				4514	xmlGenericError(xmlGenericErrorContext,
				4515	"HPP: entering CONTENT\n");
				4516	#endif
				4517	break;
				4518	case XML_PARSER_ENTITY_DECL:
				4519	xmlGenericError(xmlGenericErrorContext,
				4520	"HPP: internal error, state == ENTITY_DECL\n");
				4521	ctxt->instate = XML_PARSER_CONTENT;
				4522	ctxt->checkIndex = 0;
				4523	#ifdef DEBUG_PUSH
				4524	xmlGenericError(xmlGenericErrorContext,
				4525	"HPP: entering CONTENT\n");
				4526	#endif
				4527	break;
				4528	case XML_PARSER_ENTITY_VALUE:
				4529	xmlGenericError(xmlGenericErrorContext,
				4530	"HPP: internal error, state == ENTITY_VALUE\n");
				4531	ctxt->instate = XML_PARSER_CONTENT;
				4532	ctxt->checkIndex = 0;
				4533	#ifdef DEBUG_PUSH
				4534	xmlGenericError(xmlGenericErrorContext,
				4535	"HPP: entering DTD\n");
				4536	#endif
				4537	break;
				4538	case XML_PARSER_ATTRIBUTE_VALUE:
				4539	xmlGenericError(xmlGenericErrorContext,
				4540	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4541	ctxt->instate = XML_PARSER_START_TAG;
				4542	ctxt->checkIndex = 0;
				4543	#ifdef DEBUG_PUSH
				4544	xmlGenericError(xmlGenericErrorContext,
				4545	"HPP: entering START_TAG\n");
				4546	#endif
				4547	break;
				4548	case XML_PARSER_SYSTEM_LITERAL:
				4549	xmlGenericError(xmlGenericErrorContext,
				4550	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4551	ctxt->instate = XML_PARSER_CONTENT;
				4552	ctxt->checkIndex = 0;
				4553	#ifdef DEBUG_PUSH
				4554	xmlGenericError(xmlGenericErrorContext,
				4555	"HPP: entering CONTENT\n");
				4556	#endif
				4557	break;
				4558	case XML_PARSER_IGNORE:
				4559	xmlGenericError(xmlGenericErrorContext,
				4560	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4561	ctxt->instate = XML_PARSER_CONTENT;
				4562	ctxt->checkIndex = 0;
				4563	#ifdef DEBUG_PUSH
				4564	xmlGenericError(xmlGenericErrorContext,
				4565	"HPP: entering CONTENT\n");
				4566	#endif
				4567	break;
				4568	}
				4569	}
				4570	done:
				4571	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4572	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4573	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4574	/*
				4575	* SAX: end of the document processing.
				4576	*/
				4577	ctxt->instate = XML_PARSER_EOF;
				4578	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4579	ctxt->sax->endDocument(ctxt->userData);
				4580	}
				4581	}
				4582	if ((ctxt->myDoc != NULL) &&
				4583	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4584	(ctxt->instate == XML_PARSER_EPILOG))) {
				4585	xmlDtdPtr dtd;
				4586	dtd = xmlGetIntSubset(ctxt->myDoc);
				4587	if (dtd == NULL)
				4588	ctxt->myDoc->intSubset =
				4589	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4590	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4591	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4592	}
				4593	#ifdef DEBUG_PUSH
				4594	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4595	#endif
				4596	return(ret);
				4597	}
				4598
				4599	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4600	* htmlParseChunk:
				4601	* @ctxt: an XML parser context
				4602	* @chunk: an char array
				4603	* @size: the size in byte of the chunk
				4604	* @terminate: last chunk indicator
				4605	*
				4606	* Parse a Chunk of memory
				4607	*
				4608	* Returns zero if no error, the xmlParserErrors otherwise.
				4609	*/
				4610	int
				4611	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4612	int terminate) {
				4613	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4614	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4615	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4616	int cur = ctxt->input->cur - ctxt->input->base;
				4617
				4618	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4619	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4620	ctxt->input->cur = ctxt->input->base + cur;
				4621	#ifdef DEBUG_PUSH
				4622	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4623	#endif
				4624
				4625	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4626	htmlParseTryOrFinish(ctxt, terminate);
				4627	} else if (ctxt->instate != XML_PARSER_EOF) {
				4628	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4629	htmlParseTryOrFinish(ctxt, terminate);
				4630	}
				4631	if (terminate) {
				4632	if ((ctxt->instate != XML_PARSER_EOF) &&
				4633	(ctxt->instate != XML_PARSER_EPILOG) &&
				4634	(ctxt->instate != XML_PARSER_MISC)) {
				4635	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4636	ctxt->wellFormed = 0;
				4637	}
				4638	if (ctxt->instate != XML_PARSER_EOF) {
				4639	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4640	ctxt->sax->endDocument(ctxt->userData);
				4641	}
				4642	ctxt->instate = XML_PARSER_EOF;
				4643	}
				4644	return((xmlParserErrors) ctxt->errNo);
				4645	}
				4646
/************************************************************************
 *									*
 *			User entry points				*
 *									*
 ************************************************************************/
				4652
				4653	/**
				4654	* htmlCreatePushParserCtxt :
				4655	* @sax: a SAX handler
				4656	* @user_data: The user data returned on SAX callbacks
				4657	* @chunk: a pointer to an array of chars
				4658	* @size: number of chars in the array
				4659	* @filename: an optional file name or URI
				4660	* @enc: an optional encoding
				4661	*
				4662	* Create a parser context for using the HTML parser in push mode
				4663	* To allow content encoding detection, @size should be >= 4
				4664	* The value of @filename is used for fetching external entities
				4665	* and error/warning reports.
				4666	*
				4667	* Returns the new parser context or NULL
				4668	*/
				4669	htmlParserCtxtPtr
				4670	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4671	const char chunk, int size, const char filename,
				4672	xmlCharEncoding enc) {
				4673	htmlParserCtxtPtr ctxt;
				4674	htmlParserInputPtr inputStream;
				4675	xmlParserInputBufferPtr buf;
				4676
				4677	buf = xmlAllocParserInputBuffer(enc);
				4678	if (buf == NULL) return(NULL);
				4679
				4680	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4681	if (ctxt == NULL) {
				4682	xmlFree(buf);
				4683	return(NULL);
				4684	}
				4685	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4686	htmlInitParserCtxt(ctxt);
				4687	if (sax != NULL) {
				4688	if (ctxt->sax != &htmlDefaultSAXHandler)
				4689	xmlFree(ctxt->sax);
				4690	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4691	if (ctxt->sax == NULL) {
				4692	xmlFree(buf);
				4693	xmlFree(ctxt);
				4694	return(NULL);
				4695	}
				4696	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4697	if (user_data != NULL)
				4698	ctxt->userData = user_data;
				4699	}
				4700	if (filename == NULL) {
				4701	ctxt->directory = NULL;
				4702	} else {
				4703	ctxt->directory = xmlParserGetDirectory(filename);
				4704	}
				4705
				4706	inputStream = htmlNewInputStream(ctxt);
				4707	if (inputStream == NULL) {
				4708	xmlFreeParserCtxt(ctxt);
				4709	return(NULL);
				4710	}
				4711
				4712	if (filename == NULL)
				4713	inputStream->filename = NULL;
				4714	else
				4715	inputStream->filename = xmlMemStrdup(filename);
				4716	inputStream->buf = buf;
				4717	inputStream->base = inputStream->buf->buffer->content;
				4718	inputStream->cur = inputStream->buf->buffer->content;
				4719
				4720	inputPush(ctxt, inputStream);
				4721
				4722	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4723	(ctxt->input->buf != NULL)) {
				4724	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4725	#ifdef DEBUG_PUSH
				4726	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4727	#endif
				4728	}
				4729
				4730	return(ctxt);
				4731	}
				4732
				4733	/**
				4734	* htmlSAXParseDoc :
				4735	* @cur: a pointer to an array of xmlChar
				4736	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4737	* @sax: the SAX handler block
				4738	* @userData: if using SAX, this pointer will be provided on callbacks.
				4739	*
				4740	* parse an HTML in-memory document and build a tree.
				4741	* It use the given SAX function block to handle the parsing callback.
				4742	* If sax is NULL, fallback to the default DOM tree building routines.
				4743	*
				4744	* Returns the resulting document tree
				4745	*/
				4746
				4747	htmlDocPtr
				4748	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4749	htmlDocPtr ret;
				4750	htmlParserCtxtPtr ctxt;
				4751
				4752	if (cur == NULL) return(NULL);
				4753
				4754
				4755	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4756	if (ctxt == NULL) return(NULL);
				4757	if (sax != NULL) {
				4758	ctxt->sax = sax;
				4759	ctxt->userData = userData;
				4760	}
				4761
				4762	htmlParseDocument(ctxt);
				4763	ret = ctxt->myDoc;
				4764	if (sax != NULL) {
				4765	ctxt->sax = NULL;
				4766	ctxt->userData = NULL;
				4767	}
				4768	htmlFreeParserCtxt(ctxt);
				4769
				4770	return(ret);
				4771	}
				4772
				4773	/**
				4774	* htmlParseDoc :
				4775	* @cur: a pointer to an array of xmlChar
				4776	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4777	*
				4778	* parse an HTML in-memory document and build a tree.
				4779	*
				4780	* Returns the resulting document tree
				4781	*/
				4782
				4783	htmlDocPtr
				4784	htmlParseDoc(xmlChar cur, const char encoding) {
				4785	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4786	}
				4787
				4788
				4789	/**
				4790	* htmlCreateFileParserCtxt :
				4791	* @filename: the filename
				4792	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4793	*
				4794	* Create a parser context for a file content.
				4795	* Automatic support for ZLIB/Compress compressed document is provided
				4796	* by default if found at compile-time.
				4797	*
				4798	* Returns the new parser context or NULL
				4799	*/
				4800	htmlParserCtxtPtr
				4801	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4802	{
				4803	htmlParserCtxtPtr ctxt;
				4804	htmlParserInputPtr inputStream;
				4805	xmlParserInputBufferPtr buf;
				4806	/* htmlCharEncoding enc; */
				4807	xmlChar content, content_line = (xmlChar *) "charset=";
				4808
				4809	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4810	if (buf == NULL) return(NULL);
				4811
				4812	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4813	if (ctxt == NULL) {
				4814	perror("malloc");
				4815	return(NULL);
				4816	}
				4817	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4818	htmlInitParserCtxt(ctxt);
				4819	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4820	if (inputStream == NULL) {
				4821	perror("malloc");
				4822	xmlFree(ctxt);
				4823	return(NULL);
				4824	}
				4825	memset(inputStream, 0, sizeof(htmlParserInput));
				4826
				4827	inputStream->filename = xmlMemStrdup(filename);
				4828	inputStream->line = 1;
				4829	inputStream->col = 1;
				4830	inputStream->buf = buf;
				4831	inputStream->directory = NULL;
				4832
				4833	inputStream->base = inputStream->buf->buffer->content;
				4834	inputStream->cur = inputStream->buf->buffer->content;
				4835	inputStream->free = NULL;
				4836
				4837	inputPush(ctxt, inputStream);
				4838
				4839	/* set encoding */
				4840	if (encoding) {
				4841	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4842	if (content) {
				4843	strcpy ((char )content, (char )content_line);
				4844	strcat ((char )content, (char )encoding);
				4845	htmlCheckEncoding (ctxt, content);
				4846	xmlFree (content);
				4847	}
				4848	}
				4849
				4850	return(ctxt);
				4851	}
				4852
				4853	/**
				4854	* htmlSAXParseFile :
				4855	* @filename: the filename
				4856	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4857	* @sax: the SAX handler block
				4858	* @userData: if using SAX, this pointer will be provided on callbacks.
				4859	*
				4860	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4861	* compressed document is provided by default if found at compile-time.
				4862	* It use the given SAX function block to handle the parsing callback.
				4863	* If sax is NULL, fallback to the default DOM tree building routines.
				4864	*
				4865	* Returns the resulting document tree
				4866	*/
				4867
				4868	htmlDocPtr
				4869	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				4870	void *userData) {
				4871	htmlDocPtr ret;
				4872	htmlParserCtxtPtr ctxt;
				4873	htmlSAXHandlerPtr oldsax = NULL;
				4874
				4875	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				4876	if (ctxt == NULL) return(NULL);
				4877	if (sax != NULL) {
				4878	oldsax = ctxt->sax;
				4879	ctxt->sax = sax;
				4880	ctxt->userData = userData;
				4881	}
				4882
				4883	htmlParseDocument(ctxt);
				4884
				4885	ret = ctxt->myDoc;
				4886	if (sax != NULL) {
				4887	ctxt->sax = oldsax;
				4888	ctxt->userData = NULL;
				4889	}
				4890	htmlFreeParserCtxt(ctxt);
				4891
				4892	return(ret);
				4893	}
				4894
				4895	/**
				4896	* htmlParseFile :
				4897	* @filename: the filename
				4898	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4899	*
				4900	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4901	* compressed document is provided by default if found at compile-time.
				4902	*
				4903	* Returns the resulting document tree
				4904	*/
				4905
				4906	htmlDocPtr
				4907	htmlParseFile(const char filename, const char encoding) {
				4908	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				4909	}
				4910
				4911	/**
				4912	* htmlHandleOmittedElem:
				4913	* @val: int 0 or 1
				4914	*
				4915	* Set and return the previous value for handling HTML omitted tags.
				4916	*
				4917	* Returns the last value for 0 for no handling, 1 for auto insertion.
				4918	*/
				4919
				4920	int
				4921	htmlHandleOmittedElem(int val) {
				4922	int old = htmlOmittedDefaultValue;
				4923
				4924	htmlOmittedDefaultValue = val;
				4925	return(old);
				4926	}
				4927
				4928	#endif /* LIBXML_HTML_ENABLED */