Blame - HTMLparser.c - platform/external/libxml2

blob: 186ab09bb183180549dc4be1359f13200cce17a1 [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
Daniel Veillard	c5d6434	2001-06-24 12:13:24 +0000	[diff] [blame]	6	* daniel@veillard.com
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	7	*/
				8
Daniel Veillard	34ce8be	2002-03-18 19:37:11 +0000	[diff] [blame]	9	#define IN_LIBXML
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	10	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	11	#ifdef LIBXML_HTML_ENABLED
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	12
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	13	#include <string.h>
				14	#ifdef HAVE_CTYPE_H
				15	#include <ctype.h>
				16	#endif
				17	#ifdef HAVE_STDLIB_H
				18	#include <stdlib.h>
				19	#endif
				20	#ifdef HAVE_SYS_STAT_H
				21	#include <sys/stat.h>
				22	#endif
				23	#ifdef HAVE_FCNTL_H
				24	#include <fcntl.h>
				25	#endif
				26	#ifdef HAVE_UNISTD_H
				27	#include <unistd.h>
				28	#endif
				29	#ifdef HAVE_ZLIB_H
				30	#include <zlib.h>
				31	#endif
				32
				33	#include <libxml/xmlmemory.h>
				34	#include <libxml/tree.h>
				35	#include <libxml/parser.h>
				36	#include <libxml/parserInternals.h>
				37	#include <libxml/xmlerror.h>
				38	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	39	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	40	#include <libxml/entities.h>
				41	#include <libxml/encoding.h>
				42	#include <libxml/valid.h>
				43	#include <libxml/xmlIO.h>
Daniel Veillard	3c01b1d	2001-10-17 15:58:35 +0000	[diff] [blame]	44	#include <libxml/globals.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	45
				46	#define HTML_MAX_NAMELEN 1000
				47	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				48	#define HTML_PARSER_BUFFER_SIZE 100
				49
				50	/* #define DEBUG */
				51	/* #define DEBUG_PUSH */
				52
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	53	static int htmlOmittedDefaultValue = 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	54
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	55	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				56	xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillard	c1f7834	2001-11-10 11:43:05 +0000	[diff] [blame]	57	static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	58
				59	/************************************************************************
				60	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	61	* Parser stacks related functions and macros *
				62	* *
				63	************************************************************************/
				64
Daniel Veillard	1c732d2	2002-11-30 11:22:59 +0000	[diff] [blame]	65	/**
				66	* htmlnamePush:
				67	* @ctxt: an HTML parser context
				68	* @value: the element name
				69	*
				70	* Pushes a new element name on top of the name stack
				71	*
				72	* Returns 0 in case of error, the index in the stack otherwise
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	73	*/
Daniel Veillard	1c732d2	2002-11-30 11:22:59 +0000	[diff] [blame]	74	static int
				75	htmlnamePush(htmlParserCtxtPtr ctxt, xmlChar * value)
				76	{
				77	if (ctxt->nameNr >= ctxt->nameMax) {
				78	ctxt->nameMax *= 2;
				79	ctxt->nameTab =
				80	(xmlChar * *)xmlRealloc(ctxt->nameTab,
				81	ctxt->nameMax *
				82	sizeof(ctxt->nameTab[0]));
				83	if (ctxt->nameTab == NULL) {
				84	xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
				85	return (0);
				86	}
				87	}
				88	ctxt->nameTab[ctxt->nameNr] = value;
				89	ctxt->name = value;
				90	return (ctxt->nameNr++);
				91	}
				92	/**
				93	* htmlnamePop:
				94	* @ctxt: an HTML parser context
				95	*
				96	* Pops the top element name from the name stack
				97	*
				98	* Returns the name just removed
				99	*/
				100	static xmlChar *
				101	htmlnamePop(htmlParserCtxtPtr ctxt)
				102	{
				103	xmlChar *ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	104
Daniel Veillard	1c732d2	2002-11-30 11:22:59 +0000	[diff] [blame]	105	if (ctxt->nameNr <= 0)
				106	return (0);
				107	ctxt->nameNr--;
				108	if (ctxt->nameNr < 0)
				109	return (0);
				110	if (ctxt->nameNr > 0)
				111	ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
				112	else
				113	ctxt->name = NULL;
				114	ret = ctxt->nameTab[ctxt->nameNr];
				115	ctxt->nameTab[ctxt->nameNr] = 0;
				116	return (ret);
				117	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	118
				119	/*
				120	* Macros for accessing the content. Those should be used only by the parser,
				121	* and not exported.
				122	*
				123	* Dirty macros, i.e. one need to make assumption on the context to use them
				124	*
				125	* CUR_PTR return the current pointer to the xmlChar to be parsed.
				126	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
				127	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				128	* in UNICODE mode. This should be used internally by the parser
				129	* only to compare to ASCII values otherwise it would break when
				130	* running with UTF-8 encoding.
				131	* NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only
				132	* to compare on ASCII based substring.
				133	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
				134	* it should be used only to compare on ASCII based substring.
				135	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				136	* strings within the parser.
				137	*
				138	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				139	*
				140	* CURRENT Returns the current char value, with the full decoding of
				141	* UTF-8 if we are using this mode. It returns an int.
				142	* NEXT Skip to the next character, this does the proper decoding
				143	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
				144	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				145	*/
				146
				147	#define UPPER (toupper(*ctxt->input->cur))
				148
				149	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				150
				151	#define NXT(val) ctxt->input->cur[(val)]
				152
				153	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				154
				155	#define CUR_PTR ctxt->input->cur
				156
				157	#define SHRINK xmlParserInputShrink(ctxt->input)
				158
				159	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				160
				161	#define CURRENT ((int) (*ctxt->input->cur))
				162
				163	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				164
				165	/* Inported from XML */
				166
Daniel Veillard	561b7f8	2002-03-20 21:55:57 +0000	[diff] [blame]	167	/* #define CUR (ctxt->token ? ctxt->token : (int) (ctxt->input->cur)) /
				168	#define CUR ((int) (*ctxt->input->cur))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	169	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				170
Daniel Veillard	561b7f8	2002-03-20 21:55:57 +0000	[diff] [blame]	171	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	172	#define NXT(val) ctxt->input->cur[(val)]
				173	#define CUR_PTR ctxt->input->cur
				174
				175
				176	#define NEXTL(l) do { \
				177	if (*(ctxt->input->cur) == '\n') { \
				178	ctxt->input->line++; ctxt->input->col = 1; \
				179	} else ctxt->input->col++; \
				180	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				181	} while (0)
				182
				183	/************
				184	\
				185	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				186	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				187	************/
				188
				189	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				190	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				191
				192	#define COPY_BUF(l,b,i,v) \
				193	if (l == 1) b[i++] = (xmlChar) v; \
				194	else i += xmlCopyChar(l,&b[i],v)
				195
				196	/**
				197	* htmlCurrentChar:
				198	* @ctxt: the HTML parser context
				199	* @len: pointer to the length of the char read
				200	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	201	* The current char value, if using UTF-8 this may actually span multiple
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	202	* bytes in the input buffer. Implement the end of line normalization:
				203	* 2.11 End-of-Line Handling
				204	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				205	* char, then the encoding converter is plugged in automatically.
				206	*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	207	* Returns the current char value and its length
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	208	*/
				209
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	210	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	211	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				212	if (ctxt->instate == XML_PARSER_EOF)
				213	return(0);
				214
				215	if (ctxt->token != 0) {
				216	*len = 0;
				217	return(ctxt->token);
				218	}
				219	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				220	/*
				221	* We are supposed to handle UTF8, check it's valid
				222	* From rfc2044: encoding of the Unicode values on UTF-8:
				223	*
				224	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				225	* 0000 0000-0000 007F 0xxxxxxx
				226	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				227	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				228	*
				229	* Check for the 0x110000 limit too
				230	*/
				231	const unsigned char *cur = ctxt->input->cur;
				232	unsigned char c;
				233	unsigned int val;
				234
				235	c = *cur;
				236	if (c & 0x80) {
				237	if (cur[1] == 0)
				238	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				239	if ((cur[1] & 0xc0) != 0x80)
				240	goto encoding_error;
				241	if ((c & 0xe0) == 0xe0) {
				242
				243	if (cur[2] == 0)
				244	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				245	if ((cur[2] & 0xc0) != 0x80)
				246	goto encoding_error;
				247	if ((c & 0xf0) == 0xf0) {
				248	if (cur[3] == 0)
				249	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				250	if (((c & 0xf8) != 0xf0) \|\|
				251	((cur[3] & 0xc0) != 0x80))
				252	goto encoding_error;
				253	/* 4-byte code */
				254	*len = 4;
				255	val = (cur[0] & 0x7) << 18;
				256	val \|= (cur[1] & 0x3f) << 12;
				257	val \|= (cur[2] & 0x3f) << 6;
				258	val \|= cur[3] & 0x3f;
				259	} else {
				260	/* 3-byte code */
				261	*len = 3;
				262	val = (cur[0] & 0xf) << 12;
				263	val \|= (cur[1] & 0x3f) << 6;
				264	val \|= cur[2] & 0x3f;
				265	}
				266	} else {
				267	/* 2-byte code */
				268	*len = 2;
				269	val = (cur[0] & 0x1f) << 6;
				270	val \|= cur[1] & 0x3f;
				271	}
				272	if (!IS_CHAR(val)) {
				273	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				274	if ((ctxt->sax != NULL) &&
				275	(ctxt->sax->error != NULL))
				276	ctxt->sax->error(ctxt->userData,
				277	"Char 0x%X out of allowed range\n", val);
				278	ctxt->wellFormed = 0;
Daniel Veillard	dad3f68	2002-11-17 16:47:27 +0000	[diff] [blame]	279	if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	280	}
				281	return(val);
				282	} else {
				283	/* 1-byte code */
				284	*len = 1;
				285	return((int) *ctxt->input->cur);
				286	}
				287	}
				288	/*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	289	* Assume it's a fixed length encoding (1) with
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	290	* a compatible encoding for the ASCII set, since
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	291	* XML constructs only use < 128 chars
				292	*/
				293	*len = 1;
				294	if ((int) *ctxt->input->cur < 0x80)
				295	return((int) *ctxt->input->cur);
				296
				297	/*
				298	* Humm this is bad, do an automatic flow conversion
				299	*/
				300	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				301	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				302	return(xmlCurrentChar(ctxt, len));
				303
				304	encoding_error:
				305	/*
				306	* If we detect an UTF8 error that probably mean that the
				307	* input encoding didn't get properly advertized in the
				308	* declaration header. Report the error and switch the encoding
				309	* to ISO-Latin-1 (if you don't like this policy, just declare the
				310	* encoding !)
				311	*/
				312	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				313	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				314	ctxt->sax->error(ctxt->userData,
				315	"Input is not proper UTF-8, indicate encoding !\n");
				316	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				317	ctxt->input->cur[0], ctxt->input->cur[1],
				318	ctxt->input->cur[2], ctxt->input->cur[3]);
				319	}
				320
				321	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				322	*len = 1;
				323	return((int) *ctxt->input->cur);
				324	}
				325
				326	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	327	* htmlSkipBlankChars:
				328	* @ctxt: the HTML parser context
				329	*
				330	* skip all blanks character found at that point in the input streams.
				331	*
				332	* Returns the number of space chars skipped
				333	*/
				334
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	335	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	336	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				337	int res = 0;
				338
				339	while (IS_BLANK(*(ctxt->input->cur))) {
				340	if ((*ctxt->input->cur == 0) &&
				341	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				342	xmlPopInput(ctxt);
				343	} else {
				344	if (*(ctxt->input->cur) == '\n') {
				345	ctxt->input->line++; ctxt->input->col = 1;
				346	} else ctxt->input->col++;
				347	ctxt->input->cur++;
				348	ctxt->nbChars++;
				349	if (*ctxt->input->cur == 0)
				350	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				351	}
				352	res++;
				353	}
				354	return(res);
				355	}
				356
				357
				358
				359	/************************************************************************
				360	* *
				361	* The list of HTML elements and their properties *
				362	* *
				363	************************************************************************/
				364
				365	/*
				366	* Start Tag: 1 means the start tag can be ommited
				367	* End Tag: 1 means the end tag can be ommited
				368	* 2 means it's forbidden (empty elements)
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	369	* 3 means the tag is stylistic and should be closed easily
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	370	* Depr: this element is deprecated
				371	* DTD: 1 means that this element is valid only in the Loose DTD
				372	* 2 means that this element is valid only in the Frameset DTD
				373	*
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	374	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	375	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	376	static const htmlElemDesc
				377	html40ElementTable[] = {
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	378	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
				379	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
				380	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
				381	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
				382	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
				383	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
				384	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
				385	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
				386	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
				387	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
				388	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
				389	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
				390	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
				391	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
				392	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
				393	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
				394	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
				395	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
				396	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
				397	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
				398	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
				399	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
				400	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
				401	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
				402	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
				403	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
				404	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
				405	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
				406	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
				407	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
				408	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
				409	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
				410	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
				411	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
				412	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
				413	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
				414	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
				415	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
				416	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
				417	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
				418	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
				419	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
				420	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
				421	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
				422	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
				423	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
				424	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
				425	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
				426	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
				427	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
				428	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
				429	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
				430	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
				431	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
				432	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
				433	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
				434	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
				435	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
				436	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
				437	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
				438	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
				439	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
				440	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
Daniel Veillard	fee408f	2002-11-22 13:18:30 +0000	[diff] [blame]	441	{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph " },
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	442	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
				443	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
				444	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
				445	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
				446	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
				447	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
				448	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
				449	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
				450	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
				451	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
				452	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
				453	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
				454	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
				455	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
				456	{ "table", 0, 0, 0, 0, 0, 0, 0, " " },
				457	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
				458	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
				459	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
				460	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
				461	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
				462	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
				463	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
				464	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
				465	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
				466	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
				467	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
				468	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	469	};
				470
				471	/*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	472	* start tags that imply the end of current element
				473	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	474	static const char *htmlStartClose[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	475	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				476	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				477	"listing", "xmp", "head", NULL,
				478	"head", "p", NULL,
				479	"title", "p", NULL,
				480	"body", "head", "style", "link", "title", "p", NULL,
				481	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				482	"pre", "listing", "xmp", "head", "li", NULL,
				483	"hr", "p", "head", NULL,
				484	"h1", "p", "head", NULL,
				485	"h2", "p", "head", NULL,
				486	"h3", "p", "head", NULL,
				487	"h4", "p", "head", NULL,
				488	"h5", "p", "head", NULL,
				489	"h6", "p", "head", NULL,
				490	"dir", "p", "head", NULL,
				491	"address", "p", "head", "ul", NULL,
				492	"pre", "p", "head", "ul", NULL,
				493	"listing", "p", "head", NULL,
				494	"xmp", "p", "head", NULL,
				495	"blockquote", "p", "head", NULL,
				496	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				497	"xmp", "head", NULL,
				498	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				499	"head", "dd", NULL,
				500	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				501	"head", "dt", NULL,
				502	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				503	"listing", "xmp", NULL,
				504	"ol", "p", "head", "ul", NULL,
				505	"menu", "p", "head", "ul", NULL,
				506	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
				507	"div", "p", "head", NULL,
				508	"noscript", "p", "head", NULL,
				509	"center", "font", "b", "i", "p", "head", NULL,
				510	"a", "a", NULL,
				511	"caption", "p", NULL,
				512	"colgroup", "caption", "colgroup", "col", "p", NULL,
				513	"col", "caption", "col", "p", NULL,
				514	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				515	"listing", "xmp", "a", NULL,
Daniel Veillard	43dadeb	2001-04-24 11:23:35 +0000	[diff] [blame]	516	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				517	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	518	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				519	"thead", "caption", "col", "colgroup", NULL,
				520	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				521	"tbody", "p", NULL,
				522	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				523	"tfoot", "tbody", "p", NULL,
				524	"optgroup", "option", NULL,
				525	"option", "option", NULL,
				526	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				527	"pre", "listing", "xmp", "a", NULL,
				528	NULL
				529	};
				530
				531	/*
				532	* The list of HTML elements which are supposed not to have
				533	* CDATA content and where a p element will be implied
				534	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	535	* TODO: extend that list by reading the HTML SGML DTD on
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	536	* implied paragraph
				537	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	538	static const char *htmlNoContentElements[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	539	"html",
				540	"head",
				541	"body",
				542	NULL
				543	};
				544
				545	/*
				546	* The list of HTML attributes which are of content %Script;
				547	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				548	* it assumes the name starts with 'on'
				549	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	550	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	551	"onclick",
				552	"ondblclick",
				553	"onmousedown",
				554	"onmouseup",
				555	"onmouseover",
				556	"onmousemove",
				557	"onmouseout",
				558	"onkeypress",
				559	"onkeydown",
				560	"onkeyup",
				561	"onload",
				562	"onunload",
				563	"onfocus",
				564	"onblur",
				565	"onsubmit",
				566	"onrest",
				567	"onchange",
				568	"onselect"
				569	};
				570
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

typedef struct {
    const char *name;	/* element name, NULL in the terminating entry */
    int priority;	/* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    /* Terminator: carries the default priority for unlisted elements */
    { NULL,    100 }
};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	598
/*
 * Index of pointers into the htmlStartClose array, filled in by
 * htmlInitAutoClose(); unused slots hold NULL.
 */
static const char **htmlStartCloseIndex[100];
/* Set to 1 once htmlStartCloseIndex has been built. */
static int htmlStartCloseIndexinitialized = 0;
				601
				602	/************************************************************************
				603	* *
				604	* functions to handle HTML specific data *
				605	* *
				606	************************************************************************/
				607
				608	/**
				609	* htmlInitAutoClose:
				610	*
				611	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				612	* This is not reentrant. Call xmlInitParser() once before processing in
				613	* case of use in multithreaded programs.
				614	*/
				615	void
				616	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	617	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	618
				619	if (htmlStartCloseIndexinitialized) return;
				620
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	621	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				622	indx = 0;
				623	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				624	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	625	while (htmlStartClose[i] != NULL) i++;
				626	i++;
				627	}
				628	htmlStartCloseIndexinitialized = 1;
				629	}
				630
				631	/**
				632	* htmlTagLookup:
				633	* @tag: The tag name in lowercase
				634	*
				635	* Lookup the HTML tag in the ElementTable
				636	*
				637	* Returns the related htmlElemDescPtr or NULL if not found.
				638	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	639	const htmlElemDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	640	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	641	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	642
				643	for (i = 0; i < (sizeof(html40ElementTable) /
				644	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	1ed3f88	2001-04-18 09:45:35 +0000	[diff] [blame]	645	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	646	return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	647	}
				648	return(NULL);
				649	}
				650
				651	/**
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	652	* htmlGetEndPriority:
				653	* @name: The name of the element to look up the priority for.
				654	*
				655	* Return value: The "endtag" priority.
				656	**/
				657	static int
				658	htmlGetEndPriority (const xmlChar *name) {
				659	int i = 0;
				660
				661	while ((htmlEndPriority[i].name != NULL) &&
				662	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				663	i++;
				664
				665	return(htmlEndPriority[i].priority);
				666	}
				667
				668	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	669	* htmlCheckAutoClose:
				670	* @newtag: The new tag name
				671	* @oldtag: The old tag name
				672	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	673	* Checks whether the new tag is one of the registered valid tags for
				674	* closing old.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	675	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				676	*
				677	* Returns 0 if no, 1 if yes.
				678	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	679	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	680	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	681	int i, indx;
				682	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	683
				684	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				685
				686	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	687	for (indx = 0; indx < 100;indx++) {
				688	closed = htmlStartCloseIndex[indx];
				689	if (closed == NULL) return(0);
				690	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	691	}
				692
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	693	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	694	i++;
				695	while (htmlStartClose[i] != NULL) {
				696	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				697	return(1);
				698	}
				699	i++;
				700	}
				701	return(0);
				702	}
				703
				704	/**
				705	* htmlAutoCloseOnClose:
				706	* @ctxt: an HTML parser context
				707	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	708	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	709	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	710	* The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	711	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	712	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	713	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	714	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	715	xmlChar *oldname;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	716	int i, priority;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	717
				718	#ifdef DEBUG
				719	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				720	for (i = 0;i < ctxt->nameNr;i++)
				721	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				722	#endif
				723
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	724	priority = htmlGetEndPriority (newtag);
				725
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	726	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	727
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	728	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	729	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	730	* A missplaced endtag can only close elements with lower
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	731	* or equal priority, so if we find an element with higher
				732	* priority before we find an element with
				733	* matching name, we just ignore this endtag
				734	*/
				735	if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	736	}
				737	if (i < 0) return;
				738
				739	while (!xmlStrEqual(newtag, ctxt->name)) {
				740	info = htmlTagLookup(ctxt->name);
				741	if ((info == NULL) \|\| (info->endTag == 1)) {
				742	#ifdef DEBUG
				743	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				744	#endif
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	745	} else if (info->endTag == 3) {
				746	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	747	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack	1633d18	2001-10-05 15:41:19 +0000	[diff] [blame]	748
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	749	#endif
				750	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				751	ctxt->sax->error(ctxt->userData,
				752	"Opening and ending tag mismatch: %s and %s\n",
				753	newtag, ctxt->name);
				754	ctxt->wellFormed = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	755	}
				756	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				757	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				758	oldname = htmlnamePop(ctxt);
				759	if (oldname != NULL) {
				760	#ifdef DEBUG
				761	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				762	#endif
				763	xmlFree(oldname);
				764	}
				765	}
				766	}
				767
				768	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	769	* htmlAutoCloseOnEnd:
				770	* @ctxt: an HTML parser context
				771	*
				772	* Close all remaining tags at the end of the stream
				773	*/
				774	static void
				775	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				776	xmlChar *oldname;
				777	int i;
				778
				779	if (ctxt->nameNr == 0)
				780	return;
				781	#ifdef DEBUG
				782	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				783	#endif
				784
				785	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				786	#ifdef DEBUG
				787	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				788	#endif
				789	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				790	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				791	oldname = htmlnamePop(ctxt);
				792	if (oldname != NULL) {
				793	#ifdef DEBUG
				794	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				795	#endif
				796	xmlFree(oldname);
				797	}
				798	}
				799	}
				800
				801	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	802	* htmlAutoClose:
				803	* @ctxt: an HTML parser context
				804	* @newtag: The new tag name or NULL
				805	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	806	* The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	807	* The list is kept in htmlStartClose array. This function is
				808	* called when a new tag has been detected and generates the
				809	* appropriates closes if possible/needed.
				810	* If newtag is NULL this mean we are at the end of the resource
				811	* and we should check
				812	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	813	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	814	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				815	xmlChar *oldname;
				816	while ((newtag != NULL) && (ctxt->name != NULL) &&
				817	(htmlCheckAutoClose(newtag, ctxt->name))) {
				818	#ifdef DEBUG
				819	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				820	#endif
				821	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				822	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				823	oldname = htmlnamePop(ctxt);
				824	if (oldname != NULL) {
				825	#ifdef DEBUG
				826	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				827	#endif
				828	xmlFree(oldname);
				829	}
				830	}
				831	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	832	htmlAutoCloseOnEnd(ctxt);
				833	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	834	}
				835	while ((newtag == NULL) && (ctxt->name != NULL) &&
				836	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				837	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				838	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				839	#ifdef DEBUG
				840	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				841	#endif
				842	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				843	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				844	oldname = htmlnamePop(ctxt);
				845	if (oldname != NULL) {
				846	#ifdef DEBUG
				847	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				848	#endif
				849	xmlFree(oldname);
				850	}
				851	}
				852
				853	}
				854
				855	/**
				856	* htmlAutoCloseTag:
				857	* @doc: the HTML document
				858	* @name: The tag name
				859	* @elem: the HTML element
				860	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	861	* The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	862	* The list is kept in htmlStartClose array. This function checks
				863	* if the element or one of it's children would autoclose the
				864	* given tag.
				865	*
				866	* Returns 1 if autoclose, 0 otherwise
				867	*/
				868	int
				869	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				870	htmlNodePtr child;
				871
				872	if (elem == NULL) return(1);
				873	if (xmlStrEqual(name, elem->name)) return(0);
				874	if (htmlCheckAutoClose(elem->name, name)) return(1);
				875	child = elem->children;
				876	while (child != NULL) {
				877	if (htmlAutoCloseTag(doc, name, child)) return(1);
				878	child = child->next;
				879	}
				880	return(0);
				881	}
				882
				883	/**
				884	* htmlIsAutoClosed:
				885	* @doc: the HTML document
				886	* @elem: the HTML element
				887	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	888	* The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	889	* The list is kept in htmlStartClose array. This function checks
				890	* if a tag is autoclosed by one of it's child
				891	*
				892	* Returns 1 if autoclosed, 0 otherwise
				893	*/
				894	int
				895	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				896	htmlNodePtr child;
				897
				898	if (elem == NULL) return(1);
				899	child = elem->children;
				900	while (child != NULL) {
				901	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				902	child = child->next;
				903	}
				904	return(0);
				905	}
				906
				907	/**
				908	* htmlCheckImplied:
				909	* @ctxt: an HTML parser context
				910	* @newtag: The new tag name
				911	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	912	* The HTML DTD allows a tag to exists only implicitly
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	913	* called when a new tag has been detected and generates the
				914	* appropriates implicit tags if missing
				915	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	916	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	917	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				918	if (!htmlOmittedDefaultValue)
				919	return;
				920	if (xmlStrEqual(newtag, BAD_CAST"html"))
				921	return;
				922	if (ctxt->nameNr <= 0) {
				923	#ifdef DEBUG
				924	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				925	#endif
				926	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				927	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				928	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				929	}
				930	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				931	return;
				932	if ((ctxt->nameNr <= 1) &&
				933	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				934	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				935	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				936	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				937	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				938	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				939	/*
				940	* dropped OBJECT ... i you put it first BODY will be
				941	* assumed !
				942	*/
				943	#ifdef DEBUG
				944	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				945	#endif
				946	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				947	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				948	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				949	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				950	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				951	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				952	int i;
				953	for (i = 0;i < ctxt->nameNr;i++) {
				954	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				955	return;
				956	}
				957	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				958	return;
				959	}
				960	}
				961
				962	#ifdef DEBUG
				963	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				964	#endif
				965	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				966	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				967	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				968	}
				969	}
				970
				971	/**
				972	* htmlCheckParagraph
				973	* @ctxt: an HTML parser context
				974	*
				975	* Check whether a p element need to be implied before inserting
				976	* characters in the current element.
				977	*
				978	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				979	* in case of error.
				980	*/
				981
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	982	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	983	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				984	const xmlChar *tag;
				985	int i;
				986
				987	if (ctxt == NULL)
				988	return(-1);
				989	tag = ctxt->name;
				990	if (tag == NULL) {
				991	htmlAutoClose(ctxt, BAD_CAST"p");
				992	htmlCheckImplied(ctxt, BAD_CAST"p");
				993	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				994	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				995	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				996	return(1);
				997	}
				998	if (!htmlOmittedDefaultValue)
				999	return(0);
				1000	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				1001	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				1002	#ifdef DEBUG
				1003	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				1004	#endif
				1005	htmlAutoClose(ctxt, BAD_CAST"p");
				1006	htmlCheckImplied(ctxt, BAD_CAST"p");
				1007	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				1008	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1009	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				1010	return(1);
				1011	}
				1012	}
				1013	return(0);
				1014	}
				1015
				1016	/**
				1017	* htmlIsScriptAttribute:
				1018	* @name: an attribute name
				1019	*
				1020	* Check if an attribute is of content type Script
				1021	*
				1022	* Returns 1 is the attribute is a script 0 otherwise
				1023	*/
				1024	int
				1025	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1026	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1027
				1028	if (name == NULL)
				1029	return(0);
				1030	/*
				1031	* all script attributes start with 'on'
				1032	*/
				1033	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1034	return(0);
				1035	for (i = 0;
				1036	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1037	i++) {
				1038	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1039	return(1);
				1040	}
				1041	return(0);
				1042	}
				1043
				1044	/************************************************************************
				1045	* *
				1046	* The list of HTML predefined entities *
				1047	* *
				1048	************************************************************************/
				1049
				1050
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1051	static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1052	/*
				1053	* the 4 absolute ones, plus apostrophe.
				1054	*/
				1055	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1056	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1057	{ 39, "apos", "single quote" },
				1058	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1059	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1060
				1061	/*
				1062	* A bunch still in the 128-255 range
				1063	* Replacing them depend really on the charset used.
				1064	*/
				1065	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1066	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1067	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1068	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1069	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1070	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1071	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1072	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1073	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1074	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1075	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1076	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1077	{ 172, "not", "not sign, U+00AC ISOnum" },
				1078	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1079	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1080	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1081	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1082	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1083	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1084	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1085	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1086	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1087	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1088	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1089	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1090	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1091	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1092	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1093	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1094	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1095	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1096	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1097	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1098	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1099	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1100	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1101	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1102	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1103	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1104	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1105	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1106	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1107	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1108	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1109	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1110	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1111	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1112	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1113	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1114	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1115	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1116	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1117	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1118	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1119	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1120	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1121	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1122	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1123	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1124	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1125	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1126	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1127	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1128	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1129	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1130	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1131	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1132	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1133	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1134	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1135	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1136	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1137	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1138	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1139	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1140	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1141	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1142	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1143	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1144	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1145	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1146	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1147	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1148	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1149	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1150	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1151	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1152	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1153	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1154	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1155	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1156	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1157	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1158	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1159	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1160	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1161
				1162	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1163	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1164	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1165	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1166	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1167
				1168	/*
				1169	* Anything below should really be kept as entities references
				1170	*/
				1171	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1172
				1173	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1174	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1175
				1176	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1177	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1178	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1179	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1180	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1181	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1182	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1183	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1184	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1185	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1186	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1187	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1188	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1189	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1190	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1191	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1192	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1193	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1194	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1195	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1196	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1197	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1198	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1199	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1200
				1201	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1202	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1203	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1204	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1205	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1206	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1207	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1208	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1209	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1210	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1211	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1212	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1213	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1214	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1215	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1216	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1217	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1218	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1219	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1220	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1221	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1222	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1223	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1224	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1225	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1226	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1227	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1228	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1229
				1230	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1231	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1232	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1233	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1234	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1235	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1236	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1237	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1238	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1239	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1240	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1241	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1242	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1243	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1244	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1245	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1246	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1247
				1248	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1249	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1250
				1251	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1252
				1253	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1254	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1255
				1256	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1257	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1258
				1259	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1260	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1261
				1262	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1263
				1264	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1265	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1266	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1267	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1268	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1269	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1270	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1271	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1272	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1273	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1274	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1275	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1276	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1277	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1278	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1279	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1280
				1281	{ 8704, "forall","for all, U+2200 ISOtech" },
				1282	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1283	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1284	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1285	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1286	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1287	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1288	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1289	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1290	{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1291	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1292	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1293	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1294	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1295	{ 8734, "infin","infinity, U+221E ISOtech" },
				1296	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1297	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1298	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1299	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1300	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1301	{ 8747, "int", "integral, U+222B ISOtech" },
				1302	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1303	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1304	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1305	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1306	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1307	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1308	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1309	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1310	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1311	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1312	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1313	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1314	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1315	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1316	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1317	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1318	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1319	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1320	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1321	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1322	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1323	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1324	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1325	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1326
				1327	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1328	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1329	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1330	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1331
				1332	};
				1333
				1334	/************************************************************************
				1335	* *
				1336	* Commodity functions to handle entities *
				1337	* *
				1338	************************************************************************/
				1339
				1340	/*
				1341	* Macro used to grow the current buffer.
				1342	*/
				1343	#define growBuffer(buffer) { \
				1344	buffer##_size *= 2; \
Daniel Veillard	3487c8d	2002-09-05 11:33:25 +0000	[diff] [blame]	1345	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1346	if (buffer == NULL) { \
Daniel Veillard	3487c8d	2002-09-05 11:33:25 +0000	[diff] [blame]	1347	xmlGenericError(xmlGenericErrorContext, "realloc failed\n"); \
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1348	return(NULL); \
				1349	} \
				1350	}
				1351
				1352	/**
				1353	* htmlEntityLookup:
				1354	* @name: the entity name
				1355	*
				1356	* Lookup the given entity in EntitiesTable
				1357	*
				1358	* TODO: the linear scan is really ugly, an hash table is really needed.
				1359	*
				1360	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1361	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1362	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1363	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1364	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1365
				1366	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1367	sizeof(html40EntitiesTable[0]));i++) {
				1368	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1369	#ifdef DEBUG
				1370	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1371	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1372	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1373	}
				1374	}
				1375	return(NULL);
				1376	}
				1377
				1378	/**
				1379	* htmlEntityValueLookup:
				1380	* @value: the entity's unicode value
				1381	*
				1382	* Lookup the given entity in EntitiesTable
				1383	*
				1384	* TODO: the linear scan is really ugly, an hash table is really needed.
				1385	*
				1386	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1387	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1388	const htmlEntityDesc *
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1389	htmlEntityValueLookup(unsigned int value) {
				1390	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1391	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	1392	unsigned int lv = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1393	#endif
				1394
				1395	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1396	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1397	if (html40EntitiesTable[i].value >= value) {
				1398	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1399	break;
				1400	#ifdef DEBUG
				1401	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1402	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1403	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1404	}
				1405	#ifdef DEBUG
				1406	if (lv > html40EntitiesTable[i].value) {
				1407	xmlGenericError(xmlGenericErrorContext,
				1408	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1409	lv, html40EntitiesTable[i].value);
				1410	}
				1411	lv = html40EntitiesTable[i].value;
				1412	#endif
				1413	}
				1414	return(NULL);
				1415	}
				1416
				1417	/**
				1418	* UTF8ToHtml:
				1419	* @out: a pointer to an array of bytes to store the result
				1420	* @outlen: the length of @out
				1421	* @in: a pointer to an array of UTF-8 chars
				1422	* @inlen: the length of @in
				1423	*
				1424	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1425	* plus HTML entities block of chars out.
				1426	*
				1427	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1428	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1429	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1430	* The value of @outlen after return is the number of octets consumed.
				1431	*/
				1432	int
				1433	UTF8ToHtml(unsigned char* out, int *outlen,
				1434	const unsigned char* in, int *inlen) {
				1435	const unsigned char* processed = in;
				1436	const unsigned char* outend;
				1437	const unsigned char* outstart = out;
				1438	const unsigned char* instart = in;
				1439	const unsigned char* inend;
				1440	unsigned int c, d;
				1441	int trailing;
				1442
				1443	if (in == NULL) {
				1444	/*
				1445	* initialization nothing to do
				1446	*/
				1447	*outlen = 0;
				1448	*inlen = 0;
				1449	return(0);
				1450	}
				1451	inend = in + (*inlen);
				1452	outend = out + (*outlen);
				1453	while (in < inend) {
				1454	d = *in++;
				1455	if (d < 0x80) { c= d; trailing= 0; }
				1456	else if (d < 0xC0) {
				1457	/* trailing byte in leading position */
				1458	*outlen = out - outstart;
				1459	*inlen = processed - instart;
				1460	return(-2);
				1461	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1462	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1463	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1464	else {
				1465	/* no chance for this in Ascii */
				1466	*outlen = out - outstart;
				1467	*inlen = processed - instart;
				1468	return(-2);
				1469	}
				1470
				1471	if (inend - in < trailing) {
				1472	break;
				1473	}
				1474
				1475	for ( ; trailing; trailing--) {
				1476	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1477	break;
				1478	c <<= 6;
				1479	c \|= d & 0x3F;
				1480	}
				1481
				1482	/* assertion: c is a single UTF-4 value */
				1483	if (c < 0x80) {
				1484	if (out + 1 >= outend)
				1485	break;
				1486	*out++ = c;
				1487	} else {
				1488	int len;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1489	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1490
				1491	/*
				1492	* Try to lookup a predefined HTML entity for it
				1493	*/
				1494
				1495	ent = htmlEntityValueLookup(c);
				1496	if (ent == NULL) {
				1497	/* no chance for this in Ascii */
				1498	*outlen = out - outstart;
				1499	*inlen = processed - instart;
				1500	return(-2);
				1501	}
				1502	len = strlen(ent->name);
				1503	if (out + 2 + len >= outend)
				1504	break;
				1505	*out++ = '&';
				1506	memcpy(out, ent->name, len);
				1507	out += len;
				1508	*out++ = ';';
				1509	}
				1510	processed = in;
				1511	}
				1512	*outlen = out - outstart;
				1513	*inlen = processed - instart;
				1514	return(0);
				1515	}
				1516
				1517	/**
				1518	* htmlEncodeEntities:
				1519	* @out: a pointer to an array of bytes to store the result
				1520	* @outlen: the length of @out
				1521	* @in: a pointer to an array of UTF-8 chars
				1522	* @inlen: the length of @in
				1523	* @quoteChar: the quote character to escape (' or ") or zero.
				1524	*
				1525	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1526	* plus HTML entities block of chars out.
				1527	*
				1528	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1529	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1530	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1531	* The value of @outlen after return is the number of octets consumed.
				1532	*/
				1533	int
				1534	htmlEncodeEntities(unsigned char* out, int *outlen,
				1535	const unsigned char* in, int *inlen, int quoteChar) {
				1536	const unsigned char* processed = in;
				1537	const unsigned char* outend = out + (*outlen);
				1538	const unsigned char* outstart = out;
				1539	const unsigned char* instart = in;
				1540	const unsigned char* inend = in + (*inlen);
				1541	unsigned int c, d;
				1542	int trailing;
				1543
				1544	while (in < inend) {
				1545	d = *in++;
				1546	if (d < 0x80) { c= d; trailing= 0; }
				1547	else if (d < 0xC0) {
				1548	/* trailing byte in leading position */
				1549	*outlen = out - outstart;
				1550	*inlen = processed - instart;
				1551	return(-2);
				1552	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1553	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1554	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1555	else {
				1556	/* no chance for this in Ascii */
				1557	*outlen = out - outstart;
				1558	*inlen = processed - instart;
				1559	return(-2);
				1560	}
				1561
				1562	if (inend - in < trailing)
				1563	break;
				1564
				1565	while (trailing--) {
				1566	if (((d= *in++) & 0xC0) != 0x80) {
				1567	*outlen = out - outstart;
				1568	*inlen = processed - instart;
				1569	return(-2);
				1570	}
				1571	c <<= 6;
				1572	c \|= d & 0x3F;
				1573	}
				1574
				1575	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1576	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1577	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1578	if (out >= outend)
				1579	break;
				1580	*out++ = c;
				1581	} else {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1582	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1583	const char *cp;
				1584	char nbuf[16];
				1585	int len;
				1586
				1587	/*
				1588	* Try to lookup a predefined HTML entity for it
				1589	*/
				1590	ent = htmlEntityValueLookup(c);
				1591	if (ent == NULL) {
Aleksey Sanin	49cc975	2002-06-14 17:07:10 +0000	[diff] [blame]	1592	snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1593	cp = nbuf;
				1594	}
				1595	else
				1596	cp = ent->name;
				1597	len = strlen(cp);
				1598	if (out + 2 + len > outend)
				1599	break;
				1600	*out++ = '&';
				1601	memcpy(out, cp, len);
				1602	out += len;
				1603	*out++ = ';';
				1604	}
				1605	processed = in;
				1606	}
				1607	*outlen = out - outstart;
				1608	*inlen = processed - instart;
				1609	return(0);
				1610	}
				1611
				1612	/**
				1613	* htmlDecodeEntities:
				1614	* @ctxt: the parser context
				1615	* @len: the len to decode (in bytes !), -1 for no size limit
				1616	* @end: an end marker xmlChar, 0 if none
				1617	* @end2: an end marker xmlChar, 0 if none
				1618	* @end3: an end marker xmlChar, 0 if none
				1619	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1620	* Substitute the HTML entities by their value
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1621	*
				1622	* DEPRECATED !!!!
				1623	*
				1624	* Returns A newly allocated string with the substitution done. The caller
				1625	* must deallocate it !
				1626	*/
				1627	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1628	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1629	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1630	static int deprecated = 0;
				1631	if (!deprecated) {
				1632	xmlGenericError(xmlGenericErrorContext,
				1633	"htmlDecodeEntities() deprecated function reached\n");
				1634	deprecated = 1;
				1635	}
				1636	return(NULL);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1637	}
				1638
				1639	/************************************************************************
				1640	* *
				1641	* Commodity functions to handle streams *
				1642	* *
				1643	************************************************************************/
				1644
				1645	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1646	* htmlNewInputStream:
				1647	* @ctxt: an HTML parser context
				1648	*
				1649	* Create a new input stream structure
				1650	* Returns the new input stream or NULL
				1651	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1652	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1653	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1654	htmlParserInputPtr input;
				1655
				1656	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1657	if (input == NULL) {
				1658	ctxt->errNo = XML_ERR_NO_MEMORY;
				1659	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1660	ctxt->sax->error(ctxt->userData,
				1661	"malloc: couldn't allocate a new input stream\n");
				1662	return(NULL);
				1663	}
				1664	memset(input, 0, sizeof(htmlParserInput));
				1665	input->filename = NULL;
				1666	input->directory = NULL;
				1667	input->base = NULL;
				1668	input->cur = NULL;
				1669	input->buf = NULL;
				1670	input->line = 1;
				1671	input->col = 1;
				1672	input->buf = NULL;
				1673	input->free = NULL;
				1674	input->version = NULL;
				1675	input->consumed = 0;
				1676	input->length = 0;
				1677	return(input);
				1678	}
				1679
				1680
				1681	/************************************************************************
				1682	* *
				1683	* Commodity functions, cleanup needed ? *
				1684	* *
				1685	************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * The table is read-only: both the strings and the array slots are const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1700
				1701	/**
				1702	* areBlanks:
				1703	* @ctxt: an HTML parser context
				1704	* @str: a xmlChar *
				1705	* @len: the size of @str
				1706	*
				1707	* Is this a sequence of blank chars that one can ignore ?
				1708	*
				1709	* Returns 1 if ignorable 0 otherwise.
				1710	*/
				1711
				1712	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1713	unsigned int i;
				1714	int j;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1715	xmlNodePtr lastChild;
				1716
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1717	for (j = 0;j < len;j++)
				1718	if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1719
				1720	if (CUR == 0) return(1);
				1721	if (CUR != '<') return(0);
				1722	if (ctxt->name == NULL)
				1723	return(1);
				1724	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1725	return(1);
				1726	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1727	return(1);
				1728	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1729	return(1);
				1730	if (ctxt->node == NULL) return(0);
				1731	lastChild = xmlGetLastChild(ctxt->node);
				1732	if (lastChild == NULL) {
Daniel Veillard	7db3773	2001-07-12 01:20:08 +0000	[diff] [blame]	1733	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
				1734	(ctxt->node->content != NULL)) return(0);
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1735	/* keep ws in constructs like ...<b> </b>...
				1736	for all tags "b" allowing PCDATA */
				1737	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
				1738	if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
				1739	return(0);
				1740	}
				1741	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1742	} else if (xmlNodeIsText(lastChild)) {
				1743	return(0);
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1744	} else {
				1745	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
				1746	for all tags "p" allowing PCDATA */
				1747	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
				1748	if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
				1749	return(0);
				1750	}
				1751	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1752	}
				1753	return(1);
				1754	}
				1755
				1756	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1757	* htmlNewDocNoDtD:
				1758	* @URI: URI for the dtd, or NULL
				1759	* @ExternalID: the external ID of the DTD, or NULL
				1760	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1761	* Creates a new HTML document without a DTD node if @URI and @ExternalID
				1762	* are NULL
				1763	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1764	* Returns a new document, do not initialize the DTD if not provided
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1765	*/
				1766	htmlDocPtr
				1767	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1768	xmlDocPtr cur;
				1769
				1770	/*
				1771	* Allocate a new document and fill the fields.
				1772	*/
				1773	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1774	if (cur == NULL) {
				1775	xmlGenericError(xmlGenericErrorContext,
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1776	"htmlNewDocNoDtD : malloc failed\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1777	return(NULL);
				1778	}
				1779	memset(cur, 0, sizeof(xmlDoc));
				1780
				1781	cur->type = XML_HTML_DOCUMENT_NODE;
				1782	cur->version = NULL;
				1783	cur->intSubset = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1784	cur->doc = cur;
				1785	cur->name = NULL;
				1786	cur->children = NULL;
				1787	cur->extSubset = NULL;
				1788	cur->oldNs = NULL;
				1789	cur->encoding = NULL;
				1790	cur->standalone = 1;
				1791	cur->compression = 0;
				1792	cur->ids = NULL;
				1793	cur->refs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1794	cur->_private = NULL;
Daniel Veillard	b6b0fd8	2001-10-22 12:31:11 +0000	[diff] [blame]	1795	if ((ExternalID != NULL) \|\|
				1796	(URI != NULL))
Daniel Veillard	5151c06	2001-10-23 13:10:19 +0000	[diff] [blame]	1797	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1798	return(cur);
				1799	}
				1800
				1801	/**
				1802	* htmlNewDoc:
				1803	* @URI: URI for the dtd, or NULL
				1804	* @ExternalID: the external ID of the DTD, or NULL
				1805	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1806	* Creates a new HTML document
				1807	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1808	* Returns a new document
				1809	*/
				1810	htmlDocPtr
				1811	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1812	if ((URI == NULL) && (ExternalID == NULL))
				1813	return(htmlNewDocNoDtD(
Daniel Veillard	6426935	2001-05-04 17:52:34 +0000	[diff] [blame]	1814	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				1815	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1816
				1817	return(htmlNewDocNoDtD(URI, ExternalID));
				1818	}
				1819
				1820
				1821	/************************************************************************
				1822	* *
				1823	* The parser itself *
				1824	* Relates to http://www.w3.org/TR/html40 *
				1825	* *
				1826	************************************************************************/
				1827
				1828	/************************************************************************
				1829	* *
				1830	* The parser itself *
				1831	* *
				1832	************************************************************************/
				1833
Daniel Veillard	e55e8e4	2003-01-10 12:50:02 +0000	[diff] [blame]	1834	static xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
				1835
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1836	/**
				1837	* htmlParseHTMLName:
				1838	* @ctxt: an HTML parser context
				1839	*
				1840	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1841	* since HTML names are not case-sensitive.
				1842	*
				1843	* Returns the Tag Name parsed or NULL
				1844	*/
				1845
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1846	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1847	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1848	xmlChar *ret = NULL;
				1849	int i = 0;
				1850	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1851
				1852	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1853	(CUR != ':')) return(NULL);
				1854
				1855	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1856	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1857	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1858	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1859	else loc[i] = CUR;
				1860	i++;
				1861
				1862	NEXT;
				1863	}
				1864
				1865	ret = xmlStrndup(loc, i);
				1866
				1867	return(ret);
				1868	}
				1869
				1870	/**
				1871	* htmlParseName:
				1872	* @ctxt: an HTML parser context
				1873	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1874	* parse an HTML name, this routine is case sensitive.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1875	*
				1876	* Returns the Name parsed or NULL
				1877	*/
				1878
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1879	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1880	htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillard	e55e8e4	2003-01-10 12:50:02 +0000	[diff] [blame]	1881	const xmlChar *in;
				1882	xmlChar *ret;
				1883	int count = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1884
				1885	GROW;
Daniel Veillard	e55e8e4	2003-01-10 12:50:02 +0000	[diff] [blame]	1886
				1887	/*
				1888	* Accelerator for simple ASCII names
				1889	*/
				1890	in = ctxt->input->cur;
				1891	if (((in >= 0x61) && (in <= 0x7A)) \|\|
				1892	((in >= 0x41) && (in <= 0x5A)) \|\|
				1893	(in == '_') \|\| (in == ':')) {
				1894	in++;
				1895	while (((in >= 0x61) && (in <= 0x7A)) \|\|
				1896	((in >= 0x41) && (in <= 0x5A)) \|\|
				1897	((in >= 0x30) && (in <= 0x39)) \|\|
				1898	(in == '_') \|\| (in == '-') \|\|
				1899	(in == ':') \|\| (in == '.'))
				1900	in++;
				1901	if ((in > 0) && (in < 0x80)) {
				1902	count = in - ctxt->input->cur;
				1903	ret = xmlStrndup(ctxt->input->cur, count);
				1904	ctxt->input->cur = in;
				1905	return(ret);
				1906	}
				1907	}
				1908	return(htmlParseNameComplex(ctxt));
				1909	}
				1910
				1911	static xmlChar *
				1912	htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
				1913	xmlChar buf[XML_MAX_NAMELEN + 5];
				1914	int len = 0, l;
				1915	int c;
				1916	int count = 0;
				1917
				1918	/*
				1919	* Handler for more complex cases
				1920	*/
				1921	GROW;
				1922	c = CUR_CHAR(l);
				1923	if ((c == ' ') \|\| (c == '>') \|\| (c == '/') \|\| /* accelerators */
				1924	(!IS_LETTER(c) && (c != '_') &&
				1925	(c != ':'))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1926	return(NULL);
				1927	}
				1928
Daniel Veillard	e55e8e4	2003-01-10 12:50:02 +0000	[diff] [blame]	1929	while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
				1930	((IS_LETTER(c)) \|\| (IS_DIGIT(c)) \|\|
				1931	(c == '.') \|\| (c == '-') \|\|
				1932	(c == '_') \|\| (c == ':') \|\|
				1933	(IS_COMBINING(c)) \|\|
				1934	(IS_EXTENDER(c)))) {
				1935	if (count++ > 100) {
				1936	count = 0;
				1937	GROW;
				1938	}
				1939	COPY_BUF(l,buf,len,c);
				1940	NEXTL(l);
				1941	c = CUR_CHAR(l);
				1942	if (len >= XML_MAX_NAMELEN) {
				1943	/*
				1944	* Okay someone managed to make a huge name, so he's ready to pay
				1945	* for the processing speed.
				1946	*/
				1947	xmlChar *buffer;
				1948	int max = len * 2;
				1949
				1950	buffer = (xmlChar ) xmlMalloc(max sizeof(xmlChar));
				1951	if (buffer == NULL) {
				1952	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1953	ctxt->sax->error(ctxt->userData,
				1954	"htmlParseNameComplex: out of memory\n");
				1955	return(NULL);
				1956	}
				1957	memcpy(buffer, buf, len);
				1958	while ((IS_LETTER(c)) \|\| (IS_DIGIT(c)) \|\| /* test bigname.xml */
				1959	(c == '.') \|\| (c == '-') \|\|
				1960	(c == '_') \|\| (c == ':') \|\|
				1961	(IS_COMBINING(c)) \|\|
				1962	(IS_EXTENDER(c))) {
				1963	if (count++ > 100) {
				1964	count = 0;
				1965	GROW;
				1966	}
				1967	if (len + 10 > max) {
				1968	max *= 2;
				1969	buffer = (xmlChar *) xmlRealloc(buffer,
				1970	max * sizeof(xmlChar));
				1971	if (buffer == NULL) {
				1972	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1973	ctxt->sax->error(ctxt->userData,
				1974	"htmlParseNameComplex: out of memory\n");
				1975	return(NULL);
				1976	}
				1977	}
				1978	COPY_BUF(l,buffer,len,c);
				1979	NEXTL(l);
				1980	c = CUR_CHAR(l);
				1981	}
				1982	buffer[len] = 0;
				1983	return(buffer);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1984	}
				1985	}
				1986	return(xmlStrndup(buf, len));
				1987	}
				1988
Daniel Veillard	e55e8e4	2003-01-10 12:50:02 +0000	[diff] [blame]	1989
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1990	/**
				1991	* htmlParseHTMLAttribute:
				1992	* @ctxt: an HTML parser context
				1993	* @stop: a char stop value
				1994	*
				1995	* parse an HTML attribute value till the stop (quote), if
				1996	* stop is 0 then it stops at the first space
				1997	*
				1998	* Returns the attribute parsed or NULL
				1999	*/
				2000
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2001	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2002	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				2003	xmlChar *buffer = NULL;
				2004	int buffer_size = 0;
				2005	xmlChar *out = NULL;
				2006	xmlChar *name = NULL;
				2007
				2008	xmlChar *cur = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2009	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2010
				2011	/*
				2012	* allocate a translation buffer.
				2013	*/
				2014	buffer_size = HTML_PARSER_BUFFER_SIZE;
				2015	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				2016	if (buffer == NULL) {
Daniel Veillard	3487c8d	2002-09-05 11:33:25 +0000	[diff] [blame]	2017	xmlGenericError(xmlGenericErrorContext,
				2018	"htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2019	return(NULL);
				2020	}
				2021	out = buffer;
				2022
				2023	/*
				2024	* Ok loop until we reach one of the ending chars
				2025	*/
Daniel Veillard	957fdcf	2001-11-06 22:50:19 +0000	[diff] [blame]	2026	while ((CUR != 0) && (CUR != stop)) {
				2027	if ((stop == 0) && (CUR == '>')) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2028	if ((stop == 0) && (IS_BLANK(CUR))) break;
				2029	if (CUR == '&') {
				2030	if (NXT(1) == '#') {
				2031	unsigned int c;
				2032	int bits;
				2033
				2034	c = htmlParseCharRef(ctxt);
				2035	if (c < 0x80)
				2036	{ *out++ = c; bits= -6; }
				2037	else if (c < 0x800)
				2038	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2039	else if (c < 0x10000)
				2040	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2041	else
				2042	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2043
				2044	for ( ; bits >= 0; bits-= 6) {
				2045	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2046	}
Daniel Veillard	ce02dbc	2002-10-22 19:14:58 +0000	[diff] [blame]	2047
				2048	if (out - buffer > buffer_size - 100) {
				2049	int indx = out - buffer;
				2050
				2051	growBuffer(buffer);
				2052	out = &buffer[indx];
				2053	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2054	} else {
				2055	ent = htmlParseEntityRef(ctxt, &name);
				2056	if (name == NULL) {
				2057	*out++ = '&';
				2058	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2059	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2060
				2061	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2062	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2063	}
				2064	} else if (ent == NULL) {
				2065	*out++ = '&';
				2066	cur = name;
				2067	while (*cur != 0) {
				2068	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2069	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2070
				2071	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2072	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2073	}
				2074	out++ = cur++;
				2075	}
				2076	xmlFree(name);
				2077	} else {
				2078	unsigned int c;
				2079	int bits;
				2080
				2081	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2082	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2083
				2084	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2085	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2086	}
				2087	c = (xmlChar)ent->value;
				2088	if (c < 0x80)
				2089	{ *out++ = c; bits= -6; }
				2090	else if (c < 0x800)
				2091	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2092	else if (c < 0x10000)
				2093	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2094	else
				2095	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2096
				2097	for ( ; bits >= 0; bits-= 6) {
				2098	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2099	}
				2100	xmlFree(name);
				2101	}
				2102	}
				2103	} else {
				2104	unsigned int c;
				2105	int bits, l;
				2106
				2107	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2108	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2109
				2110	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2111	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2112	}
				2113	c = CUR_CHAR(l);
				2114	if (c < 0x80)
				2115	{ *out++ = c; bits= -6; }
				2116	else if (c < 0x800)
				2117	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2118	else if (c < 0x10000)
				2119	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2120	else
				2121	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2122
				2123	for ( ; bits >= 0; bits-= 6) {
				2124	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2125	}
				2126	NEXT;
				2127	}
				2128	}
				2129	*out++ = 0;
				2130	return(buffer);
				2131	}
				2132
				2133	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2134	* htmlParseEntityRef:
				2135	* @ctxt: an HTML parser context
				2136	* @str: location to store the entity name
				2137	*
				2138	* parse an HTML ENTITY references
				2139	*
				2140	* [68] EntityRef ::= '&' Name ';'
				2141	*
				2142	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2143	* if non-NULL *str will have to be freed by the caller.
				2144	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2145	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2146	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2147	xmlChar *name;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2148	const htmlEntityDesc * ent = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2149	*str = NULL;
				2150
				2151	if (CUR == '&') {
				2152	NEXT;
				2153	name = htmlParseName(ctxt);
				2154	if (name == NULL) {
				2155	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2156	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2157	ctxt->wellFormed = 0;
				2158	} else {
				2159	GROW;
				2160	if (CUR == ';') {
				2161	*str = name;
				2162
				2163	/*
				2164	* Lookup the entity in the table.
				2165	*/
				2166	ent = htmlEntityLookup(name);
				2167	if (ent != NULL) /* OK that's ugly !!! */
				2168	NEXT;
				2169	} else {
				2170	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2171	ctxt->sax->error(ctxt->userData,
				2172	"htmlParseEntityRef: expecting ';'\n");
				2173	*str = name;
				2174	}
				2175	}
				2176	}
				2177	return(ent);
				2178	}
				2179
				2180	/**
				2181	* htmlParseAttValue:
				2182	* @ctxt: an HTML parser context
				2183	*
				2184	* parse a value for an attribute
				2185	* Note: the parser won't do substitution of entities here, this
				2186	* will be handled later in xmlStringGetNodeList, unless it was
				2187	* asked for ctxt->replaceEntities != 0
				2188	*
				2189	* Returns the AttValue parsed or NULL.
				2190	*/
				2191
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2192	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2193	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2194	xmlChar *ret = NULL;
				2195
				2196	if (CUR == '"') {
				2197	NEXT;
				2198	ret = htmlParseHTMLAttribute(ctxt, '"');
				2199	if (CUR != '"') {
				2200	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2201	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2202	ctxt->wellFormed = 0;
				2203	} else
				2204	NEXT;
				2205	} else if (CUR == '\'') {
				2206	NEXT;
				2207	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2208	if (CUR != '\'') {
				2209	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2210	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2211	ctxt->wellFormed = 0;
				2212	} else
				2213	NEXT;
				2214	} else {
				2215	/*
				2216	* That's an HTMLism, the attribute value may not be quoted
				2217	*/
				2218	ret = htmlParseHTMLAttribute(ctxt, 0);
				2219	if (ret == NULL) {
				2220	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2221	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2222	ctxt->wellFormed = 0;
				2223	}
				2224	}
				2225	return(ret);
				2226	}
				2227
				2228	/**
				2229	* htmlParseSystemLiteral:
				2230	* @ctxt: an HTML parser context
				2231	*
				2232	* parse an HTML Literal
				2233	*
				2234	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2235	*
				2236	* Returns the SystemLiteral parsed or NULL
				2237	*/
				2238
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2239	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2240	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2241	const xmlChar *q;
				2242	xmlChar *ret = NULL;
				2243
				2244	if (CUR == '"') {
				2245	NEXT;
				2246	q = CUR_PTR;
				2247	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2248	NEXT;
				2249	if (!IS_CHAR(CUR)) {
				2250	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2251	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2252	ctxt->wellFormed = 0;
				2253	} else {
				2254	ret = xmlStrndup(q, CUR_PTR - q);
				2255	NEXT;
				2256	}
				2257	} else if (CUR == '\'') {
				2258	NEXT;
				2259	q = CUR_PTR;
				2260	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2261	NEXT;
				2262	if (!IS_CHAR(CUR)) {
				2263	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2264	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2265	ctxt->wellFormed = 0;
				2266	} else {
				2267	ret = xmlStrndup(q, CUR_PTR - q);
				2268	NEXT;
				2269	}
				2270	} else {
				2271	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2272	ctxt->sax->error(ctxt->userData,
				2273	"SystemLiteral \" or ' expected\n");
				2274	ctxt->wellFormed = 0;
				2275	}
				2276
				2277	return(ret);
				2278	}
				2279
				2280	/**
				2281	* htmlParsePubidLiteral:
				2282	* @ctxt: an HTML parser context
				2283	*
				2284	* parse an HTML public literal
				2285	*
				2286	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2287	*
				2288	* Returns the PubidLiteral parsed or NULL.
				2289	*/
				2290
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2291	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2292	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2293	const xmlChar *q;
				2294	xmlChar *ret = NULL;
				2295	/*
				2296	* Name ::= (Letter \| '_') (NameChar)*
				2297	*/
				2298	if (CUR == '"') {
				2299	NEXT;
				2300	q = CUR_PTR;
				2301	while (IS_PUBIDCHAR(CUR)) NEXT;
				2302	if (CUR != '"') {
				2303	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2304	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2305	ctxt->wellFormed = 0;
				2306	} else {
				2307	ret = xmlStrndup(q, CUR_PTR - q);
				2308	NEXT;
				2309	}
				2310	} else if (CUR == '\'') {
				2311	NEXT;
				2312	q = CUR_PTR;
				2313	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2314	NEXT;
				2315	if (!IS_LETTER(CUR)) {
				2316	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2317	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2318	ctxt->wellFormed = 0;
				2319	} else {
				2320	ret = xmlStrndup(q, CUR_PTR - q);
				2321	NEXT;
				2322	}
				2323	} else {
				2324	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2325	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2326	ctxt->wellFormed = 0;
				2327	}
				2328
				2329	return(ret);
				2330	}
				2331
				2332	/**
				2333	* htmlParseScript:
				2334	* @ctxt: an HTML parser context
				2335	*
				2336	* parse the content of an HTML SCRIPT or STYLE element
				2337	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2338	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2339	* http://www.w3.org/TR/html4/types.html#type-script
				2340	* http://www.w3.org/TR/html4/types.html#h-6.15
				2341	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2342	*
				2343	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2344	* element and the value of intrinsic event attributes. User agents must
				2345	* not evaluate script data as HTML markup but instead must pass it on as
				2346	* data to a script engine.
				2347	* NOTES:
				2348	* - The content is passed like CDATA
				2349	* - the attributes for style and scripting "onXXX" are also described
				2350	* as CDATA but SGML allows entities references in attributes so their
				2351	* processing is identical as other attributes
				2352	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2353	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2354	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2355	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2356	int nbchar = 0;
				2357	xmlChar cur;
				2358
				2359	SHRINK;
				2360	cur = CUR;
				2361	while (IS_CHAR(cur)) {
Daniel Veillard	c1f7834	2001-11-10 11:43:05 +0000	[diff] [blame]	2362	if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
				2363	(NXT(3) == '-')) {
				2364	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2365	if (ctxt->sax->cdataBlock!= NULL) {
				2366	/*
				2367	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2368	*/
				2369	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2370	}
				2371	}
				2372	nbchar = 0;
				2373	htmlParseComment(ctxt);
				2374	cur = CUR;
				2375	continue;
				2376	} else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2377	/*
				2378	* One should break here, the specification is clear:
				2379	* Authors should therefore escape "</" within the content.
				2380	* Escape mechanisms are specific to each scripting or
				2381	* style sheet language.
				2382	*/
				2383	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2384	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2385	break; /* while */
				2386	}
				2387	buf[nbchar++] = cur;
				2388	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2389	if (ctxt->sax->cdataBlock!= NULL) {
				2390	/*
				2391	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2392	*/
				2393	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2394	}
				2395	nbchar = 0;
				2396	}
				2397	NEXT;
				2398	cur = CUR;
				2399	}
				2400	if (!(IS_CHAR(cur))) {
				2401	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2402	ctxt->sax->error(ctxt->userData,
				2403	"Invalid char in CDATA 0x%X\n", cur);
				2404	ctxt->wellFormed = 0;
				2405	NEXT;
				2406	}
				2407
				2408	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2409	if (ctxt->sax->cdataBlock!= NULL) {
				2410	/*
				2411	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2412	*/
				2413	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2414	}
				2415	}
				2416	}
				2417
				2418
				2419	/**
				2420	* htmlParseCharData:
				2421	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2422	*
				2423	* parse a CharData section.
				2424	* if we are within a CDATA section ']]>' marks an end of section.
				2425	*
				2426	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2427	*/
				2428
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2429	static void
				2430	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2431	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2432	int nbchar = 0;
				2433	int cur, l;
				2434
				2435	SHRINK;
				2436	cur = CUR_CHAR(l);
				2437	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2438	((cur != '&') \|\| (ctxt->token == '&')) &&
				2439	(IS_CHAR(cur))) {
				2440	COPY_BUF(l,buf,nbchar,cur);
				2441	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2442	/*
				2443	* Ok the segment is to be consumed as chars.
				2444	*/
				2445	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2446	if (areBlanks(ctxt, buf, nbchar)) {
				2447	if (ctxt->sax->ignorableWhitespace != NULL)
				2448	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2449	buf, nbchar);
				2450	} else {
				2451	htmlCheckParagraph(ctxt);
				2452	if (ctxt->sax->characters != NULL)
				2453	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2454	}
				2455	}
				2456	nbchar = 0;
				2457	}
				2458	NEXTL(l);
				2459	cur = CUR_CHAR(l);
				2460	}
				2461	if (nbchar != 0) {
				2462	/*
				2463	* Ok the segment is to be consumed as chars.
				2464	*/
				2465	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2466	if (areBlanks(ctxt, buf, nbchar)) {
				2467	if (ctxt->sax->ignorableWhitespace != NULL)
				2468	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2469	} else {
				2470	htmlCheckParagraph(ctxt);
				2471	if (ctxt->sax->characters != NULL)
				2472	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2473	}
				2474	}
Daniel Veillard	7cc95c0	2001-10-17 15:45:12 +0000	[diff] [blame]	2475	} else {
				2476	/*
				2477	* Loop detection
				2478	*/
				2479	if (cur == 0)
				2480	ctxt->instate = XML_PARSER_EOF;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2481	}
				2482	}
				2483
				2484	/**
				2485	* htmlParseExternalID:
				2486	* @ctxt: an HTML parser context
				2487	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2488	*
				2489	* Parse an External ID or a Public ID
				2490	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2491	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2492	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2493	*
				2494	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2495	*
				2496	* Returns the function returns SystemLiteral and in the second
				2497	* case publicID receives PubidLiteral, is strict is off
				2498	* it is possible to return NULL and have publicID set.
				2499	*/
				2500
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2501	static xmlChar *
				2502	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2503	xmlChar *URI = NULL;
				2504
				2505	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2506	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2507	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2508	SKIP(6);
				2509	if (!IS_BLANK(CUR)) {
				2510	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2511	ctxt->sax->error(ctxt->userData,
				2512	"Space required after 'SYSTEM'\n");
				2513	ctxt->wellFormed = 0;
				2514	}
				2515	SKIP_BLANKS;
				2516	URI = htmlParseSystemLiteral(ctxt);
				2517	if (URI == NULL) {
				2518	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2519	ctxt->sax->error(ctxt->userData,
				2520	"htmlParseExternalID: SYSTEM, no URI\n");
				2521	ctxt->wellFormed = 0;
				2522	}
				2523	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2524	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2525	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2526	SKIP(6);
				2527	if (!IS_BLANK(CUR)) {
				2528	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2529	ctxt->sax->error(ctxt->userData,
				2530	"Space required after 'PUBLIC'\n");
				2531	ctxt->wellFormed = 0;
				2532	}
				2533	SKIP_BLANKS;
				2534	*publicID = htmlParsePubidLiteral(ctxt);
				2535	if (*publicID == NULL) {
				2536	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2537	ctxt->sax->error(ctxt->userData,
				2538	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2539	ctxt->wellFormed = 0;
				2540	}
				2541	SKIP_BLANKS;
				2542	if ((CUR == '"') \|\| (CUR == '\'')) {
				2543	URI = htmlParseSystemLiteral(ctxt);
				2544	}
				2545	}
				2546	return(URI);
				2547	}
				2548
				2549	/**
				2550	* htmlParseComment:
				2551	* @ctxt: an HTML parser context
				2552	*
				2553	* Parse an XML (SGML) comment <!-- .... -->
				2554	*
				2555	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2556	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2557	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2558	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2559	xmlChar *buf = NULL;
				2560	int len;
				2561	int size = HTML_PARSER_BUFFER_SIZE;
				2562	int q, ql;
				2563	int r, rl;
				2564	int cur, l;
				2565	xmlParserInputState state;
				2566
				2567	/*
				2568	* Check that there is a comment right here.
				2569	*/
				2570	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2571	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2572
				2573	state = ctxt->instate;
				2574	ctxt->instate = XML_PARSER_COMMENT;
				2575	SHRINK;
				2576	SKIP(4);
				2577	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2578	if (buf == NULL) {
				2579	xmlGenericError(xmlGenericErrorContext,
				2580	"malloc of %d byte failed\n", size);
				2581	ctxt->instate = state;
				2582	return;
				2583	}
				2584	q = CUR_CHAR(ql);
				2585	NEXTL(ql);
				2586	r = CUR_CHAR(rl);
				2587	NEXTL(rl);
				2588	cur = CUR_CHAR(l);
				2589	len = 0;
				2590	while (IS_CHAR(cur) &&
				2591	((cur != '>') \|\|
				2592	(r != '-') \|\| (q != '-'))) {
				2593	if (len + 5 >= size) {
				2594	size *= 2;
				2595	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2596	if (buf == NULL) {
				2597	xmlGenericError(xmlGenericErrorContext,
				2598	"realloc of %d byte failed\n", size);
				2599	ctxt->instate = state;
				2600	return;
				2601	}
				2602	}
				2603	COPY_BUF(ql,buf,len,q);
				2604	q = r;
				2605	ql = rl;
				2606	r = cur;
				2607	rl = l;
				2608	NEXTL(l);
				2609	cur = CUR_CHAR(l);
				2610	if (cur == 0) {
				2611	SHRINK;
				2612	GROW;
				2613	cur = CUR_CHAR(l);
				2614	}
				2615	}
				2616	buf[len] = 0;
				2617	if (!IS_CHAR(cur)) {
				2618	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2619	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2620	ctxt->sax->error(ctxt->userData,
				2621	"Comment not terminated \n<!--%.50s\n", buf);
				2622	ctxt->wellFormed = 0;
				2623	xmlFree(buf);
				2624	} else {
				2625	NEXT;
				2626	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2627	(!ctxt->disableSAX))
				2628	ctxt->sax->comment(ctxt->userData, buf);
				2629	xmlFree(buf);
				2630	}
				2631	ctxt->instate = state;
				2632	}
				2633
				2634	/**
				2635	* htmlParseCharRef:
				2636	* @ctxt: an HTML parser context
				2637	*
				2638	* parse Reference declarations
				2639	*
				2640	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2641	* '&#x' [0-9a-fA-F]+ ';'
				2642	*
				2643	* Returns the value parsed (as an int)
				2644	*/
				2645	int
				2646	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2647	int val = 0;
				2648
				2649	if ((CUR == '&') && (NXT(1) == '#') &&
				2650	(NXT(2) == 'x')) {
				2651	SKIP(3);
				2652	while (CUR != ';') {
				2653	if ((CUR >= '0') && (CUR <= '9'))
				2654	val = val * 16 + (CUR - '0');
				2655	else if ((CUR >= 'a') && (CUR <= 'f'))
				2656	val = val * 16 + (CUR - 'a') + 10;
				2657	else if ((CUR >= 'A') && (CUR <= 'F'))
				2658	val = val * 16 + (CUR - 'A') + 10;
				2659	else {
				2660	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2661	ctxt->sax->error(ctxt->userData,
				2662	"htmlParseCharRef: invalid hexadecimal value\n");
				2663	ctxt->wellFormed = 0;
				2664	return(0);
				2665	}
				2666	NEXT;
				2667	}
				2668	if (CUR == ';')
				2669	NEXT;
				2670	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2671	SKIP(2);
				2672	while (CUR != ';') {
				2673	if ((CUR >= '0') && (CUR <= '9'))
				2674	val = val * 10 + (CUR - '0');
				2675	else {
				2676	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2677	ctxt->sax->error(ctxt->userData,
				2678	"htmlParseCharRef: invalid decimal value\n");
				2679	ctxt->wellFormed = 0;
				2680	return(0);
				2681	}
				2682	NEXT;
				2683	}
				2684	if (CUR == ';')
				2685	NEXT;
				2686	} else {
				2687	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2688	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2689	ctxt->wellFormed = 0;
				2690	}
				2691	/*
				2692	* Check the value IS_CHAR ...
				2693	*/
				2694	if (IS_CHAR(val)) {
				2695	return(val);
				2696	} else {
				2697	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2698	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2699	val);
				2700	ctxt->wellFormed = 0;
				2701	}
				2702	return(0);
				2703	}
				2704
				2705
				2706	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame]	2707	* htmlParseDocTypeDecl:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2708	* @ctxt: an HTML parser context
				2709	*
				2710	* parse a DOCTYPE declaration
				2711	*
				2712	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2713	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2714	*/
				2715
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2716	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2717	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2718	xmlChar *name;
				2719	xmlChar *ExternalID = NULL;
				2720	xmlChar *URI = NULL;
				2721
				2722	/*
				2723	* We know that '<!DOCTYPE' has been detected.
				2724	*/
				2725	SKIP(9);
				2726
				2727	SKIP_BLANKS;
				2728
				2729	/*
				2730	* Parse the DOCTYPE name.
				2731	*/
				2732	name = htmlParseName(ctxt);
				2733	if (name == NULL) {
				2734	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2735	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2736	ctxt->wellFormed = 0;
				2737	}
				2738	/*
				2739	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2740	*/
				2741
				2742	SKIP_BLANKS;
				2743
				2744	/*
				2745	* Check for SystemID and ExternalID
				2746	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2747	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2748	SKIP_BLANKS;
				2749
				2750	/*
				2751	* We should be at the end of the DOCTYPE declaration.
				2752	*/
				2753	if (CUR != '>') {
				2754	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard	f6ed8bc	2001-10-02 09:22:47 +0000	[diff] [blame]	2755	ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2756	ctxt->wellFormed = 0;
				2757	/* We shouldn't try to resynchronize ... */
				2758	}
				2759	NEXT;
				2760
				2761	/*
				2762	* Create or update the document accordingly to the DOCTYPE
				2763	*/
				2764	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2765	(!ctxt->disableSAX))
				2766	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2767
				2768	/*
				2769	* Cleanup, since we don't use all those identifiers
				2770	*/
				2771	if (URI != NULL) xmlFree(URI);
				2772	if (ExternalID != NULL) xmlFree(ExternalID);
				2773	if (name != NULL) xmlFree(name);
				2774	}
				2775
				2776	/**
				2777	* htmlParseAttribute:
				2778	* @ctxt: an HTML parser context
				2779	* @value: a xmlChar ** used to store the value of the attribute
				2780	*
				2781	* parse an attribute
				2782	*
				2783	* [41] Attribute ::= Name Eq AttValue
				2784	*
				2785	* [25] Eq ::= S? '=' S?
				2786	*
				2787	* With namespace:
				2788	*
				2789	* [NS 11] Attribute ::= QName Eq AttValue
				2790	*
				2791	* Also the case QName == xmlns:??? is handled independently as a namespace
				2792	* definition.
				2793	*
				2794	* Returns the attribute name, and the value in *value.
				2795	*/
				2796
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2797	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2798	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2799	xmlChar name, val = NULL;
				2800
				2801	*value = NULL;
				2802	name = htmlParseHTMLName(ctxt);
				2803	if (name == NULL) {
				2804	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2805	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2806	ctxt->wellFormed = 0;
				2807	return(NULL);
				2808	}
				2809
				2810	/*
				2811	* read the value
				2812	*/
				2813	SKIP_BLANKS;
				2814	if (CUR == '=') {
				2815	NEXT;
				2816	SKIP_BLANKS;
				2817	val = htmlParseAttValue(ctxt);
				2818	/******
				2819	} else {
				2820	* TODO : some attribute must have values, some may not
				2821	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2822	ctxt->sax->warning(ctxt->userData,
				2823	"No value for attribute %s\n", name); */
				2824	}
				2825
				2826	*value = val;
				2827	return(name);
				2828	}
				2829
				2830	/**
				2831	* htmlCheckEncoding:
				2832	* @ctxt: an HTML parser context
				2833	* @attvalue: the attribute value
				2834	*
				2835	* Checks an http-equiv attribute from a Meta tag to detect
				2836	* the encoding
				2837	* If a new encoding is detected the parser is switched to decode
				2838	* it and pass UTF8
				2839	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2840	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2841	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2842	const xmlChar *encoding;
				2843
				2844	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2845	return;
				2846
				2847	/* do not change encoding */
				2848	if (ctxt->input->encoding != NULL)
				2849	return;
				2850
				2851	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2852	if (encoding != NULL) {
				2853	encoding += 8;
				2854	} else {
				2855	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2856	if (encoding != NULL)
				2857	encoding += 9;
				2858	}
				2859	if (encoding != NULL) {
				2860	xmlCharEncoding enc;
				2861	xmlCharEncodingHandlerPtr handler;
				2862
				2863	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2864
				2865	if (ctxt->input->encoding != NULL)
				2866	xmlFree((xmlChar *) ctxt->input->encoding);
				2867	ctxt->input->encoding = xmlStrdup(encoding);
				2868
				2869	enc = xmlParseCharEncoding((const char *) encoding);
				2870	/*
				2871	* registered set of known encodings
				2872	*/
				2873	if (enc != XML_CHAR_ENCODING_ERROR) {
				2874	xmlSwitchEncoding(ctxt, enc);
				2875	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2876	} else {
				2877	/*
				2878	* fallback for unknown encodings
				2879	*/
				2880	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2881	if (handler != NULL) {
				2882	xmlSwitchToEncoding(ctxt, handler);
				2883	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2884	} else {
				2885	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2886	}
				2887	}
				2888
				2889	if ((ctxt->input->buf != NULL) &&
				2890	(ctxt->input->buf->encoder != NULL) &&
				2891	(ctxt->input->buf->raw != NULL) &&
				2892	(ctxt->input->buf->buffer != NULL)) {
				2893	int nbchars;
				2894	int processed;
				2895
				2896	/*
				2897	* convert as much as possible to the parser reading buffer.
				2898	*/
				2899	processed = ctxt->input->cur - ctxt->input->base;
				2900	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2901	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2902	ctxt->input->buf->buffer,
				2903	ctxt->input->buf->raw);
				2904	if (nbchars < 0) {
				2905	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2906	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2907	ctxt->sax->error(ctxt->userData,
				2908	"htmlCheckEncoding: encoder error\n");
				2909	}
				2910	ctxt->input->base =
				2911	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2912	}
				2913	}
				2914	}
				2915
				2916	/**
				2917	* htmlCheckMeta:
				2918	* @ctxt: an HTML parser context
				2919	* @atts: the attributes values
				2920	*
				2921	* Checks an attributes from a Meta tag
				2922	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2923	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2924	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2925	int i;
				2926	const xmlChar att, value;
				2927	int http = 0;
				2928	const xmlChar *content = NULL;
				2929
				2930	if ((ctxt == NULL) \|\| (atts == NULL))
				2931	return;
				2932
				2933	i = 0;
				2934	att = atts[i++];
				2935	while (att != NULL) {
				2936	value = atts[i++];
				2937	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2938	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2939	http = 1;
				2940	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2941	content = value;
				2942	att = atts[i++];
				2943	}
				2944	if ((http) && (content != NULL))
				2945	htmlCheckEncoding(ctxt, content);
				2946
				2947	}
				2948
				2949	/**
				2950	* htmlParseStartTag:
				2951	* @ctxt: an HTML parser context
				2952	*
				2953	* parse a start of tag either for rule element or
				2954	* EmptyElement. In both case we don't parse the tag closing chars.
				2955	*
				2956	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2957	*
				2958	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2959	*
				2960	* With namespace:
				2961	*
				2962	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2963	*
				2964	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2965	*
				2966	*/
				2967
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2968	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2969	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2970	xmlChar *name;
				2971	xmlChar *attname;
				2972	xmlChar *attvalue;
				2973	const xmlChar **atts = NULL;
				2974	int nbatts = 0;
				2975	int maxatts = 0;
				2976	int meta = 0;
				2977	int i;
				2978
				2979	if (CUR != '<') return;
				2980	NEXT;
				2981
				2982	GROW;
				2983	name = htmlParseHTMLName(ctxt);
				2984	if (name == NULL) {
				2985	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2986	ctxt->sax->error(ctxt->userData,
				2987	"htmlParseStartTag: invalid element name\n");
				2988	ctxt->wellFormed = 0;
				2989	/* Dump the bogus tag like browsers do */
				2990	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2991	NEXT;
				2992	return;
				2993	}
				2994	if (xmlStrEqual(name, BAD_CAST"meta"))
				2995	meta = 1;
				2996
				2997	/*
				2998	* Check for auto-closure of HTML elements.
				2999	*/
				3000	htmlAutoClose(ctxt, name);
				3001
				3002	/*
				3003	* Check for implied HTML elements.
				3004	*/
				3005	htmlCheckImplied(ctxt, name);
				3006
				3007	/*
				3008	* Avoid html at any level > 0, head at any level != 1
				3009	* or any attempt to recurse body
				3010	*/
				3011	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				3012	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3013	ctxt->sax->error(ctxt->userData,
				3014	"htmlParseStartTag: misplaced <html> tag\n");
				3015	ctxt->wellFormed = 0;
				3016	xmlFree(name);
				3017	return;
				3018	}
				3019	if ((ctxt->nameNr != 1) &&
				3020	(xmlStrEqual(name, BAD_CAST"head"))) {
				3021	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3022	ctxt->sax->error(ctxt->userData,
				3023	"htmlParseStartTag: misplaced <head> tag\n");
				3024	ctxt->wellFormed = 0;
				3025	xmlFree(name);
				3026	return;
				3027	}
				3028	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3029	int indx;
				3030	for (indx = 0;indx < ctxt->nameNr;indx++) {
				3031	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3032	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3033	ctxt->sax->error(ctxt->userData,
				3034	"htmlParseStartTag: misplaced <body> tag\n");
				3035	ctxt->wellFormed = 0;
				3036	xmlFree(name);
				3037	return;
				3038	}
				3039	}
				3040	}
				3041
				3042	/*
				3043	* Now parse the attributes, it ends up with the ending
				3044	*
				3045	* (S Attribute)* S?
				3046	*/
				3047	SKIP_BLANKS;
				3048	while ((IS_CHAR(CUR)) &&
				3049	(CUR != '>') &&
				3050	((CUR != '/') \|\| (NXT(1) != '>'))) {
				3051	long cons = ctxt->nbChars;
				3052
				3053	GROW;
				3054	attname = htmlParseAttribute(ctxt, &attvalue);
				3055	if (attname != NULL) {
				3056
				3057	/*
				3058	* Well formedness requires at most one declaration of an attribute
				3059	*/
				3060	for (i = 0; i < nbatts;i += 2) {
				3061	if (xmlStrEqual(atts[i], attname)) {
				3062	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3063	ctxt->sax->error(ctxt->userData,
				3064	"Attribute %s redefined\n",
				3065	attname);
				3066	ctxt->wellFormed = 0;
				3067	xmlFree(attname);
				3068	if (attvalue != NULL)
				3069	xmlFree(attvalue);
				3070	goto failed;
				3071	}
				3072	}
				3073
				3074	/*
				3075	* Add the pair to atts
				3076	*/
				3077	if (atts == NULL) {
				3078	maxatts = 10;
				3079	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				3080	if (atts == NULL) {
				3081	xmlGenericError(xmlGenericErrorContext,
				3082	"malloc of %ld byte failed\n",
				3083	maxatts * (long)sizeof(xmlChar *));
				3084	if (name != NULL) xmlFree(name);
				3085	return;
				3086	}
				3087	} else if (nbatts + 4 > maxatts) {
				3088	maxatts *= 2;
				3089	atts = (const xmlChar *) xmlRealloc((void ) atts,
				3090	maxatts * sizeof(xmlChar *));
				3091	if (atts == NULL) {
				3092	xmlGenericError(xmlGenericErrorContext,
				3093	"realloc of %ld byte failed\n",
				3094	maxatts * (long)sizeof(xmlChar *));
				3095	if (name != NULL) xmlFree(name);
				3096	return;
				3097	}
				3098	}
				3099	atts[nbatts++] = attname;
				3100	atts[nbatts++] = attvalue;
				3101	atts[nbatts] = NULL;
				3102	atts[nbatts + 1] = NULL;
				3103	}
				3104	else {
				3105	/* Dump the bogus attribute string up to the next blank or
				3106	* the end of the tag. */
Daniel Veillard	561b7f8	2002-03-20 21:55:57 +0000	[diff] [blame]	3107	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				3108	&& ((CUR != '/') \|\| (NXT(1) != '>')))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3109	NEXT;
				3110	}
				3111
				3112	failed:
				3113	SKIP_BLANKS;
				3114	if (cons == ctxt->nbChars) {
				3115	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3116	ctxt->sax->error(ctxt->userData,
				3117	"htmlParseStartTag: problem parsing attributes\n");
				3118	ctxt->wellFormed = 0;
				3119	break;
				3120	}
				3121	}
				3122
				3123	/*
				3124	* Handle specific association to the META tag
				3125	*/
				3126	if (meta)
				3127	htmlCheckMeta(ctxt, atts);
				3128
				3129	/*
				3130	* SAX: Start of Element !
				3131	*/
				3132	htmlnamePush(ctxt, xmlStrdup(name));
				3133	#ifdef DEBUG
				3134	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3135	#endif
				3136	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3137	ctxt->sax->startElement(ctxt->userData, name, atts);
				3138
				3139	if (atts != NULL) {
				3140	for (i = 0;i < nbatts;i++) {
				3141	if (atts[i] != NULL)
				3142	xmlFree((xmlChar *) atts[i]);
				3143	}
				3144	xmlFree((void *) atts);
				3145	}
				3146	if (name != NULL) xmlFree(name);
				3147	}
				3148
				3149	/**
				3150	* htmlParseEndTag:
				3151	* @ctxt: an HTML parser context
				3152	*
				3153	* parse an end of tag
				3154	*
				3155	* [42] ETag ::= '</' Name S? '>'
				3156	*
				3157	* With namespace
				3158	*
				3159	* [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3160	*
				3161	* Returns 1 if the current level should be closed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3162	*/
				3163
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3164	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3165	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3166	xmlChar *name;
				3167	xmlChar *oldname;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3168	int i, ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3169
				3170	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3171	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3172	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3173	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3174	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3175	}
				3176	SKIP(2);
				3177
				3178	name = htmlParseHTMLName(ctxt);
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3179	if (name == NULL) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3180
				3181	/*
				3182	* We should definitely be at the ending "S? '>'" part
				3183	*/
				3184	SKIP_BLANKS;
				3185	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3186	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3187	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3188	ctxt->wellFormed = 0;
				3189	} else
				3190	NEXT;
				3191
				3192	/*
				3193	* If the name read is not one of the element in the parsing stack
				3194	* then return, it's just an error.
				3195	*/
				3196	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3197	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3198	}
				3199	if (i < 0) {
				3200	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3201	ctxt->sax->error(ctxt->userData,
				3202	"Unexpected end tag : %s\n", name);
				3203	xmlFree(name);
				3204	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3205	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3206	}
				3207
				3208
				3209	/*
				3210	* Check for auto-closure of HTML elements.
				3211	*/
				3212
				3213	htmlAutoCloseOnClose(ctxt, name);
				3214
				3215	/*
				3216	* Well formedness constraints, opening and closing must match.
				3217	* With the exception that the autoclose may have popped stuff out
				3218	* of the stack.
				3219	*/
				3220	if (!xmlStrEqual(name, ctxt->name)) {
				3221	#ifdef DEBUG
				3222	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3223	#endif
				3224	if ((ctxt->name != NULL) &&
				3225	(!xmlStrEqual(ctxt->name, name))) {
				3226	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3227	ctxt->sax->error(ctxt->userData,
				3228	"Opening and ending tag mismatch: %s and %s\n",
				3229	name, ctxt->name);
				3230	ctxt->wellFormed = 0;
				3231	}
				3232	}
				3233
				3234	/*
				3235	* SAX: End of Tag
				3236	*/
				3237	oldname = ctxt->name;
				3238	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3239	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3240	ctxt->sax->endElement(ctxt->userData, name);
				3241	oldname = htmlnamePop(ctxt);
				3242	if (oldname != NULL) {
				3243	#ifdef DEBUG
				3244	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3245	#endif
				3246	xmlFree(oldname);
				3247	#ifdef DEBUG
				3248	} else {
				3249	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3250	#endif
				3251	}
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3252	ret = 1;
				3253	} else {
				3254	ret = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3255	}
				3256
				3257	if (name != NULL)
				3258	xmlFree(name);
				3259
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3260	return(ret);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3261	}
				3262
				3263
				3264	/**
				3265	* htmlParseReference:
				3266	* @ctxt: an HTML parser context
				3267	*
				3268	* parse and handle entity references in content,
				3269	* this will end-up in a call to character() since this is either a
				3270	* CharRef, or a predefined entity.
				3271	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3272	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3273	htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	3274	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3275	xmlChar out[6];
				3276	xmlChar *name;
				3277	if (CUR != '&') return;
				3278
				3279	if (NXT(1) == '#') {
				3280	unsigned int c;
				3281	int bits, i = 0;
				3282
				3283	c = htmlParseCharRef(ctxt);
				3284	if (c == 0)
				3285	return;
				3286
				3287	if (c < 0x80) { out[i++]= c; bits= -6; }
				3288	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3289	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3290	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3291
				3292	for ( ; bits >= 0; bits-= 6) {
				3293	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3294	}
				3295	out[i] = 0;
				3296
				3297	htmlCheckParagraph(ctxt);
				3298	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3299	ctxt->sax->characters(ctxt->userData, out, i);
				3300	} else {
				3301	ent = htmlParseEntityRef(ctxt, &name);
				3302	if (name == NULL) {
				3303	htmlCheckParagraph(ctxt);
				3304	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3305	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3306	return;
				3307	}
Daniel Veillard	e645e8c	2002-10-22 17:35:37 +0000	[diff] [blame]	3308	if ((ent == NULL) \|\| !(ent->value > 0)) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3309	htmlCheckParagraph(ctxt);
				3310	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3311	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3312	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3313	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3314	}
				3315	} else {
				3316	unsigned int c;
				3317	int bits, i = 0;
				3318
				3319	c = ent->value;
				3320	if (c < 0x80)
				3321	{ out[i++]= c; bits= -6; }
				3322	else if (c < 0x800)
				3323	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3324	else if (c < 0x10000)
				3325	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3326	else
				3327	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3328
				3329	for ( ; bits >= 0; bits-= 6) {
				3330	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3331	}
				3332	out[i] = 0;
				3333
				3334	htmlCheckParagraph(ctxt);
				3335	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3336	ctxt->sax->characters(ctxt->userData, out, i);
				3337	}
				3338	xmlFree(name);
				3339	}
				3340	}
				3341
				3342	/**
				3343	* htmlParseContent:
				3344	* @ctxt: an HTML parser context
				3345	* @name: the node name
				3346	*
				3347	* Parse a content: comment, sub-element, reference or text.
				3348	*
				3349	*/
				3350
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3351	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3352	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3353	xmlChar *currentNode;
				3354	int depth;
				3355
				3356	currentNode = xmlStrdup(ctxt->name);
				3357	depth = ctxt->nameNr;
				3358	while (1) {
				3359	long cons = ctxt->nbChars;
				3360
				3361	GROW;
				3362	/*
				3363	* Our tag or one of it's parent or children is ending.
				3364	*/
				3365	if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3366	if (htmlParseEndTag(ctxt) &&
				3367	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				3368	if (currentNode != NULL)
				3369	xmlFree(currentNode);
				3370	return;
				3371	}
				3372	continue; /* while */
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3373	}
				3374
				3375	/*
				3376	* Has this node been popped out during parsing of
				3377	* the next element
				3378	*/
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3379	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				3380	(!xmlStrEqual(currentNode, ctxt->name)))
				3381	{
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3382	if (currentNode != NULL) xmlFree(currentNode);
				3383	return;
				3384	}
				3385
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3386	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3387	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3388	/*
				3389	* Handle SCRIPT/STYLE separately
				3390	*/
				3391	htmlParseScript(ctxt);
				3392	} else {
				3393	/*
				3394	* Sometimes DOCTYPE arrives in the middle of the document
				3395	*/
				3396	if ((CUR == '<') && (NXT(1) == '!') &&
				3397	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3398	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3399	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3400	(UPP(8) == 'E')) {
				3401	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3402	ctxt->sax->error(ctxt->userData,
				3403	"Misplaced DOCTYPE declaration\n");
				3404	ctxt->wellFormed = 0;
				3405	htmlParseDocTypeDecl(ctxt);
				3406	}
				3407
				3408	/*
				3409	* First case : a comment
				3410	*/
				3411	if ((CUR == '<') && (NXT(1) == '!') &&
				3412	(NXT(2) == '-') && (NXT(3) == '-')) {
				3413	htmlParseComment(ctxt);
				3414	}
				3415
				3416	/*
				3417	* Second case : a sub-element.
				3418	*/
				3419	else if (CUR == '<') {
				3420	htmlParseElement(ctxt);
				3421	}
				3422
				3423	/*
				3424	* Third case : a reference. If if has not been resolved,
				3425	* parsing returns it's Name, create the node
				3426	*/
				3427	else if (CUR == '&') {
				3428	htmlParseReference(ctxt);
				3429	}
				3430
				3431	/*
				3432	* Fourth : end of the resource
				3433	*/
				3434	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3435	htmlAutoCloseOnEnd(ctxt);
				3436	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3437	}
				3438
				3439	/*
				3440	* Last case, text. Note that References are handled directly.
				3441	*/
				3442	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3443	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3444	}
				3445
				3446	if (cons == ctxt->nbChars) {
				3447	if (ctxt->node != NULL) {
				3448	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3449	ctxt->sax->error(ctxt->userData,
				3450	"detected an error in element content\n");
				3451	ctxt->wellFormed = 0;
				3452	}
				3453	break;
				3454	}
				3455	}
				3456	GROW;
				3457	}
				3458	if (currentNode != NULL) xmlFree(currentNode);
				3459	}
				3460
				3461	/**
				3462	* htmlParseElement:
				3463	* @ctxt: an HTML parser context
				3464	*
				3465	* parse an HTML element, this is highly recursive
				3466	*
				3467	* [39] element ::= EmptyElemTag \| STag content ETag
				3468	*
				3469	* [41] Attribute ::= Name Eq AttValue
				3470	*/
				3471
				3472	void
				3473	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3474	xmlChar *name;
				3475	xmlChar *currentNode = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	3476	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3477	htmlParserNodeInfo node_info;
				3478	xmlChar *oldname;
				3479	int depth = ctxt->nameNr;
Daniel Veillard	3fbe8e3	2001-10-06 13:30:33 +0000	[diff] [blame]	3480	const xmlChar *oldptr;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3481
				3482	/* Capture start position */
				3483	if (ctxt->record_info) {
				3484	node_info.begin_pos = ctxt->input->consumed +
				3485	(CUR_PTR - ctxt->input->base);
				3486	node_info.begin_line = ctxt->input->line;
				3487	}
				3488
				3489	oldname = xmlStrdup(ctxt->name);
				3490	htmlParseStartTag(ctxt);
				3491	name = ctxt->name;
				3492	#ifdef DEBUG
				3493	if (oldname == NULL)
				3494	xmlGenericError(xmlGenericErrorContext,
				3495	"Start of element %s\n", name);
				3496	else if (name == NULL)
				3497	xmlGenericError(xmlGenericErrorContext,
				3498	"Start of element failed, was %s\n", oldname);
				3499	else
				3500	xmlGenericError(xmlGenericErrorContext,
				3501	"Start of element %s, was %s\n", name, oldname);
				3502	#endif
				3503	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3504	(name == NULL)) {
				3505	if (CUR == '>')
				3506	NEXT;
				3507	if (oldname != NULL)
				3508	xmlFree(oldname);
				3509	return;
				3510	}
				3511	if (oldname != NULL)
				3512	xmlFree(oldname);
				3513
				3514	/*
				3515	* Lookup the info for that element.
				3516	*/
				3517	info = htmlTagLookup(name);
				3518	if (info == NULL) {
				3519	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3520	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3521	name);
				3522	ctxt->wellFormed = 0;
				3523	} else if (info->depr) {
				3524	/***************************
				3525	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3526	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3527	name);
				3528	***************************/
				3529	}
				3530
				3531	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	3532	* Check for an Empty Element labeled the XML/SGML way
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3533	*/
				3534	if ((CUR == '/') && (NXT(1) == '>')) {
				3535	SKIP(2);
				3536	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3537	ctxt->sax->endElement(ctxt->userData, name);
				3538	oldname = htmlnamePop(ctxt);
				3539	#ifdef DEBUG
				3540	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3541	#endif
				3542	if (oldname != NULL)
				3543	xmlFree(oldname);
				3544	return;
				3545	}
				3546
				3547	if (CUR == '>') {
				3548	NEXT;
				3549	} else {
				3550	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3551	ctxt->sax->error(ctxt->userData,
				3552	"Couldn't find end of Start Tag %s\n",
				3553	name);
				3554	ctxt->wellFormed = 0;
				3555
				3556	/*
				3557	* end of parsing of this node.
				3558	*/
				3559	if (xmlStrEqual(name, ctxt->name)) {
				3560	nodePop(ctxt);
				3561	oldname = htmlnamePop(ctxt);
				3562	#ifdef DEBUG
				3563	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3564	#endif
				3565	if (oldname != NULL)
				3566	xmlFree(oldname);
				3567	}
				3568
				3569	/*
				3570	* Capture end position and add node
				3571	*/
				3572	if ( currentNode != NULL && ctxt->record_info ) {
				3573	node_info.end_pos = ctxt->input->consumed +
				3574	(CUR_PTR - ctxt->input->base);
				3575	node_info.end_line = ctxt->input->line;
				3576	node_info.node = ctxt->node;
				3577	xmlParserAddNodeInfo(ctxt, &node_info);
				3578	}
				3579	return;
				3580	}
				3581
				3582	/*
				3583	* Check for an Empty Element from DTD definition
				3584	*/
				3585	if ((info != NULL) && (info->empty)) {
				3586	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3587	ctxt->sax->endElement(ctxt->userData, name);
				3588	oldname = htmlnamePop(ctxt);
				3589	#ifdef DEBUG
				3590	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3591	#endif
				3592	if (oldname != NULL)
				3593	xmlFree(oldname);
				3594	return;
				3595	}
				3596
				3597	/*
				3598	* Parse the content of the element:
				3599	*/
				3600	currentNode = xmlStrdup(ctxt->name);
				3601	depth = ctxt->nameNr;
				3602	while (IS_CHAR(CUR)) {
William M. Brack	d28e48a	2001-09-23 01:55:08 +0000	[diff] [blame]	3603	oldptr = ctxt->input->cur;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3604	htmlParseContent(ctxt);
William M. Brack	d28e48a	2001-09-23 01:55:08 +0000	[diff] [blame]	3605	if (oldptr==ctxt->input->cur) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3606	if (ctxt->nameNr < depth) break;
				3607	}
				3608
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3609	/*
				3610	* Capture end position and add node
				3611	*/
				3612	if ( currentNode != NULL && ctxt->record_info ) {
				3613	node_info.end_pos = ctxt->input->consumed +
				3614	(CUR_PTR - ctxt->input->base);
				3615	node_info.end_line = ctxt->input->line;
				3616	node_info.node = ctxt->node;
				3617	xmlParserAddNodeInfo(ctxt, &node_info);
				3618	}
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3619	if (!IS_CHAR(CUR)) {
				3620	htmlAutoCloseOnEnd(ctxt);
				3621	}
				3622
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3623	if (currentNode != NULL)
				3624	xmlFree(currentNode);
				3625	}
				3626
				3627	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame]	3628	* htmlParseDocument:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3629	* @ctxt: an HTML parser context
				3630	*
				3631	* parse an HTML document (and build a tree if using the standard SAX
				3632	* interface).
				3633	*
				3634	* Returns 0, -1 in case of error. the parser context is augmented
				3635	* as a result of the parsing.
				3636	*/
				3637
Daniel Veillard	1b31e4a	2002-05-27 14:44:50 +0000	[diff] [blame]	3638	int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3639	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3640	xmlDtdPtr dtd;
				3641
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	3642	xmlInitParser();
				3643
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3644	htmlDefaultSAXHandlerInit();
				3645	ctxt->html = 1;
				3646
				3647	GROW;
				3648	/*
				3649	* SAX: beginning of the document processing.
				3650	*/
				3651	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3652	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3653
				3654	/*
				3655	* Wipe out everything which is before the first '<'
				3656	*/
				3657	SKIP_BLANKS;
				3658	if (CUR == 0) {
				3659	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3660	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3661	ctxt->wellFormed = 0;
				3662	}
				3663
				3664	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3665	ctxt->sax->startDocument(ctxt->userData);
				3666
				3667
				3668	/*
				3669	* Parse possible comments before any content
				3670	*/
				3671	while ((CUR == '<') && (NXT(1) == '!') &&
				3672	(NXT(2) == '-') && (NXT(3) == '-')) {
				3673	htmlParseComment(ctxt);
				3674	SKIP_BLANKS;
				3675	}
				3676
				3677
				3678	/*
				3679	* Then possibly doc type declaration(s) and more Misc
				3680	* (doctypedecl Misc*)?
				3681	*/
				3682	if ((CUR == '<') && (NXT(1) == '!') &&
				3683	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3684	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3685	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3686	(UPP(8) == 'E')) {
				3687	htmlParseDocTypeDecl(ctxt);
				3688	}
				3689	SKIP_BLANKS;
				3690
				3691	/*
				3692	* Parse possible comments before any content
				3693	*/
				3694	while ((CUR == '<') && (NXT(1) == '!') &&
				3695	(NXT(2) == '-') && (NXT(3) == '-')) {
				3696	htmlParseComment(ctxt);
				3697	SKIP_BLANKS;
				3698	}
				3699
				3700	/*
				3701	* Time to start parsing the tree itself
				3702	*/
				3703	htmlParseContent(ctxt);
				3704
				3705	/*
				3706	* autoclose
				3707	*/
				3708	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3709	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3710
				3711
				3712	/*
				3713	* SAX: end of the document processing.
				3714	*/
				3715	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3716	ctxt->sax->endDocument(ctxt->userData);
				3717
				3718	if (ctxt->myDoc != NULL) {
				3719	dtd = xmlGetIntSubset(ctxt->myDoc);
				3720	if (dtd == NULL)
				3721	ctxt->myDoc->intSubset =
				3722	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3723	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3724	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3725	}
				3726	if (! ctxt->wellFormed) return(-1);
				3727	return(0);
				3728	}
				3729
				3730
				3731	/************************************************************************
				3732	* *
				3733	* Parser contexts handling *
				3734	* *
				3735	************************************************************************/
				3736
				3737	/**
				3738	* xmlInitParserCtxt:
				3739	* @ctxt: an HTML parser context
				3740	*
				3741	* Initialize a parser context
				3742	*/
				3743
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3744	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3745	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3746	{
				3747	htmlSAXHandler *sax;
				3748
				3749	if (ctxt == NULL) return;
				3750	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3751
				3752	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3753	if (sax == NULL) {
				3754	xmlGenericError(xmlGenericErrorContext,
				3755	"htmlInitParserCtxt: out of memory\n");
				3756	}
				3757	else
				3758	memset(sax, 0, sizeof(htmlSAXHandler));
				3759
				3760	/* Allocate the Input stack */
				3761	ctxt->inputTab = (htmlParserInputPtr *)
				3762	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3763	if (ctxt->inputTab == NULL) {
				3764	xmlGenericError(xmlGenericErrorContext,
				3765	"htmlInitParserCtxt: out of memory\n");
				3766	ctxt->inputNr = 0;
				3767	ctxt->inputMax = 0;
				3768	ctxt->input = NULL;
				3769	return;
				3770	}
				3771	ctxt->inputNr = 0;
				3772	ctxt->inputMax = 5;
				3773	ctxt->input = NULL;
				3774	ctxt->version = NULL;
				3775	ctxt->encoding = NULL;
				3776	ctxt->standalone = -1;
				3777	ctxt->instate = XML_PARSER_START;
				3778
				3779	/* Allocate the Node stack */
				3780	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3781	if (ctxt->nodeTab == NULL) {
				3782	xmlGenericError(xmlGenericErrorContext,
				3783	"htmlInitParserCtxt: out of memory\n");
				3784	ctxt->nodeNr = 0;
				3785	ctxt->nodeMax = 0;
				3786	ctxt->node = NULL;
				3787	ctxt->inputNr = 0;
				3788	ctxt->inputMax = 0;
				3789	ctxt->input = NULL;
				3790	return;
				3791	}
				3792	ctxt->nodeNr = 0;
				3793	ctxt->nodeMax = 10;
				3794	ctxt->node = NULL;
				3795
				3796	/* Allocate the Name stack */
				3797	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3798	if (ctxt->nameTab == NULL) {
				3799	xmlGenericError(xmlGenericErrorContext,
				3800	"htmlInitParserCtxt: out of memory\n");
				3801	ctxt->nameNr = 0;
				3802	ctxt->nameMax = 10;
				3803	ctxt->name = NULL;
				3804	ctxt->nodeNr = 0;
				3805	ctxt->nodeMax = 0;
				3806	ctxt->node = NULL;
				3807	ctxt->inputNr = 0;
				3808	ctxt->inputMax = 0;
				3809	ctxt->input = NULL;
				3810	return;
				3811	}
				3812	ctxt->nameNr = 0;
				3813	ctxt->nameMax = 10;
				3814	ctxt->name = NULL;
				3815
				3816	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3817	else {
				3818	ctxt->sax = sax;
				3819	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3820	}
				3821	ctxt->userData = ctxt;
				3822	ctxt->myDoc = NULL;
				3823	ctxt->wellFormed = 1;
				3824	ctxt->replaceEntities = 0;
Daniel Veillard	635ef72	2001-10-29 11:48:19 +0000	[diff] [blame]	3825	ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3826	ctxt->html = 1;
				3827	ctxt->record_info = 0;
				3828	ctxt->validate = 0;
				3829	ctxt->nbChars = 0;
				3830	ctxt->checkIndex = 0;
Daniel Veillard	dc2cee2	2001-08-22 16:30:37 +0000	[diff] [blame]	3831	ctxt->catalogs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3832	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3833	}
				3834
				3835	/**
				3836	* htmlFreeParserCtxt:
				3837	* @ctxt: an HTML parser context
				3838	*
				3839	* Free all the memory used by a parser context. However the parsed
				3840	* document in ctxt->myDoc is not freed.
				3841	*/
				3842
				3843	void
				3844	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3845	{
				3846	xmlFreeParserCtxt(ctxt);
				3847	}
				3848
				3849	/**
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame]	3850	* htmlNewParserCtxt:
				3851	*
				3852	* Allocate and initialize a new parser context.
				3853	*
				3854	* Returns the xmlParserCtxtPtr or NULL
				3855	*/
				3856
				3857	static htmlParserCtxtPtr
				3858	htmlNewParserCtxt(void)
				3859	{
				3860	xmlParserCtxtPtr ctxt;
				3861
				3862	ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
				3863	if (ctxt == NULL) {
				3864	xmlGenericError(xmlGenericErrorContext,
				3865	"xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame]	3866	return(NULL);
				3867	}
				3868	memset(ctxt, 0, sizeof(xmlParserCtxt));
				3869	htmlInitParserCtxt(ctxt);
				3870	return(ctxt);
				3871	}
				3872
				3873	/**
				3874	* htmlCreateMemoryParserCtxt:
				3875	* @buffer: a pointer to a char array
				3876	* @size: the size of the array
				3877	*
				3878	* Create a parser context for an HTML in-memory document.
				3879	*
				3880	* Returns the new parser context or NULL
				3881	*/
				3882	static htmlParserCtxtPtr
				3883	htmlCreateMemoryParserCtxt(const char *buffer, int size) {
				3884	xmlParserCtxtPtr ctxt;
				3885	xmlParserInputPtr input;
				3886	xmlParserInputBufferPtr buf;
				3887
				3888	if (buffer == NULL)
				3889	return(NULL);
				3890	if (size <= 0)
				3891	return(NULL);
				3892
				3893	ctxt = htmlNewParserCtxt();
				3894	if (ctxt == NULL)
				3895	return(NULL);
				3896
				3897	buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
				3898	if (buf == NULL) return(NULL);
				3899
				3900	input = xmlNewInputStream(ctxt);
				3901	if (input == NULL) {
				3902	xmlFreeParserCtxt(ctxt);
				3903	return(NULL);
				3904	}
				3905
				3906	input->filename = NULL;
				3907	input->buf = buf;
				3908	input->base = input->buf->buffer->content;
				3909	input->cur = input->buf->buffer->content;
				3910	input->end = &input->buf->buffer->content[input->buf->buffer->use];
				3911
				3912	inputPush(ctxt, input);
				3913	return(ctxt);
				3914	}
				3915
				3916	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame]	3917	* htmlCreateDocParserCtxt:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3918	* @cur: a pointer to an array of xmlChar
				3919	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3920	*
				3921	* Create a parser context for an HTML document.
				3922	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3923	* TODO: check the need to add encoding handling there
				3924	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3925	* Returns the new parser context or NULL
				3926	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3927	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3928	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame]	3929	int len;
Daniel Veillard	e5b110b	2003-02-04 14:43:39 +0000	[diff] [blame^]	3930	htmlParserCtxtPtr ctxt;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3931
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame]	3932	if (cur == NULL)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3933	return(NULL);
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame]	3934	len = xmlStrlen(cur);
Daniel Veillard	e5b110b	2003-02-04 14:43:39 +0000	[diff] [blame^]	3935	ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
				3936
				3937	if (encoding != NULL) {
				3938	xmlCharEncoding enc;
				3939	xmlCharEncodingHandlerPtr handler;
				3940
				3941	if (ctxt->input->encoding != NULL)
				3942	xmlFree((xmlChar *) ctxt->input->encoding);
				3943	ctxt->input->encoding = (const xmlChar *) encoding;
				3944
				3945	enc = xmlParseCharEncoding(encoding);
				3946	/*
				3947	* registered set of known encodings
				3948	*/
				3949	if (enc != XML_CHAR_ENCODING_ERROR) {
				3950	xmlSwitchEncoding(ctxt, enc);
				3951	if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
				3952	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3953	ctxt->sax->error(ctxt->userData,
				3954	"Unsupported encoding %s\n", encoding);
				3955	ctxt->input->encoding = NULL;
				3956	}
				3957	} else {
				3958	/*
				3959	* fallback for unknown encodings
				3960	*/
				3961	handler = xmlFindCharEncodingHandler((const char *) encoding);
				3962	if (handler != NULL) {
				3963	xmlSwitchToEncoding(ctxt, handler);
				3964	} else {
				3965	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				3966	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3967	ctxt->sax->error(ctxt->userData,
				3968	"Unsupported encoding %s\n", encoding);
				3969	}
				3970	}
				3971	}
				3972	return(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3973	}
				3974
				3975	/************************************************************************
				3976	* *
				3977	* Progressive parsing interfaces *
				3978	* *
				3979	************************************************************************/
				3980
				3981	/**
				3982	* htmlParseLookupSequence:
				3983	* @ctxt: an HTML parser context
				3984	* @first: the first char to lookup
				3985	* @next: the next char to lookup or zero
				3986	* @third: the next char to lookup or zero
				3987	*
				3988	* Try to find if a sequence (first, next, third) or just (first next) or
				3989	* (first) is available in the input stream.
				3990	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3991	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3992	* parser, do not use liberally.
				3993	* This is basically similar to xmlParseLookupSequence()
				3994	*
				3995	* Returns the index to the current parsing point if the full sequence
				3996	* is available, -1 otherwise.
				3997	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3998	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3999	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				4000	xmlChar next, xmlChar third) {
				4001	int base, len;
				4002	htmlParserInputPtr in;
				4003	const xmlChar *buf;
Daniel Veillard	c1f7834	2001-11-10 11:43:05 +0000	[diff] [blame]	4004	int incomment = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4005
				4006	in = ctxt->input;
				4007	if (in == NULL) return(-1);
				4008	base = in->cur - in->base;
				4009	if (base < 0) return(-1);
				4010	if (ctxt->checkIndex > base)
				4011	base = ctxt->checkIndex;
				4012	if (in->buf == NULL) {
				4013	buf = in->base;
				4014	len = in->length;
				4015	} else {
				4016	buf = in->buf->buffer->content;
				4017	len = in->buf->buffer->use;
				4018	}
				4019	/* take into account the sequence length */
				4020	if (third) len -= 2;
				4021	else if (next) len --;
				4022	for (;base < len;base++) {
Daniel Veillard	c1f7834	2001-11-10 11:43:05 +0000	[diff] [blame]	4023	if (!incomment && (base + 4 < len)) {
				4024	if ((buf[base] == '<') && (buf[base + 1] == '!') &&
				4025	(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
				4026	incomment = 1;
				4027	}
				4028	/* do not increment base, some people use <!--> */
				4029	}
				4030	if (incomment) {
				4031	if (base + 3 < len)
				4032	return(-1);
				4033	if ((buf[base] == '-') && (buf[base + 1] == '-') &&
				4034	(buf[base + 2] == '>')) {
				4035	incomment = 0;
				4036	base += 2;
				4037	}
				4038	continue;
				4039	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4040	if (buf[base] == first) {
				4041	if (third != 0) {
				4042	if ((buf[base + 1] != next) \|\|
				4043	(buf[base + 2] != third)) continue;
				4044	} else if (next != 0) {
				4045	if (buf[base + 1] != next) continue;
				4046	}
				4047	ctxt->checkIndex = 0;
				4048	#ifdef DEBUG_PUSH
				4049	if (next == 0)
				4050	xmlGenericError(xmlGenericErrorContext,
				4051	"HPP: lookup '%c' found at %d\n",
				4052	first, base);
				4053	else if (third == 0)
				4054	xmlGenericError(xmlGenericErrorContext,
				4055	"HPP: lookup '%c%c' found at %d\n",
				4056	first, next, base);
				4057	else
				4058	xmlGenericError(xmlGenericErrorContext,
				4059	"HPP: lookup '%c%c%c' found at %d\n",
				4060	first, next, third, base);
				4061	#endif
				4062	return(base - (in->cur - in->base));
				4063	}
				4064	}
				4065	ctxt->checkIndex = base;
				4066	#ifdef DEBUG_PUSH
				4067	if (next == 0)
				4068	xmlGenericError(xmlGenericErrorContext,
				4069	"HPP: lookup '%c' failed\n", first);
				4070	else if (third == 0)
				4071	xmlGenericError(xmlGenericErrorContext,
				4072	"HPP: lookup '%c%c' failed\n", first, next);
				4073	else
				4074	xmlGenericError(xmlGenericErrorContext,
				4075	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				4076	#endif
				4077	return(-1);
				4078	}
				4079
				4080	/**
				4081	* htmlParseTryOrFinish:
				4082	* @ctxt: an HTML parser context
				4083	* @terminate: last chunk indicator
				4084	*
				4085	* Try to progress on parsing
				4086	*
				4087	* Returns zero if no parsing was possible
				4088	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4089	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4090	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				4091	int ret = 0;
				4092	htmlParserInputPtr in;
				4093	int avail = 0;
				4094	xmlChar cur, next;
				4095
				4096	#ifdef DEBUG_PUSH
				4097	switch (ctxt->instate) {
				4098	case XML_PARSER_EOF:
				4099	xmlGenericError(xmlGenericErrorContext,
				4100	"HPP: try EOF\n"); break;
				4101	case XML_PARSER_START:
				4102	xmlGenericError(xmlGenericErrorContext,
				4103	"HPP: try START\n"); break;
				4104	case XML_PARSER_MISC:
				4105	xmlGenericError(xmlGenericErrorContext,
				4106	"HPP: try MISC\n");break;
				4107	case XML_PARSER_COMMENT:
				4108	xmlGenericError(xmlGenericErrorContext,
				4109	"HPP: try COMMENT\n");break;
				4110	case XML_PARSER_PROLOG:
				4111	xmlGenericError(xmlGenericErrorContext,
				4112	"HPP: try PROLOG\n");break;
				4113	case XML_PARSER_START_TAG:
				4114	xmlGenericError(xmlGenericErrorContext,
				4115	"HPP: try START_TAG\n");break;
				4116	case XML_PARSER_CONTENT:
				4117	xmlGenericError(xmlGenericErrorContext,
				4118	"HPP: try CONTENT\n");break;
				4119	case XML_PARSER_CDATA_SECTION:
				4120	xmlGenericError(xmlGenericErrorContext,
				4121	"HPP: try CDATA_SECTION\n");break;
				4122	case XML_PARSER_END_TAG:
				4123	xmlGenericError(xmlGenericErrorContext,
				4124	"HPP: try END_TAG\n");break;
				4125	case XML_PARSER_ENTITY_DECL:
				4126	xmlGenericError(xmlGenericErrorContext,
				4127	"HPP: try ENTITY_DECL\n");break;
				4128	case XML_PARSER_ENTITY_VALUE:
				4129	xmlGenericError(xmlGenericErrorContext,
				4130	"HPP: try ENTITY_VALUE\n");break;
				4131	case XML_PARSER_ATTRIBUTE_VALUE:
				4132	xmlGenericError(xmlGenericErrorContext,
				4133	"HPP: try ATTRIBUTE_VALUE\n");break;
				4134	case XML_PARSER_DTD:
				4135	xmlGenericError(xmlGenericErrorContext,
				4136	"HPP: try DTD\n");break;
				4137	case XML_PARSER_EPILOG:
				4138	xmlGenericError(xmlGenericErrorContext,
				4139	"HPP: try EPILOG\n");break;
				4140	case XML_PARSER_PI:
				4141	xmlGenericError(xmlGenericErrorContext,
				4142	"HPP: try PI\n");break;
				4143	case XML_PARSER_SYSTEM_LITERAL:
				4144	xmlGenericError(xmlGenericErrorContext,
				4145	"HPP: try SYSTEM_LITERAL\n");break;
				4146	}
				4147	#endif
				4148
				4149	while (1) {
				4150
				4151	in = ctxt->input;
				4152	if (in == NULL) break;
				4153	if (in->buf == NULL)
				4154	avail = in->length - (in->cur - in->base);
				4155	else
				4156	avail = in->buf->buffer->use - (in->cur - in->base);
				4157	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4158	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4159	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4160	/*
				4161	* SAX: end of the document processing.
				4162	*/
				4163	ctxt->instate = XML_PARSER_EOF;
				4164	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4165	ctxt->sax->endDocument(ctxt->userData);
				4166	}
				4167	}
				4168	if (avail < 1)
				4169	goto done;
				4170	switch (ctxt->instate) {
				4171	case XML_PARSER_EOF:
				4172	/*
				4173	* Document parsing is done !
				4174	*/
				4175	goto done;
				4176	case XML_PARSER_START:
				4177	/*
				4178	* Very first chars read from the document flow.
				4179	*/
				4180	cur = in->cur[0];
				4181	if (IS_BLANK(cur)) {
				4182	SKIP_BLANKS;
				4183	if (in->buf == NULL)
				4184	avail = in->length - (in->cur - in->base);
				4185	else
				4186	avail = in->buf->buffer->use - (in->cur - in->base);
				4187	}
				4188	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				4189	ctxt->sax->setDocumentLocator(ctxt->userData,
				4190	&xmlDefaultSAXLocator);
				4191	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				4192	(!ctxt->disableSAX))
				4193	ctxt->sax->startDocument(ctxt->userData);
				4194
				4195	cur = in->cur[0];
				4196	next = in->cur[1];
				4197	if ((cur == '<') && (next == '!') &&
				4198	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4199	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4200	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4201	(UPP(8) == 'E')) {
				4202	if ((!terminate) &&
				4203	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4204	goto done;
				4205	#ifdef DEBUG_PUSH
				4206	xmlGenericError(xmlGenericErrorContext,
				4207	"HPP: Parsing internal subset\n");
				4208	#endif
				4209	htmlParseDocTypeDecl(ctxt);
				4210	ctxt->instate = XML_PARSER_PROLOG;
				4211	#ifdef DEBUG_PUSH
				4212	xmlGenericError(xmlGenericErrorContext,
				4213	"HPP: entering PROLOG\n");
				4214	#endif
				4215	} else {
				4216	ctxt->instate = XML_PARSER_MISC;
				4217	}
				4218	#ifdef DEBUG_PUSH
				4219	xmlGenericError(xmlGenericErrorContext,
				4220	"HPP: entering MISC\n");
				4221	#endif
				4222	break;
				4223	case XML_PARSER_MISC:
				4224	SKIP_BLANKS;
				4225	if (in->buf == NULL)
				4226	avail = in->length - (in->cur - in->base);
				4227	else
				4228	avail = in->buf->buffer->use - (in->cur - in->base);
				4229	if (avail < 2)
				4230	goto done;
				4231	cur = in->cur[0];
				4232	next = in->cur[1];
				4233	if ((cur == '<') && (next == '!') &&
				4234	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4235	if ((!terminate) &&
				4236	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4237	goto done;
				4238	#ifdef DEBUG_PUSH
				4239	xmlGenericError(xmlGenericErrorContext,
				4240	"HPP: Parsing Comment\n");
				4241	#endif
				4242	htmlParseComment(ctxt);
				4243	ctxt->instate = XML_PARSER_MISC;
				4244	} else if ((cur == '<') && (next == '!') &&
				4245	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4246	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4247	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4248	(UPP(8) == 'E')) {
				4249	if ((!terminate) &&
				4250	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4251	goto done;
				4252	#ifdef DEBUG_PUSH
				4253	xmlGenericError(xmlGenericErrorContext,
				4254	"HPP: Parsing internal subset\n");
				4255	#endif
				4256	htmlParseDocTypeDecl(ctxt);
				4257	ctxt->instate = XML_PARSER_PROLOG;
				4258	#ifdef DEBUG_PUSH
				4259	xmlGenericError(xmlGenericErrorContext,
				4260	"HPP: entering PROLOG\n");
				4261	#endif
				4262	} else if ((cur == '<') && (next == '!') &&
				4263	(avail < 9)) {
				4264	goto done;
				4265	} else {
				4266	ctxt->instate = XML_PARSER_START_TAG;
				4267	#ifdef DEBUG_PUSH
				4268	xmlGenericError(xmlGenericErrorContext,
				4269	"HPP: entering START_TAG\n");
				4270	#endif
				4271	}
				4272	break;
				4273	case XML_PARSER_PROLOG:
				4274	SKIP_BLANKS;
				4275	if (in->buf == NULL)
				4276	avail = in->length - (in->cur - in->base);
				4277	else
				4278	avail = in->buf->buffer->use - (in->cur - in->base);
				4279	if (avail < 2)
				4280	goto done;
				4281	cur = in->cur[0];
				4282	next = in->cur[1];
				4283	if ((cur == '<') && (next == '!') &&
				4284	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4285	if ((!terminate) &&
				4286	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4287	goto done;
				4288	#ifdef DEBUG_PUSH
				4289	xmlGenericError(xmlGenericErrorContext,
				4290	"HPP: Parsing Comment\n");
				4291	#endif
				4292	htmlParseComment(ctxt);
				4293	ctxt->instate = XML_PARSER_PROLOG;
				4294	} else if ((cur == '<') && (next == '!') &&
				4295	(avail < 4)) {
				4296	goto done;
				4297	} else {
				4298	ctxt->instate = XML_PARSER_START_TAG;
				4299	#ifdef DEBUG_PUSH
				4300	xmlGenericError(xmlGenericErrorContext,
				4301	"HPP: entering START_TAG\n");
				4302	#endif
				4303	}
				4304	break;
				4305	case XML_PARSER_EPILOG:
				4306	if (in->buf == NULL)
				4307	avail = in->length - (in->cur - in->base);
				4308	else
				4309	avail = in->buf->buffer->use - (in->cur - in->base);
				4310	if (avail < 1)
				4311	goto done;
				4312	cur = in->cur[0];
				4313	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4314	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4315	goto done;
				4316	}
				4317	if (avail < 2)
				4318	goto done;
				4319	next = in->cur[1];
				4320	if ((cur == '<') && (next == '!') &&
				4321	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4322	if ((!terminate) &&
				4323	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4324	goto done;
				4325	#ifdef DEBUG_PUSH
				4326	xmlGenericError(xmlGenericErrorContext,
				4327	"HPP: Parsing Comment\n");
				4328	#endif
				4329	htmlParseComment(ctxt);
				4330	ctxt->instate = XML_PARSER_EPILOG;
				4331	} else if ((cur == '<') && (next == '!') &&
				4332	(avail < 4)) {
				4333	goto done;
				4334	} else {
				4335	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4336	ctxt->wellFormed = 0;
				4337	ctxt->instate = XML_PARSER_EOF;
				4338	#ifdef DEBUG_PUSH
				4339	xmlGenericError(xmlGenericErrorContext,
				4340	"HPP: entering EOF\n");
				4341	#endif
				4342	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4343	ctxt->sax->endDocument(ctxt->userData);
				4344	goto done;
				4345	}
				4346	break;
				4347	case XML_PARSER_START_TAG: {
				4348	xmlChar name, oldname;
				4349	int depth = ctxt->nameNr;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	4350	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4351
				4352	if (avail < 2)
				4353	goto done;
				4354	cur = in->cur[0];
				4355	if (cur != '<') {
				4356	ctxt->instate = XML_PARSER_CONTENT;
				4357	#ifdef DEBUG_PUSH
				4358	xmlGenericError(xmlGenericErrorContext,
				4359	"HPP: entering CONTENT\n");
				4360	#endif
				4361	break;
				4362	}
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	4363	if (in->cur[1] == '/') {
				4364	ctxt->instate = XML_PARSER_END_TAG;
				4365	ctxt->checkIndex = 0;
				4366	#ifdef DEBUG_PUSH
				4367	xmlGenericError(xmlGenericErrorContext,
				4368	"HPP: entering END_TAG\n");
				4369	#endif
				4370	break;
				4371	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4372	if ((!terminate) &&
				4373	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4374	goto done;
				4375
				4376	oldname = xmlStrdup(ctxt->name);
				4377	htmlParseStartTag(ctxt);
				4378	name = ctxt->name;
				4379	#ifdef DEBUG
				4380	if (oldname == NULL)
				4381	xmlGenericError(xmlGenericErrorContext,
				4382	"Start of element %s\n", name);
				4383	else if (name == NULL)
				4384	xmlGenericError(xmlGenericErrorContext,
				4385	"Start of element failed, was %s\n",
				4386	oldname);
				4387	else
				4388	xmlGenericError(xmlGenericErrorContext,
				4389	"Start of element %s, was %s\n",
				4390	name, oldname);
				4391	#endif
				4392	if (((depth == ctxt->nameNr) &&
				4393	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4394	(name == NULL)) {
				4395	if (CUR == '>')
				4396	NEXT;
				4397	if (oldname != NULL)
				4398	xmlFree(oldname);
				4399	break;
				4400	}
				4401	if (oldname != NULL)
				4402	xmlFree(oldname);
				4403
				4404	/*
				4405	* Lookup the info for that element.
				4406	*/
				4407	info = htmlTagLookup(name);
				4408	if (info == NULL) {
				4409	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4410	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4411	name);
				4412	ctxt->wellFormed = 0;
				4413	} else if (info->depr) {
				4414	/***************************
				4415	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4416	ctxt->sax->warning(ctxt->userData,
				4417	"Tag %s is deprecated\n",
				4418	name);
				4419	***************************/
				4420	}
				4421
				4422	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	4423	* Check for an Empty Element labeled the XML/SGML way
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4424	*/
				4425	if ((CUR == '/') && (NXT(1) == '>')) {
				4426	SKIP(2);
				4427	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4428	ctxt->sax->endElement(ctxt->userData, name);
				4429	oldname = htmlnamePop(ctxt);
				4430	#ifdef DEBUG
				4431	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4432	oldname);
				4433	#endif
				4434	if (oldname != NULL)
				4435	xmlFree(oldname);
				4436	ctxt->instate = XML_PARSER_CONTENT;
				4437	#ifdef DEBUG_PUSH
				4438	xmlGenericError(xmlGenericErrorContext,
				4439	"HPP: entering CONTENT\n");
				4440	#endif
				4441	break;
				4442	}
				4443
				4444	if (CUR == '>') {
				4445	NEXT;
				4446	} else {
				4447	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4448	ctxt->sax->error(ctxt->userData,
				4449	"Couldn't find end of Start Tag %s\n",
				4450	name);
				4451	ctxt->wellFormed = 0;
				4452
				4453	/*
				4454	* end of parsing of this node.
				4455	*/
				4456	if (xmlStrEqual(name, ctxt->name)) {
				4457	nodePop(ctxt);
				4458	oldname = htmlnamePop(ctxt);
				4459	#ifdef DEBUG
				4460	xmlGenericError(xmlGenericErrorContext,
				4461	"End of start tag problem: popping out %s\n", oldname);
				4462	#endif
				4463	if (oldname != NULL)
				4464	xmlFree(oldname);
				4465	}
				4466
				4467	ctxt->instate = XML_PARSER_CONTENT;
				4468	#ifdef DEBUG_PUSH
				4469	xmlGenericError(xmlGenericErrorContext,
				4470	"HPP: entering CONTENT\n");
				4471	#endif
				4472	break;
				4473	}
				4474
				4475	/*
				4476	* Check for an Empty Element from DTD definition
				4477	*/
				4478	if ((info != NULL) && (info->empty)) {
				4479	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4480	ctxt->sax->endElement(ctxt->userData, name);
				4481	oldname = htmlnamePop(ctxt);
				4482	#ifdef DEBUG
				4483	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4484	#endif
				4485	if (oldname != NULL)
				4486	xmlFree(oldname);
				4487	}
				4488	ctxt->instate = XML_PARSER_CONTENT;
				4489	#ifdef DEBUG_PUSH
				4490	xmlGenericError(xmlGenericErrorContext,
				4491	"HPP: entering CONTENT\n");
				4492	#endif
				4493	break;
				4494	}
				4495	case XML_PARSER_CONTENT: {
				4496	long cons;
				4497	/*
				4498	* Handle preparsed entities and charRef
				4499	*/
				4500	if (ctxt->token != 0) {
				4501	xmlChar chr[2] = { 0 , 0 } ;
				4502
				4503	chr[0] = (xmlChar) ctxt->token;
				4504	htmlCheckParagraph(ctxt);
				4505	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4506	ctxt->sax->characters(ctxt->userData, chr, 1);
				4507	ctxt->token = 0;
				4508	ctxt->checkIndex = 0;
				4509	}
				4510	if ((avail == 1) && (terminate)) {
				4511	cur = in->cur[0];
				4512	if ((cur != '<') && (cur != '&')) {
				4513	if (ctxt->sax != NULL) {
				4514	if (IS_BLANK(cur)) {
				4515	if (ctxt->sax->ignorableWhitespace != NULL)
				4516	ctxt->sax->ignorableWhitespace(
				4517	ctxt->userData, &cur, 1);
				4518	} else {
				4519	htmlCheckParagraph(ctxt);
				4520	if (ctxt->sax->characters != NULL)
				4521	ctxt->sax->characters(
				4522	ctxt->userData, &cur, 1);
				4523	}
				4524	}
				4525	ctxt->token = 0;
				4526	ctxt->checkIndex = 0;
Daniel Veillard	bc6e1a3	2002-11-18 15:07:25 +0000	[diff] [blame]	4527	in->cur++;
William M. Brack	1633d18	2001-10-05 15:41:19 +0000	[diff] [blame]	4528	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4529	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4530	}
				4531	if (avail < 2)
				4532	goto done;
				4533	cur = in->cur[0];
				4534	next = in->cur[1];
				4535	cons = ctxt->nbChars;
				4536	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4537	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4538	/*
				4539	* Handle SCRIPT/STYLE separately
				4540	*/
				4541	if ((!terminate) &&
				4542	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4543	goto done;
				4544	htmlParseScript(ctxt);
				4545	if ((cur == '<') && (next == '/')) {
				4546	ctxt->instate = XML_PARSER_END_TAG;
				4547	ctxt->checkIndex = 0;
				4548	#ifdef DEBUG_PUSH
				4549	xmlGenericError(xmlGenericErrorContext,
				4550	"HPP: entering END_TAG\n");
				4551	#endif
				4552	break;
				4553	}
				4554	} else {
				4555	/*
				4556	* Sometimes DOCTYPE arrives in the middle of the document
				4557	*/
				4558	if ((cur == '<') && (next == '!') &&
				4559	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4560	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4561	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4562	(UPP(8) == 'E')) {
				4563	if ((!terminate) &&
				4564	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4565	goto done;
				4566	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4567	ctxt->sax->error(ctxt->userData,
				4568	"Misplaced DOCTYPE declaration\n");
				4569	ctxt->wellFormed = 0;
				4570	htmlParseDocTypeDecl(ctxt);
				4571	} else if ((cur == '<') && (next == '!') &&
				4572	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4573	if ((!terminate) &&
				4574	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4575	goto done;
				4576	#ifdef DEBUG_PUSH
				4577	xmlGenericError(xmlGenericErrorContext,
				4578	"HPP: Parsing Comment\n");
				4579	#endif
				4580	htmlParseComment(ctxt);
				4581	ctxt->instate = XML_PARSER_CONTENT;
				4582	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4583	goto done;
				4584	} else if ((cur == '<') && (next == '/')) {
				4585	ctxt->instate = XML_PARSER_END_TAG;
				4586	ctxt->checkIndex = 0;
				4587	#ifdef DEBUG_PUSH
				4588	xmlGenericError(xmlGenericErrorContext,
				4589	"HPP: entering END_TAG\n");
				4590	#endif
				4591	break;
				4592	} else if (cur == '<') {
				4593	ctxt->instate = XML_PARSER_START_TAG;
				4594	ctxt->checkIndex = 0;
				4595	#ifdef DEBUG_PUSH
				4596	xmlGenericError(xmlGenericErrorContext,
				4597	"HPP: entering START_TAG\n");
				4598	#endif
				4599	break;
				4600	} else if (cur == '&') {
				4601	if ((!terminate) &&
				4602	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4603	goto done;
				4604	#ifdef DEBUG_PUSH
				4605	xmlGenericError(xmlGenericErrorContext,
				4606	"HPP: Parsing Reference\n");
				4607	#endif
				4608	/* TODO: check generation of subtrees if noent !!! */
				4609	htmlParseReference(ctxt);
				4610	} else {
				4611	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4612	/*
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame]	4613	* Goal of the following test is:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4614	* - minimize calls to the SAX 'character' callback
				4615	* when they are mergeable
				4616	*/
				4617	if ((ctxt->inputNr == 1) &&
				4618	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4619	if ((!terminate) &&
				4620	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4621	goto done;
				4622	}
				4623	ctxt->checkIndex = 0;
				4624	#ifdef DEBUG_PUSH
				4625	xmlGenericError(xmlGenericErrorContext,
				4626	"HPP: Parsing char data\n");
				4627	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4628	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4629	}
				4630	}
				4631	if (cons == ctxt->nbChars) {
				4632	if (ctxt->node != NULL) {
				4633	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4634	ctxt->sax->error(ctxt->userData,
				4635	"detected an error in element content\n");
				4636	ctxt->wellFormed = 0;
				4637	}
				4638	NEXT;
				4639	break;
				4640	}
				4641
				4642	break;
				4643	}
				4644	case XML_PARSER_END_TAG:
				4645	if (avail < 2)
				4646	goto done;
				4647	if ((!terminate) &&
				4648	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4649	goto done;
				4650	htmlParseEndTag(ctxt);
				4651	if (ctxt->nameNr == 0) {
				4652	ctxt->instate = XML_PARSER_EPILOG;
				4653	} else {
				4654	ctxt->instate = XML_PARSER_CONTENT;
				4655	}
				4656	ctxt->checkIndex = 0;
				4657	#ifdef DEBUG_PUSH
				4658	xmlGenericError(xmlGenericErrorContext,
				4659	"HPP: entering CONTENT\n");
				4660	#endif
				4661	break;
				4662	case XML_PARSER_CDATA_SECTION:
				4663	xmlGenericError(xmlGenericErrorContext,
				4664	"HPP: internal error, state == CDATA\n");
				4665	ctxt->instate = XML_PARSER_CONTENT;
				4666	ctxt->checkIndex = 0;
				4667	#ifdef DEBUG_PUSH
				4668	xmlGenericError(xmlGenericErrorContext,
				4669	"HPP: entering CONTENT\n");
				4670	#endif
				4671	break;
				4672	case XML_PARSER_DTD:
				4673	xmlGenericError(xmlGenericErrorContext,
				4674	"HPP: internal error, state == DTD\n");
				4675	ctxt->instate = XML_PARSER_CONTENT;
				4676	ctxt->checkIndex = 0;
				4677	#ifdef DEBUG_PUSH
				4678	xmlGenericError(xmlGenericErrorContext,
				4679	"HPP: entering CONTENT\n");
				4680	#endif
				4681	break;
				4682	case XML_PARSER_COMMENT:
				4683	xmlGenericError(xmlGenericErrorContext,
				4684	"HPP: internal error, state == COMMENT\n");
				4685	ctxt->instate = XML_PARSER_CONTENT;
				4686	ctxt->checkIndex = 0;
				4687	#ifdef DEBUG_PUSH
				4688	xmlGenericError(xmlGenericErrorContext,
				4689	"HPP: entering CONTENT\n");
				4690	#endif
				4691	break;
				4692	case XML_PARSER_PI:
				4693	xmlGenericError(xmlGenericErrorContext,
				4694	"HPP: internal error, state == PI\n");
				4695	ctxt->instate = XML_PARSER_CONTENT;
				4696	ctxt->checkIndex = 0;
				4697	#ifdef DEBUG_PUSH
				4698	xmlGenericError(xmlGenericErrorContext,
				4699	"HPP: entering CONTENT\n");
				4700	#endif
				4701	break;
				4702	case XML_PARSER_ENTITY_DECL:
				4703	xmlGenericError(xmlGenericErrorContext,
				4704	"HPP: internal error, state == ENTITY_DECL\n");
				4705	ctxt->instate = XML_PARSER_CONTENT;
				4706	ctxt->checkIndex = 0;
				4707	#ifdef DEBUG_PUSH
				4708	xmlGenericError(xmlGenericErrorContext,
				4709	"HPP: entering CONTENT\n");
				4710	#endif
				4711	break;
				4712	case XML_PARSER_ENTITY_VALUE:
				4713	xmlGenericError(xmlGenericErrorContext,
				4714	"HPP: internal error, state == ENTITY_VALUE\n");
				4715	ctxt->instate = XML_PARSER_CONTENT;
				4716	ctxt->checkIndex = 0;
				4717	#ifdef DEBUG_PUSH
				4718	xmlGenericError(xmlGenericErrorContext,
				4719	"HPP: entering DTD\n");
				4720	#endif
				4721	break;
				4722	case XML_PARSER_ATTRIBUTE_VALUE:
				4723	xmlGenericError(xmlGenericErrorContext,
				4724	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4725	ctxt->instate = XML_PARSER_START_TAG;
				4726	ctxt->checkIndex = 0;
				4727	#ifdef DEBUG_PUSH
				4728	xmlGenericError(xmlGenericErrorContext,
				4729	"HPP: entering START_TAG\n");
				4730	#endif
				4731	break;
				4732	case XML_PARSER_SYSTEM_LITERAL:
				4733	xmlGenericError(xmlGenericErrorContext,
				4734	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4735	ctxt->instate = XML_PARSER_CONTENT;
				4736	ctxt->checkIndex = 0;
				4737	#ifdef DEBUG_PUSH
				4738	xmlGenericError(xmlGenericErrorContext,
				4739	"HPP: entering CONTENT\n");
				4740	#endif
				4741	break;
				4742	case XML_PARSER_IGNORE:
				4743	xmlGenericError(xmlGenericErrorContext,
				4744	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4745	ctxt->instate = XML_PARSER_CONTENT;
				4746	ctxt->checkIndex = 0;
				4747	#ifdef DEBUG_PUSH
				4748	xmlGenericError(xmlGenericErrorContext,
				4749	"HPP: entering CONTENT\n");
				4750	#endif
				4751	break;
Daniel Veillard	044fc6b	2002-03-04 17:09:44 +0000	[diff] [blame]	4752	case XML_PARSER_PUBLIC_LITERAL:
				4753	xmlGenericError(xmlGenericErrorContext,
				4754	"HPP: internal error, state == XML_PARSER_LITERAL\n");
				4755	ctxt->instate = XML_PARSER_CONTENT;
				4756	ctxt->checkIndex = 0;
				4757	#ifdef DEBUG_PUSH
				4758	xmlGenericError(xmlGenericErrorContext,
				4759	"HPP: entering CONTENT\n");
				4760	#endif
				4761	break;
				4762
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4763	}
				4764	}
				4765	done:
				4766	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4767	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4768	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4769	/*
				4770	* SAX: end of the document processing.
				4771	*/
				4772	ctxt->instate = XML_PARSER_EOF;
				4773	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4774	ctxt->sax->endDocument(ctxt->userData);
				4775	}
				4776	}
				4777	if ((ctxt->myDoc != NULL) &&
				4778	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4779	(ctxt->instate == XML_PARSER_EPILOG))) {
				4780	xmlDtdPtr dtd;
				4781	dtd = xmlGetIntSubset(ctxt->myDoc);
				4782	if (dtd == NULL)
				4783	ctxt->myDoc->intSubset =
				4784	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4785	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4786	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4787	}
				4788	#ifdef DEBUG_PUSH
				4789	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4790	#endif
				4791	return(ret);
				4792	}
				4793
				4794	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4795	* htmlParseChunk:
				4796	* @ctxt: an XML parser context
				4797	* @chunk: an char array
				4798	* @size: the size in byte of the chunk
				4799	* @terminate: last chunk indicator
				4800	*
				4801	* Parse a Chunk of memory
				4802	*
				4803	* Returns zero if no error, the xmlParserErrors otherwise.
				4804	*/
				4805	int
				4806	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4807	int terminate) {
				4808	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4809	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4810	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4811	int cur = ctxt->input->cur - ctxt->input->base;
				4812
				4813	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4814	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4815	ctxt->input->cur = ctxt->input->base + cur;
				4816	#ifdef DEBUG_PUSH
				4817	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4818	#endif
				4819
				4820	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4821	htmlParseTryOrFinish(ctxt, terminate);
				4822	} else if (ctxt->instate != XML_PARSER_EOF) {
				4823	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4824	htmlParseTryOrFinish(ctxt, terminate);
				4825	}
				4826	if (terminate) {
				4827	if ((ctxt->instate != XML_PARSER_EOF) &&
				4828	(ctxt->instate != XML_PARSER_EPILOG) &&
				4829	(ctxt->instate != XML_PARSER_MISC)) {
				4830	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4831	ctxt->wellFormed = 0;
				4832	}
				4833	if (ctxt->instate != XML_PARSER_EOF) {
				4834	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4835	ctxt->sax->endDocument(ctxt->userData);
				4836	}
				4837	ctxt->instate = XML_PARSER_EOF;
				4838	}
				4839	return((xmlParserErrors) ctxt->errNo);
				4840	}
				4841
				4842	/************************************************************************
				4843	* *
				4844	* User entry points *
				4845	* *
				4846	************************************************************************/
				4847
				4848	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame]	4849	* htmlCreatePushParserCtxt:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4850	* @sax: a SAX handler
				4851	* @user_data: The user data returned on SAX callbacks
				4852	* @chunk: a pointer to an array of chars
				4853	* @size: number of chars in the array
				4854	* @filename: an optional file name or URI
				4855	* @enc: an optional encoding
				4856	*
				4857	* Create a parser context for using the HTML parser in push mode
				4858	* To allow content encoding detection, @size should be >= 4
				4859	* The value of @filename is used for fetching external entities
				4860	* and error/warning reports.
				4861	*
				4862	* Returns the new parser context or NULL
				4863	*/
				4864	htmlParserCtxtPtr
				4865	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4866	const char chunk, int size, const char filename,
				4867	xmlCharEncoding enc) {
				4868	htmlParserCtxtPtr ctxt;
				4869	htmlParserInputPtr inputStream;
				4870	xmlParserInputBufferPtr buf;
				4871
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4872	xmlInitParser();
				4873
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4874	buf = xmlAllocParserInputBuffer(enc);
				4875	if (buf == NULL) return(NULL);
				4876
				4877	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4878	if (ctxt == NULL) {
				4879	xmlFree(buf);
				4880	return(NULL);
				4881	}
				4882	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4883	htmlInitParserCtxt(ctxt);
				4884	if (sax != NULL) {
				4885	if (ctxt->sax != &htmlDefaultSAXHandler)
				4886	xmlFree(ctxt->sax);
				4887	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4888	if (ctxt->sax == NULL) {
				4889	xmlFree(buf);
				4890	xmlFree(ctxt);
				4891	return(NULL);
				4892	}
				4893	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4894	if (user_data != NULL)
				4895	ctxt->userData = user_data;
				4896	}
				4897	if (filename == NULL) {
				4898	ctxt->directory = NULL;
				4899	} else {
				4900	ctxt->directory = xmlParserGetDirectory(filename);
				4901	}
				4902
				4903	inputStream = htmlNewInputStream(ctxt);
				4904	if (inputStream == NULL) {
				4905	xmlFreeParserCtxt(ctxt);
				4906	return(NULL);
				4907	}
				4908
				4909	if (filename == NULL)
				4910	inputStream->filename = NULL;
				4911	else
				4912	inputStream->filename = xmlMemStrdup(filename);
				4913	inputStream->buf = buf;
				4914	inputStream->base = inputStream->buf->buffer->content;
				4915	inputStream->cur = inputStream->buf->buffer->content;
				4916
				4917	inputPush(ctxt, inputStream);
				4918
				4919	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4920	(ctxt->input->buf != NULL)) {
				4921	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4922	#ifdef DEBUG_PUSH
				4923	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4924	#endif
				4925	}
				4926
				4927	return(ctxt);
				4928	}
				4929
				4930	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame]	4931	* htmlSAXParseDoc:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4932	* @cur: a pointer to an array of xmlChar
				4933	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4934	* @sax: the SAX handler block
				4935	* @userData: if using SAX, this pointer will be provided on callbacks.
				4936	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4937	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
				4938	* to handle parse events. If sax is NULL, fallback to the default DOM
				4939	* behavior and return a tree.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4940	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4941	* Returns the resulting document tree unless SAX is NULL or the document is
				4942	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4943	*/
				4944
				4945	htmlDocPtr
				4946	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4947	htmlDocPtr ret;
				4948	htmlParserCtxtPtr ctxt;
				4949
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4950	xmlInitParser();
				4951
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4952	if (cur == NULL) return(NULL);
				4953
				4954
				4955	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4956	if (ctxt == NULL) return(NULL);
				4957	if (sax != NULL) {
				4958	ctxt->sax = sax;
				4959	ctxt->userData = userData;
				4960	}
				4961
				4962	htmlParseDocument(ctxt);
				4963	ret = ctxt->myDoc;
				4964	if (sax != NULL) {
				4965	ctxt->sax = NULL;
				4966	ctxt->userData = NULL;
				4967	}
				4968	htmlFreeParserCtxt(ctxt);
				4969
				4970	return(ret);
				4971	}
				4972
				4973	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame]	4974	* htmlParseDoc:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4975	* @cur: a pointer to an array of xmlChar
				4976	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4977	*
				4978	* parse an HTML in-memory document and build a tree.
				4979	*
				4980	* Returns the resulting document tree
				4981	*/
				4982
				4983	htmlDocPtr
				4984	htmlParseDoc(xmlChar cur, const char encoding) {
				4985	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4986	}
				4987
				4988
				4989	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame]	4990	* htmlCreateFileParserCtxt:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4991	* @filename: the filename
				4992	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4993	*
				4994	* Create a parser context for a file content.
				4995	* Automatic support for ZLIB/Compress compressed document is provided
				4996	* by default if found at compile-time.
				4997	*
				4998	* Returns the new parser context or NULL
				4999	*/
				5000	htmlParserCtxtPtr
				5001	htmlCreateFileParserCtxt(const char filename, const char encoding)
				5002	{
				5003	htmlParserCtxtPtr ctxt;
				5004	htmlParserInputPtr inputStream;
				5005	xmlParserInputBufferPtr buf;
				5006	/* htmlCharEncoding enc; */
				5007	xmlChar content, content_line = (xmlChar *) "charset=";
				5008
				5009	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				5010	if (buf == NULL) return(NULL);
				5011
				5012	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				5013	if (ctxt == NULL) {
Daniel Veillard	3487c8d	2002-09-05 11:33:25 +0000	[diff] [blame]	5014	xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	5015	return(NULL);
				5016	}
				5017	memset(ctxt, 0, sizeof(htmlParserCtxt));
				5018	htmlInitParserCtxt(ctxt);
				5019	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				5020	if (inputStream == NULL) {
Daniel Veillard	3487c8d	2002-09-05 11:33:25 +0000	[diff] [blame]	5021	xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	5022	xmlFree(ctxt);
				5023	return(NULL);
				5024	}
				5025	memset(inputStream, 0, sizeof(htmlParserInput));
				5026
Daniel Veillard	a646cfd	2002-09-17 21:50:03 +0000	[diff] [blame]	5027	inputStream->filename = (char *)
				5028	xmlNormalizeWindowsPath((xmlChar *)filename);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	5029	inputStream->line = 1;
				5030	inputStream->col = 1;
				5031	inputStream->buf = buf;
				5032	inputStream->directory = NULL;
				5033
				5034	inputStream->base = inputStream->buf->buffer->content;
				5035	inputStream->cur = inputStream->buf->buffer->content;
				5036	inputStream->free = NULL;
				5037
				5038	inputPush(ctxt, inputStream);
				5039
				5040	/* set encoding */
				5041	if (encoding) {
				5042	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				5043	if (content) {
				5044	strcpy ((char )content, (char )content_line);
				5045	strcat ((char )content, (char )encoding);
				5046	htmlCheckEncoding (ctxt, content);
				5047	xmlFree (content);
				5048	}
				5049	}
				5050
				5051	return(ctxt);
				5052	}
				5053
				5054	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame]	5055	* htmlSAXParseFile:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	5056	* @filename: the filename
				5057	* @encoding: a free form C string describing the HTML document encoding, or NULL
				5058	* @sax: the SAX handler block
				5059	* @userData: if using SAX, this pointer will be provided on callbacks.
				5060	*
				5061	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				5062	* compressed document is provided by default if found at compile-time.
				5063	* It use the given SAX function block to handle the parsing callback.
				5064	* If sax is NULL, fallback to the default DOM tree building routines.
				5065	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	5066	* Returns the resulting document tree unless SAX is NULL or the document is
				5067	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	5068	*/
				5069
				5070	htmlDocPtr
				5071	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				5072	void *userData) {
				5073	htmlDocPtr ret;
				5074	htmlParserCtxtPtr ctxt;
				5075	htmlSAXHandlerPtr oldsax = NULL;
				5076
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	5077	xmlInitParser();
				5078
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	5079	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				5080	if (ctxt == NULL) return(NULL);
				5081	if (sax != NULL) {
				5082	oldsax = ctxt->sax;
				5083	ctxt->sax = sax;
				5084	ctxt->userData = userData;
				5085	}
				5086
				5087	htmlParseDocument(ctxt);
				5088
				5089	ret = ctxt->myDoc;
				5090	if (sax != NULL) {
				5091	ctxt->sax = oldsax;
				5092	ctxt->userData = NULL;
				5093	}
				5094	htmlFreeParserCtxt(ctxt);
				5095
				5096	return(ret);
				5097	}
				5098
				5099	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame]	5100	* htmlParseFile:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	5101	* @filename: the filename
				5102	* @encoding: a free form C string describing the HTML document encoding, or NULL
				5103	*
				5104	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				5105	* compressed document is provided by default if found at compile-time.
				5106	*
				5107	* Returns the resulting document tree
				5108	*/
				5109
				5110	htmlDocPtr
				5111	htmlParseFile(const char filename, const char encoding) {
				5112	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				5113	}
				5114
				5115	/**
				5116	* htmlHandleOmittedElem:
				5117	* @val: int 0 or 1
				5118	*
				5119	* Set and return the previous value for handling HTML omitted tags.
				5120	*
				5121	* Returns the last value for 0 for no handling, 1 for auto insertion.
				5122	*/
				5123
				5124	int
				5125	htmlHandleOmittedElem(int val) {
				5126	int old = htmlOmittedDefaultValue;
				5127
				5128	htmlOmittedDefaultValue = val;
				5129	return(old);
				5130	}
				5131
				5132	#endif /* LIBXML_HTML_ENABLED */