Blame - HTMLparser.c - fp2-dev/platform/external/libxml2

blob: 7db3e9e3a8fa8478965467a6843843705f678ef6 [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
Daniel Veillard	c5d6434	2001-06-24 12:13:24 +0000	[diff] [blame]	6	* daniel@veillard.com
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	7	*/
				8
Daniel Veillard	34ce8be	2002-03-18 19:37:11 +0000	[diff] [blame]	9	#define IN_LIBXML
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	10	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	11	#ifdef LIBXML_HTML_ENABLED
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	12
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	13	#include <string.h>
				14	#ifdef HAVE_CTYPE_H
				15	#include <ctype.h>
				16	#endif
				17	#ifdef HAVE_STDLIB_H
				18	#include <stdlib.h>
				19	#endif
				20	#ifdef HAVE_SYS_STAT_H
				21	#include <sys/stat.h>
				22	#endif
				23	#ifdef HAVE_FCNTL_H
				24	#include <fcntl.h>
				25	#endif
				26	#ifdef HAVE_UNISTD_H
				27	#include <unistd.h>
				28	#endif
				29	#ifdef HAVE_ZLIB_H
				30	#include <zlib.h>
				31	#endif
				32
				33	#include <libxml/xmlmemory.h>
				34	#include <libxml/tree.h>
				35	#include <libxml/parser.h>
				36	#include <libxml/parserInternals.h>
				37	#include <libxml/xmlerror.h>
				38	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	39	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	40	#include <libxml/entities.h>
				41	#include <libxml/encoding.h>
				42	#include <libxml/valid.h>
				43	#include <libxml/xmlIO.h>
Daniel Veillard	3c01b1d	2001-10-17 15:58:35 +0000	[diff] [blame]	44	#include <libxml/globals.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	45
				46	#define HTML_MAX_NAMELEN 1000
				47	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				48	#define HTML_PARSER_BUFFER_SIZE 100
				49
				50	/* #define DEBUG */
				51	/* #define DEBUG_PUSH */
				52
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	53	static int htmlOmittedDefaultValue = 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	54
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	55	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				56	xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillard	c1f7834	2001-11-10 11:43:05 +0000	[diff] [blame]	57	static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	58
				59	/************************************************************************
				60	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	61	* Parser stacks related functions and macros *
				62	* *
				63	************************************************************************/
				64
Daniel Veillard	1c732d2	2002-11-30 11:22:59 +0000	[diff] [blame]	65	/**
				66	* htmlnamePush:
				67	* @ctxt: an HTML parser context
				68	* @value: the element name
				69	*
				70	* Pushes a new element name on top of the name stack
				71	*
				72	* Returns 0 in case of error, the index in the stack otherwise
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	73	*/
Daniel Veillard	1c732d2	2002-11-30 11:22:59 +0000	[diff] [blame]	74	static int
				75	htmlnamePush(htmlParserCtxtPtr ctxt, xmlChar * value)
				76	{
				77	if (ctxt->nameNr >= ctxt->nameMax) {
				78	ctxt->nameMax *= 2;
				79	ctxt->nameTab =
				80	(xmlChar * *)xmlRealloc(ctxt->nameTab,
				81	ctxt->nameMax *
				82	sizeof(ctxt->nameTab[0]));
				83	if (ctxt->nameTab == NULL) {
				84	xmlGenericError(xmlGenericErrorContext, "realloc failed !\n");
				85	return (0);
				86	}
				87	}
				88	ctxt->nameTab[ctxt->nameNr] = value;
				89	ctxt->name = value;
				90	return (ctxt->nameNr++);
				91	}
				92	/**
				93	* htmlnamePop:
				94	* @ctxt: an HTML parser context
				95	*
				96	* Pops the top element name from the name stack
				97	*
				98	* Returns the name just removed
				99	*/
				100	static xmlChar *
				101	htmlnamePop(htmlParserCtxtPtr ctxt)
				102	{
				103	xmlChar *ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	104
Daniel Veillard	1c732d2	2002-11-30 11:22:59 +0000	[diff] [blame]	105	if (ctxt->nameNr <= 0)
				106	return (0);
				107	ctxt->nameNr--;
				108	if (ctxt->nameNr < 0)
				109	return (0);
				110	if (ctxt->nameNr > 0)
				111	ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
				112	else
				113	ctxt->name = NULL;
				114	ret = ctxt->nameTab[ctxt->nameNr];
				115	ctxt->nameTab[ctxt->nameNr] = 0;
				116	return (ret);
				117	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	118
				119	/*
				120	* Macros for accessing the content. Those should be used only by the parser,
				121	* and not exported.
				122	*
				123	* Dirty macros, i.e. one need to make assumption on the context to use them
				124	*
				125	* CUR_PTR return the current pointer to the xmlChar to be parsed.
				126	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
				127	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				128	* in UNICODE mode. This should be used internally by the parser
				129	* only to compare to ASCII values otherwise it would break when
				130	* running with UTF-8 encoding.
				131	* NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only
				132	* to compare on ASCII based substring.
				133	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
				134	* it should be used only to compare on ASCII based substring.
				135	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				136	* strings within the parser.
				137	*
				138	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				139	*
				140	* CURRENT Returns the current char value, with the full decoding of
				141	* UTF-8 if we are using this mode. It returns an int.
				142	* NEXT Skip to the next character, this does the proper decoding
				143	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
				144	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				145	*/
				146
				147	#define UPPER (toupper(*ctxt->input->cur))
				148
				149	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				150
				151	#define NXT(val) ctxt->input->cur[(val)]
				152
				153	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				154
				155	#define CUR_PTR ctxt->input->cur
				156
				157	#define SHRINK xmlParserInputShrink(ctxt->input)
				158
				159	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				160
				161	#define CURRENT ((int) (*ctxt->input->cur))
				162
				163	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				164
				165	/* Inported from XML */
				166
Daniel Veillard	561b7f8	2002-03-20 21:55:57 +0000	[diff] [blame]	167	/* #define CUR (ctxt->token ? ctxt->token : (int) (ctxt->input->cur)) /
				168	#define CUR ((int) (*ctxt->input->cur))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	169	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				170
Daniel Veillard	561b7f8	2002-03-20 21:55:57 +0000	[diff] [blame]	171	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	172	#define NXT(val) ctxt->input->cur[(val)]
				173	#define CUR_PTR ctxt->input->cur
				174
				175
				176	#define NEXTL(l) do { \
				177	if (*(ctxt->input->cur) == '\n') { \
				178	ctxt->input->line++; ctxt->input->col = 1; \
				179	} else ctxt->input->col++; \
				180	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				181	} while (0)
				182
				183	/************
				184	\
				185	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				186	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				187	************/
				188
				189	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				190	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				191
				192	#define COPY_BUF(l,b,i,v) \
				193	if (l == 1) b[i++] = (xmlChar) v; \
				194	else i += xmlCopyChar(l,&b[i],v)
				195
				196	/**
				197	* htmlCurrentChar:
				198	* @ctxt: the HTML parser context
				199	* @len: pointer to the length of the char read
				200	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	201	* The current char value, if using UTF-8 this may actually span multiple
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	202	* bytes in the input buffer. Implement the end of line normalization:
				203	* 2.11 End-of-Line Handling
				204	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				205	* char, then the encoding converter is plugged in automatically.
				206	*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	207	* Returns the current char value and its length
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	208	*/
				209
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	210	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	211	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				212	if (ctxt->instate == XML_PARSER_EOF)
				213	return(0);
				214
				215	if (ctxt->token != 0) {
				216	*len = 0;
				217	return(ctxt->token);
				218	}
				219	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				220	/*
				221	* We are supposed to handle UTF8, check it's valid
				222	* From rfc2044: encoding of the Unicode values on UTF-8:
				223	*
				224	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				225	* 0000 0000-0000 007F 0xxxxxxx
				226	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				227	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				228	*
				229	* Check for the 0x110000 limit too
				230	*/
				231	const unsigned char *cur = ctxt->input->cur;
				232	unsigned char c;
				233	unsigned int val;
				234
				235	c = *cur;
				236	if (c & 0x80) {
				237	if (cur[1] == 0)
				238	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				239	if ((cur[1] & 0xc0) != 0x80)
				240	goto encoding_error;
				241	if ((c & 0xe0) == 0xe0) {
				242
				243	if (cur[2] == 0)
				244	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				245	if ((cur[2] & 0xc0) != 0x80)
				246	goto encoding_error;
				247	if ((c & 0xf0) == 0xf0) {
				248	if (cur[3] == 0)
				249	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				250	if (((c & 0xf8) != 0xf0) \|\|
				251	((cur[3] & 0xc0) != 0x80))
				252	goto encoding_error;
				253	/* 4-byte code */
				254	*len = 4;
				255	val = (cur[0] & 0x7) << 18;
				256	val \|= (cur[1] & 0x3f) << 12;
				257	val \|= (cur[2] & 0x3f) << 6;
				258	val \|= cur[3] & 0x3f;
				259	} else {
				260	/* 3-byte code */
				261	*len = 3;
				262	val = (cur[0] & 0xf) << 12;
				263	val \|= (cur[1] & 0x3f) << 6;
				264	val \|= cur[2] & 0x3f;
				265	}
				266	} else {
				267	/* 2-byte code */
				268	*len = 2;
				269	val = (cur[0] & 0x1f) << 6;
				270	val \|= cur[1] & 0x3f;
				271	}
				272	if (!IS_CHAR(val)) {
				273	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				274	if ((ctxt->sax != NULL) &&
				275	(ctxt->sax->error != NULL))
				276	ctxt->sax->error(ctxt->userData,
				277	"Char 0x%X out of allowed range\n", val);
				278	ctxt->wellFormed = 0;
Daniel Veillard	dad3f68	2002-11-17 16:47:27 +0000	[diff] [blame]	279	if (ctxt->recovery == 0) ctxt->disableSAX = 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	280	}
				281	return(val);
				282	} else {
				283	/* 1-byte code */
				284	*len = 1;
				285	return((int) *ctxt->input->cur);
				286	}
				287	}
				288	/*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	289	* Assume it's a fixed length encoding (1) with
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	290	* a compatible encoding for the ASCII set, since
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	291	* XML constructs only use < 128 chars
				292	*/
				293	*len = 1;
				294	if ((int) *ctxt->input->cur < 0x80)
				295	return((int) *ctxt->input->cur);
				296
				297	/*
				298	* Humm this is bad, do an automatic flow conversion
				299	*/
				300	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				301	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				302	return(xmlCurrentChar(ctxt, len));
				303
				304	encoding_error:
				305	/*
				306	* If we detect an UTF8 error that probably mean that the
				307	* input encoding didn't get properly advertized in the
				308	* declaration header. Report the error and switch the encoding
				309	* to ISO-Latin-1 (if you don't like this policy, just declare the
				310	* encoding !)
				311	*/
				312	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				313	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				314	ctxt->sax->error(ctxt->userData,
				315	"Input is not proper UTF-8, indicate encoding !\n");
				316	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				317	ctxt->input->cur[0], ctxt->input->cur[1],
				318	ctxt->input->cur[2], ctxt->input->cur[3]);
				319	}
				320
				321	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				322	*len = 1;
				323	return((int) *ctxt->input->cur);
				324	}
				325
				326	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	327	* htmlSkipBlankChars:
				328	* @ctxt: the HTML parser context
				329	*
				330	* skip all blanks character found at that point in the input streams.
				331	*
				332	* Returns the number of space chars skipped
				333	*/
				334
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	335	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	336	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				337	int res = 0;
				338
				339	while (IS_BLANK(*(ctxt->input->cur))) {
				340	if ((*ctxt->input->cur == 0) &&
				341	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				342	xmlPopInput(ctxt);
				343	} else {
				344	if (*(ctxt->input->cur) == '\n') {
				345	ctxt->input->line++; ctxt->input->col = 1;
				346	} else ctxt->input->col++;
				347	ctxt->input->cur++;
				348	ctxt->nbChars++;
				349	if (*ctxt->input->cur == 0)
				350	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				351	}
				352	res++;
				353	}
				354	return(res);
				355	}
				356
				357
				358
				359	/************************************************************************
				360	* *
				361	* The list of HTML elements and their properties *
				362	* *
				363	************************************************************************/
				364
				365	/*
				366	* Start Tag: 1 means the start tag can be ommited
				367	* End Tag: 1 means the end tag can be ommited
				368	* 2 means it's forbidden (empty elements)
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	369	* 3 means the tag is stylistic and should be closed easily
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	370	* Depr: this element is deprecated
				371	* DTD: 1 means that this element is valid only in the Loose DTD
				372	* 2 means that this element is valid only in the Frameset DTD
				373	*
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	374	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	375	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	376	static const htmlElemDesc
				377	html40ElementTable[] = {
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	378	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
				379	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
				380	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
				381	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
				382	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
				383	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
				384	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
				385	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
				386	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
				387	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
				388	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
				389	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
				390	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
				391	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
				392	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
				393	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
				394	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
				395	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
				396	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
				397	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
				398	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
				399	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
				400	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
				401	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
				402	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
				403	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
				404	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
				405	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
				406	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
				407	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
				408	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
				409	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
				410	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
				411	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
				412	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
				413	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
				414	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
				415	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
				416	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
				417	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
				418	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
				419	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
				420	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
				421	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
				422	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
				423	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
				424	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
				425	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
				426	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
				427	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
				428	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
				429	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
				430	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
				431	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
				432	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
				433	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
				434	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
				435	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
				436	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
				437	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
				438	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
				439	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
				440	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
Daniel Veillard	fee408f	2002-11-22 13:18:30 +0000	[diff] [blame]	441	{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph " },
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	442	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
				443	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
				444	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
				445	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
				446	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
				447	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
				448	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
				449	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
				450	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
				451	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
				452	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
				453	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
				454	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
				455	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
				456	{ "table", 0, 0, 0, 0, 0, 0, 0, " " },
				457	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
				458	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
				459	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
				460	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
				461	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
				462	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
				463	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
				464	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
				465	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
				466	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
				467	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
				468	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	469	};
				470
				471	/*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	472	* start tags that imply the end of current element
				473	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	474	static const char *htmlStartClose[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	475	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				476	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				477	"listing", "xmp", "head", NULL,
				478	"head", "p", NULL,
				479	"title", "p", NULL,
				480	"body", "head", "style", "link", "title", "p", NULL,
				481	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				482	"pre", "listing", "xmp", "head", "li", NULL,
				483	"hr", "p", "head", NULL,
				484	"h1", "p", "head", NULL,
				485	"h2", "p", "head", NULL,
				486	"h3", "p", "head", NULL,
				487	"h4", "p", "head", NULL,
				488	"h5", "p", "head", NULL,
				489	"h6", "p", "head", NULL,
				490	"dir", "p", "head", NULL,
				491	"address", "p", "head", "ul", NULL,
				492	"pre", "p", "head", "ul", NULL,
				493	"listing", "p", "head", NULL,
				494	"xmp", "p", "head", NULL,
				495	"blockquote", "p", "head", NULL,
				496	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				497	"xmp", "head", NULL,
				498	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				499	"head", "dd", NULL,
				500	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				501	"head", "dt", NULL,
				502	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				503	"listing", "xmp", NULL,
				504	"ol", "p", "head", "ul", NULL,
				505	"menu", "p", "head", "ul", NULL,
				506	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
				507	"div", "p", "head", NULL,
				508	"noscript", "p", "head", NULL,
				509	"center", "font", "b", "i", "p", "head", NULL,
				510	"a", "a", NULL,
				511	"caption", "p", NULL,
				512	"colgroup", "caption", "colgroup", "col", "p", NULL,
				513	"col", "caption", "col", "p", NULL,
				514	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				515	"listing", "xmp", "a", NULL,
Daniel Veillard	43dadeb	2001-04-24 11:23:35 +0000	[diff] [blame]	516	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				517	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	518	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				519	"thead", "caption", "col", "colgroup", NULL,
				520	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				521	"tbody", "p", NULL,
				522	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				523	"tfoot", "tbody", "p", NULL,
				524	"optgroup", "option", NULL,
				525	"option", "option", NULL,
				526	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				527	"pre", "listing", "xmp", "a", NULL,
				528	NULL
				529	};
				530
				531	/*
				532	* The list of HTML elements which are supposed not to have
				533	* CDATA content and where a p element will be implied
				534	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	535	* TODO: extend that list by reading the HTML SGML DTD on
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	536	* implied paragraph
				537	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	538	static const char *htmlNoContentElements[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	539	"html",
				540	"head",
				541	"body",
				542	NULL
				543	};
				544
				545	/*
				546	* The list of HTML attributes which are of content %Script;
				547	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				548	* it assumes the name starts with 'on'
				549	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	550	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	551	"onclick",
				552	"ondblclick",
				553	"onmousedown",
				554	"onmouseup",
				555	"onmouseover",
				556	"onmousemove",
				557	"onmouseout",
				558	"onkeypress",
				559	"onkeydown",
				560	"onkeyup",
				561	"onload",
				562	"onunload",
				563	"onfocus",
				564	"onblur",
				565	"onsubmit",
				566	"onrest",
				567	"onchange",
				568	"onselect"
				569	};
				570
/*
 * Priorities used to decide how to handle extra end tags in broken HTML
 * pages. Each element gets a priority, and an end tag is only allowed to
 * close elements with lower or equal priority.
 *
 * The table ends with an entry whose name is NULL; its priority is the
 * default applied to every element not listed by name.
 */

typedef struct {
    const char *name;
    int priority;
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* Default priority */
};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	598
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * string of one group. Filled by htmlInitAutoClose(); the flag records
 * whether that has already happened.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
				601
				602	/************************************************************************
				603	* *
				604	* functions to handle HTML specific data *
				605	* *
				606	************************************************************************/
				607
				608	/**
				609	* htmlInitAutoClose:
				610	*
				611	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				612	* This is not reentrant. Call xmlInitParser() once before processing in
				613	* case of use in multithreaded programs.
				614	*/
				615	void
				616	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	617	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	618
				619	if (htmlStartCloseIndexinitialized) return;
				620
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	621	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				622	indx = 0;
				623	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				624	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	625	while (htmlStartClose[i] != NULL) i++;
				626	i++;
				627	}
				628	htmlStartCloseIndexinitialized = 1;
				629	}
				630
				631	/**
				632	* htmlTagLookup:
				633	* @tag: The tag name in lowercase
				634	*
				635	* Lookup the HTML tag in the ElementTable
				636	*
				637	* Returns the related htmlElemDescPtr or NULL if not found.
				638	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	639	const htmlElemDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	640	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	641	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	642
				643	for (i = 0; i < (sizeof(html40ElementTable) /
				644	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	1ed3f88	2001-04-18 09:45:35 +0000	[diff] [blame]	645	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	646	return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	647	}
				648	return(NULL);
				649	}
				650
				651	/**
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	652	* htmlGetEndPriority:
				653	* @name: The name of the element to look up the priority for.
				654	*
				655	* Return value: The "endtag" priority.
				656	**/
				657	static int
				658	htmlGetEndPriority (const xmlChar *name) {
				659	int i = 0;
				660
				661	while ((htmlEndPriority[i].name != NULL) &&
				662	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				663	i++;
				664
				665	return(htmlEndPriority[i].priority);
				666	}
				667
				668	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	669	* htmlCheckAutoClose:
				670	* @newtag: The new tag name
				671	* @oldtag: The old tag name
				672	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	673	* Checks whether the new tag is one of the registered valid tags for
				674	* closing old.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	675	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				676	*
				677	* Returns 0 if no, 1 if yes.
				678	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	679	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	680	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	681	int i, indx;
				682	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	683
				684	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				685
				686	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	687	for (indx = 0; indx < 100;indx++) {
				688	closed = htmlStartCloseIndex[indx];
				689	if (closed == NULL) return(0);
				690	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	691	}
				692
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	693	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	694	i++;
				695	while (htmlStartClose[i] != NULL) {
				696	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				697	return(1);
				698	}
				699	i++;
				700	}
				701	return(0);
				702	}
				703
				704	/**
				705	* htmlAutoCloseOnClose:
				706	* @ctxt: an HTML parser context
				707	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	708	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	709	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	710	* The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	711	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	712	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	713	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	714	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	715	xmlChar *oldname;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	716	int i, priority;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	717
				718	#ifdef DEBUG
				719	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				720	for (i = 0;i < ctxt->nameNr;i++)
				721	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				722	#endif
				723
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	724	priority = htmlGetEndPriority (newtag);
				725
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	726	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	727
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	728	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	729	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	730	* A missplaced endtag can only close elements with lower
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	731	* or equal priority, so if we find an element with higher
				732	* priority before we find an element with
				733	* matching name, we just ignore this endtag
				734	*/
				735	if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	736	}
				737	if (i < 0) return;
				738
				739	while (!xmlStrEqual(newtag, ctxt->name)) {
				740	info = htmlTagLookup(ctxt->name);
				741	if ((info == NULL) \|\| (info->endTag == 1)) {
				742	#ifdef DEBUG
				743	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				744	#endif
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	745	} else if (info->endTag == 3) {
				746	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	747	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack	1633d18	2001-10-05 15:41:19 +0000	[diff] [blame]	748
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	749	#endif
				750	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				751	ctxt->sax->error(ctxt->userData,
				752	"Opening and ending tag mismatch: %s and %s\n",
				753	newtag, ctxt->name);
				754	ctxt->wellFormed = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	755	}
				756	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				757	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				758	oldname = htmlnamePop(ctxt);
				759	if (oldname != NULL) {
				760	#ifdef DEBUG
				761	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				762	#endif
				763	xmlFree(oldname);
				764	}
				765	}
				766	}
				767
				768	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	769	* htmlAutoCloseOnEnd:
				770	* @ctxt: an HTML parser context
				771	*
				772	* Close all remaining tags at the end of the stream
				773	*/
				774	static void
				775	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				776	xmlChar *oldname;
				777	int i;
				778
				779	if (ctxt->nameNr == 0)
				780	return;
				781	#ifdef DEBUG
				782	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				783	#endif
				784
				785	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				786	#ifdef DEBUG
				787	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				788	#endif
				789	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				790	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				791	oldname = htmlnamePop(ctxt);
				792	if (oldname != NULL) {
				793	#ifdef DEBUG
				794	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				795	#endif
				796	xmlFree(oldname);
				797	}
				798	}
				799	}
				800
				801	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	802	* htmlAutoClose:
				803	* @ctxt: an HTML parser context
				804	* @newtag: The new tag name or NULL
				805	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	806	* The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	807	* The list is kept in htmlStartClose array. This function is
				808	* called when a new tag has been detected and generates the
				809	* appropriates closes if possible/needed.
				810	* If newtag is NULL this mean we are at the end of the resource
				811	* and we should check
				812	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	813	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	814	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				815	xmlChar *oldname;
				816	while ((newtag != NULL) && (ctxt->name != NULL) &&
				817	(htmlCheckAutoClose(newtag, ctxt->name))) {
				818	#ifdef DEBUG
				819	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				820	#endif
				821	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				822	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				823	oldname = htmlnamePop(ctxt);
				824	if (oldname != NULL) {
				825	#ifdef DEBUG
				826	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				827	#endif
				828	xmlFree(oldname);
				829	}
				830	}
				831	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	832	htmlAutoCloseOnEnd(ctxt);
				833	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	834	}
				835	while ((newtag == NULL) && (ctxt->name != NULL) &&
				836	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				837	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				838	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				839	#ifdef DEBUG
				840	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				841	#endif
				842	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				843	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				844	oldname = htmlnamePop(ctxt);
				845	if (oldname != NULL) {
				846	#ifdef DEBUG
				847	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				848	#endif
				849	xmlFree(oldname);
				850	}
				851	}
				852
				853	}
				854
				855	/**
				856	* htmlAutoCloseTag:
				857	* @doc: the HTML document
				858	* @name: The tag name
				859	* @elem: the HTML element
				860	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	861	* The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	862	* The list is kept in htmlStartClose array. This function checks
				863	* if the element or one of it's children would autoclose the
				864	* given tag.
				865	*
				866	* Returns 1 if autoclose, 0 otherwise
				867	*/
				868	int
				869	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				870	htmlNodePtr child;
				871
				872	if (elem == NULL) return(1);
				873	if (xmlStrEqual(name, elem->name)) return(0);
				874	if (htmlCheckAutoClose(elem->name, name)) return(1);
				875	child = elem->children;
				876	while (child != NULL) {
				877	if (htmlAutoCloseTag(doc, name, child)) return(1);
				878	child = child->next;
				879	}
				880	return(0);
				881	}
				882
				883	/**
				884	* htmlIsAutoClosed:
				885	* @doc: the HTML document
				886	* @elem: the HTML element
				887	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	888	* The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	889	* The list is kept in htmlStartClose array. This function checks
				890	* if a tag is autoclosed by one of it's child
				891	*
				892	* Returns 1 if autoclosed, 0 otherwise
				893	*/
				894	int
				895	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				896	htmlNodePtr child;
				897
				898	if (elem == NULL) return(1);
				899	child = elem->children;
				900	while (child != NULL) {
				901	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				902	child = child->next;
				903	}
				904	return(0);
				905	}
				906
				907	/**
				908	* htmlCheckImplied:
				909	* @ctxt: an HTML parser context
				910	* @newtag: The new tag name
				911	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	912	* The HTML DTD allows a tag to exists only implicitly
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	913	* called when a new tag has been detected and generates the
				914	* appropriates implicit tags if missing
				915	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	916	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	917	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				918	if (!htmlOmittedDefaultValue)
				919	return;
				920	if (xmlStrEqual(newtag, BAD_CAST"html"))
				921	return;
				922	if (ctxt->nameNr <= 0) {
				923	#ifdef DEBUG
				924	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				925	#endif
				926	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				927	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				928	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				929	}
				930	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				931	return;
				932	if ((ctxt->nameNr <= 1) &&
				933	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				934	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				935	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				936	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				937	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				938	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				939	/*
				940	* dropped OBJECT ... i you put it first BODY will be
				941	* assumed !
				942	*/
				943	#ifdef DEBUG
				944	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				945	#endif
				946	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				947	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				948	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				949	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				950	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				951	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				952	int i;
				953	for (i = 0;i < ctxt->nameNr;i++) {
				954	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				955	return;
				956	}
				957	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				958	return;
				959	}
				960	}
				961
				962	#ifdef DEBUG
				963	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				964	#endif
				965	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				966	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				967	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				968	}
				969	}
				970
				971	/**
				972	* htmlCheckParagraph
				973	* @ctxt: an HTML parser context
				974	*
				975	* Check whether a p element need to be implied before inserting
				976	* characters in the current element.
				977	*
				978	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				979	* in case of error.
				980	*/
				981
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	982	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	983	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				984	const xmlChar *tag;
				985	int i;
				986
				987	if (ctxt == NULL)
				988	return(-1);
				989	tag = ctxt->name;
				990	if (tag == NULL) {
				991	htmlAutoClose(ctxt, BAD_CAST"p");
				992	htmlCheckImplied(ctxt, BAD_CAST"p");
				993	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				994	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				995	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				996	return(1);
				997	}
				998	if (!htmlOmittedDefaultValue)
				999	return(0);
				1000	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				1001	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				1002	#ifdef DEBUG
				1003	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				1004	#endif
				1005	htmlAutoClose(ctxt, BAD_CAST"p");
				1006	htmlCheckImplied(ctxt, BAD_CAST"p");
				1007	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				1008	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1009	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				1010	return(1);
				1011	}
				1012	}
				1013	return(0);
				1014	}
				1015
				1016	/**
				1017	* htmlIsScriptAttribute:
				1018	* @name: an attribute name
				1019	*
				1020	* Check if an attribute is of content type Script
				1021	*
				1022	* Returns 1 is the attribute is a script 0 otherwise
				1023	*/
				1024	int
				1025	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1026	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1027
				1028	if (name == NULL)
				1029	return(0);
				1030	/*
				1031	* all script attributes start with 'on'
				1032	*/
				1033	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1034	return(0);
				1035	for (i = 0;
				1036	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1037	i++) {
				1038	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1039	return(1);
				1040	}
				1041	return(0);
				1042	}
				1043
				1044	/************************************************************************
				1045	* *
				1046	* The list of HTML predefined entities *
				1047	* *
				1048	************************************************************************/
				1049
				1050
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1051	static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1052	/*
				1053	* the 4 absolute ones, plus apostrophe.
				1054	*/
				1055	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1056	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1057	{ 39, "apos", "single quote" },
				1058	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1059	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1060
				1061	/*
				1062	* A bunch still in the 128-255 range
				1063	* Replacing them depend really on the charset used.
				1064	*/
				1065	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1066	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1067	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1068	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1069	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1070	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1071	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1072	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1073	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1074	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1075	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1076	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1077	{ 172, "not", "not sign, U+00AC ISOnum" },
				1078	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1079	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1080	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1081	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1082	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1083	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1084	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1085	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1086	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1087	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1088	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1089	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1090	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1091	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1092	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1093	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1094	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1095	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1096	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1097	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1098	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1099	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1100	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1101	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1102	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1103	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1104	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1105	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1106	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1107	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1108	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1109	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1110	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1111	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1112	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1113	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1114	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1115	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1116	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1117	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1118	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1119	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1120	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1121	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1122	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1123	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1124	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1125	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1126	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1127	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1128	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1129	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1130	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1131	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1132	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1133	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1134	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1135	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1136	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1137	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1138	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1139	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1140	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1141	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1142	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1143	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1144	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1145	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1146	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1147	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1148	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1149	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1150	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1151	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1152	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1153	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1154	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1155	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1156	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1157	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1158	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1159	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1160	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1161
				1162	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1163	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1164	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1165	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1166	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1167
				1168	/*
				1169	* Anything below should really be kept as entities references
				1170	*/
				1171	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1172
				1173	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1174	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1175
				1176	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1177	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1178	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1179	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1180	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1181	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1182	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1183	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1184	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1185	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1186	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1187	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1188	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1189	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1190	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1191	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1192	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1193	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1194	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1195	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1196	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1197	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1198	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1199	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1200
				1201	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1202	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1203	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1204	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1205	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1206	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1207	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1208	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1209	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1210	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1211	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1212	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1213	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1214	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1215	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1216	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1217	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1218	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1219	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1220	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1221	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1222	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1223	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1224	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1225	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1226	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1227	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1228	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1229
				1230	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1231	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1232	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1233	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1234	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1235	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1236	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1237	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1238	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1239	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1240	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1241	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1242	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1243	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1244	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1245	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1246	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1247
				1248	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1249	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1250
				1251	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1252
				1253	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1254	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1255
				1256	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1257	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1258
				1259	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1260	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1261
				1262	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1263
				1264	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1265	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1266	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1267	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1268	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1269	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1270	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1271	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1272	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1273	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1274	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1275	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1276	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1277	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1278	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1279	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1280
				1281	{ 8704, "forall","for all, U+2200 ISOtech" },
				1282	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1283	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1284	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1285	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1286	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1287	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1288	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1289	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1290	{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1291	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1292	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1293	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1294	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1295	{ 8734, "infin","infinity, U+221E ISOtech" },
				1296	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1297	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1298	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1299	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1300	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1301	{ 8747, "int", "integral, U+222B ISOtech" },
				1302	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1303	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1304	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1305	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1306	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1307	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1308	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1309	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1310	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1311	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1312	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1313	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1314	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1315	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1316	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1317	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1318	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1319	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1320	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1321	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1322	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1323	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1324	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1325	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1326
				1327	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1328	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1329	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1330	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1331
				1332	};
				1333
				1334	/************************************************************************
				1335	* *
				1336	* Commodity functions to handle entities *
				1337	* *
				1338	************************************************************************/
				1339
				1340	/*
				1341	* Macro used to grow the current buffer.
				1342	*/
				1343	#define growBuffer(buffer) { \
				1344	buffer##_size *= 2; \
Daniel Veillard	3487c8d	2002-09-05 11:33:25 +0000	[diff] [blame]	1345	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1346	if (buffer == NULL) { \
Daniel Veillard	3487c8d	2002-09-05 11:33:25 +0000	[diff] [blame]	1347	xmlGenericError(xmlGenericErrorContext, "realloc failed\n"); \
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1348	return(NULL); \
				1349	} \
				1350	}
				1351
				1352	/**
				1353	* htmlEntityLookup:
				1354	* @name: the entity name
				1355	*
				1356	* Lookup the given entity in EntitiesTable
				1357	*
				1358	* TODO: the linear scan is really ugly, an hash table is really needed.
				1359	*
				1360	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1361	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1362	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1363	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1364	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1365
				1366	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1367	sizeof(html40EntitiesTable[0]));i++) {
				1368	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1369	#ifdef DEBUG
				1370	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1371	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1372	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1373	}
				1374	}
				1375	return(NULL);
				1376	}
				1377
				1378	/**
				1379	* htmlEntityValueLookup:
				1380	* @value: the entity's unicode value
				1381	*
				1382	* Lookup the given entity in EntitiesTable
				1383	*
				1384	* TODO: the linear scan is really ugly, an hash table is really needed.
				1385	*
				1386	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1387	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1388	const htmlEntityDesc *
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1389	htmlEntityValueLookup(unsigned int value) {
				1390	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1391	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	1392	unsigned int lv = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1393	#endif
				1394
				1395	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1396	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1397	if (html40EntitiesTable[i].value >= value) {
				1398	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1399	break;
				1400	#ifdef DEBUG
				1401	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1402	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1403	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1404	}
				1405	#ifdef DEBUG
				1406	if (lv > html40EntitiesTable[i].value) {
				1407	xmlGenericError(xmlGenericErrorContext,
				1408	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1409	lv, html40EntitiesTable[i].value);
				1410	}
				1411	lv = html40EntitiesTable[i].value;
				1412	#endif
				1413	}
				1414	return(NULL);
				1415	}
				1416
				1417	/**
				1418	* UTF8ToHtml:
				1419	* @out: a pointer to an array of bytes to store the result
				1420	* @outlen: the length of @out
				1421	* @in: a pointer to an array of UTF-8 chars
				1422	* @inlen: the length of @in
				1423	*
				1424	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1425	* plus HTML entities block of chars out.
				1426	*
				1427	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1428	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1429	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1430	* The value of @outlen after return is the number of octets consumed.
				1431	*/
				1432	int
				1433	UTF8ToHtml(unsigned char* out, int *outlen,
				1434	const unsigned char* in, int *inlen) {
				1435	const unsigned char* processed = in;
				1436	const unsigned char* outend;
				1437	const unsigned char* outstart = out;
				1438	const unsigned char* instart = in;
				1439	const unsigned char* inend;
				1440	unsigned int c, d;
				1441	int trailing;
				1442
				1443	if (in == NULL) {
				1444	/*
				1445	* initialization nothing to do
				1446	*/
				1447	*outlen = 0;
				1448	*inlen = 0;
				1449	return(0);
				1450	}
				1451	inend = in + (*inlen);
				1452	outend = out + (*outlen);
				1453	while (in < inend) {
				1454	d = *in++;
				1455	if (d < 0x80) { c= d; trailing= 0; }
				1456	else if (d < 0xC0) {
				1457	/* trailing byte in leading position */
				1458	*outlen = out - outstart;
				1459	*inlen = processed - instart;
				1460	return(-2);
				1461	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1462	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1463	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1464	else {
				1465	/* no chance for this in Ascii */
				1466	*outlen = out - outstart;
				1467	*inlen = processed - instart;
				1468	return(-2);
				1469	}
				1470
				1471	if (inend - in < trailing) {
				1472	break;
				1473	}
				1474
				1475	for ( ; trailing; trailing--) {
				1476	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1477	break;
				1478	c <<= 6;
				1479	c \|= d & 0x3F;
				1480	}
				1481
				1482	/* assertion: c is a single UTF-4 value */
				1483	if (c < 0x80) {
				1484	if (out + 1 >= outend)
				1485	break;
				1486	*out++ = c;
				1487	} else {
				1488	int len;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1489	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1490
				1491	/*
				1492	* Try to lookup a predefined HTML entity for it
				1493	*/
				1494
				1495	ent = htmlEntityValueLookup(c);
				1496	if (ent == NULL) {
				1497	/* no chance for this in Ascii */
				1498	*outlen = out - outstart;
				1499	*inlen = processed - instart;
				1500	return(-2);
				1501	}
				1502	len = strlen(ent->name);
				1503	if (out + 2 + len >= outend)
				1504	break;
				1505	*out++ = '&';
				1506	memcpy(out, ent->name, len);
				1507	out += len;
				1508	*out++ = ';';
				1509	}
				1510	processed = in;
				1511	}
				1512	*outlen = out - outstart;
				1513	*inlen = processed - instart;
				1514	return(0);
				1515	}
				1516
				1517	/**
				1518	* htmlEncodeEntities:
				1519	* @out: a pointer to an array of bytes to store the result
				1520	* @outlen: the length of @out
				1521	* @in: a pointer to an array of UTF-8 chars
				1522	* @inlen: the length of @in
				1523	* @quoteChar: the quote character to escape (' or ") or zero.
				1524	*
				1525	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1526	* plus HTML entities block of chars out.
				1527	*
				1528	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1529	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1530	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1531	* The value of @outlen after return is the number of octets consumed.
				1532	*/
				1533	int
				1534	htmlEncodeEntities(unsigned char* out, int *outlen,
				1535	const unsigned char* in, int *inlen, int quoteChar) {
				1536	const unsigned char* processed = in;
				1537	const unsigned char* outend = out + (*outlen);
				1538	const unsigned char* outstart = out;
				1539	const unsigned char* instart = in;
				1540	const unsigned char* inend = in + (*inlen);
				1541	unsigned int c, d;
				1542	int trailing;
				1543
				1544	while (in < inend) {
				1545	d = *in++;
				1546	if (d < 0x80) { c= d; trailing= 0; }
				1547	else if (d < 0xC0) {
				1548	/* trailing byte in leading position */
				1549	*outlen = out - outstart;
				1550	*inlen = processed - instart;
				1551	return(-2);
				1552	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1553	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1554	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1555	else {
				1556	/* no chance for this in Ascii */
				1557	*outlen = out - outstart;
				1558	*inlen = processed - instart;
				1559	return(-2);
				1560	}
				1561
				1562	if (inend - in < trailing)
				1563	break;
				1564
				1565	while (trailing--) {
				1566	if (((d= *in++) & 0xC0) != 0x80) {
				1567	*outlen = out - outstart;
				1568	*inlen = processed - instart;
				1569	return(-2);
				1570	}
				1571	c <<= 6;
				1572	c \|= d & 0x3F;
				1573	}
				1574
				1575	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1576	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1577	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1578	if (out >= outend)
				1579	break;
				1580	*out++ = c;
				1581	} else {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1582	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1583	const char *cp;
				1584	char nbuf[16];
				1585	int len;
				1586
				1587	/*
				1588	* Try to lookup a predefined HTML entity for it
				1589	*/
				1590	ent = htmlEntityValueLookup(c);
				1591	if (ent == NULL) {
Aleksey Sanin	49cc975	2002-06-14 17:07:10 +0000	[diff] [blame]	1592	snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1593	cp = nbuf;
				1594	}
				1595	else
				1596	cp = ent->name;
				1597	len = strlen(cp);
				1598	if (out + 2 + len > outend)
				1599	break;
				1600	*out++ = '&';
				1601	memcpy(out, cp, len);
				1602	out += len;
				1603	*out++ = ';';
				1604	}
				1605	processed = in;
				1606	}
				1607	*outlen = out - outstart;
				1608	*inlen = processed - instart;
				1609	return(0);
				1610	}
				1611
				1612	/**
				1613	* htmlDecodeEntities:
				1614	* @ctxt: the parser context
				1615	* @len: the len to decode (in bytes !), -1 for no size limit
				1616	* @end: an end marker xmlChar, 0 if none
				1617	* @end2: an end marker xmlChar, 0 if none
				1618	* @end3: an end marker xmlChar, 0 if none
				1619	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1620	* Substitute the HTML entities by their value
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1621	*
				1622	* DEPRECATED !!!!
				1623	*
				1624	* Returns A newly allocated string with the substitution done. The caller
				1625	* must deallocate it !
				1626	*/
				1627	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1628	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1629	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1630	static int deprecated = 0;
				1631	if (!deprecated) {
				1632	xmlGenericError(xmlGenericErrorContext,
				1633	"htmlDecodeEntities() deprecated function reached\n");
				1634	deprecated = 1;
				1635	}
				1636	return(NULL);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1637	}
				1638
				1639	/************************************************************************
				1640	* *
				1641	* Commodity functions to handle streams *
				1642	* *
				1643	************************************************************************/
				1644
				1645	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1646	* htmlNewInputStream:
				1647	* @ctxt: an HTML parser context
				1648	*
				1649	* Create a new input stream structure
				1650	* Returns the new input stream or NULL
				1651	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1652	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1653	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1654	htmlParserInputPtr input;
				1655
				1656	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1657	if (input == NULL) {
				1658	ctxt->errNo = XML_ERR_NO_MEMORY;
				1659	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1660	ctxt->sax->error(ctxt->userData,
				1661	"malloc: couldn't allocate a new input stream\n");
				1662	return(NULL);
				1663	}
				1664	memset(input, 0, sizeof(htmlParserInput));
				1665	input->filename = NULL;
				1666	input->directory = NULL;
				1667	input->base = NULL;
				1668	input->cur = NULL;
				1669	input->buf = NULL;
				1670	input->line = 1;
				1671	input->col = 1;
				1672	input->buf = NULL;
				1673	input->free = NULL;
				1674	input->version = NULL;
				1675	input->consumed = 0;
				1676	input->length = 0;
				1677	return(input);
				1678	}
				1679
				1680
				1681	/************************************************************************
				1682	* *
				1683	* Commodity functions, cleanup needed ? *
				1684	* *
				1685	************************************************************************/
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1686	/*
				1687	* all tags allowing pc data from the html 4.01 loose dtd
				1688	* NOTE: it might be more apropriate to integrate this information
				1689	* into the html40ElementTable array but I don't want to risk any
				1690	* binary incomptibility
				1691	*/
				1692	static const char *allowPCData[] = {
				1693	"a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
				1694	"blockquote", "body", "button", "caption", "center", "cite", "code",
				1695	"dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
				1696	"h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
				1697	"li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
				1698	"small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
				1699	};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1700
				1701	/**
				1702	* areBlanks:
				1703	* @ctxt: an HTML parser context
				1704	* @str: a xmlChar *
				1705	* @len: the size of @str
				1706	*
				1707	* Is this a sequence of blank chars that one can ignore ?
				1708	*
				1709	* Returns 1 if ignorable 0 otherwise.
				1710	*/
				1711
				1712	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1713	unsigned int i;
				1714	int j;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1715	xmlNodePtr lastChild;
				1716
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1717	for (j = 0;j < len;j++)
				1718	if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1719
				1720	if (CUR == 0) return(1);
				1721	if (CUR != '<') return(0);
				1722	if (ctxt->name == NULL)
				1723	return(1);
				1724	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1725	return(1);
				1726	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1727	return(1);
				1728	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1729	return(1);
				1730	if (ctxt->node == NULL) return(0);
				1731	lastChild = xmlGetLastChild(ctxt->node);
				1732	if (lastChild == NULL) {
Daniel Veillard	7db3773	2001-07-12 01:20:08 +0000	[diff] [blame]	1733	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
				1734	(ctxt->node->content != NULL)) return(0);
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1735	/* keep ws in constructs like ...<b> </b>...
				1736	for all tags "b" allowing PCDATA */
				1737	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
				1738	if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
				1739	return(0);
				1740	}
				1741	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1742	} else if (xmlNodeIsText(lastChild)) {
				1743	return(0);
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1744	} else {
				1745	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
				1746	for all tags "p" allowing PCDATA */
				1747	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
				1748	if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
				1749	return(0);
				1750	}
				1751	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1752	}
				1753	return(1);
				1754	}
				1755
				1756	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1757	* htmlNewDocNoDtD:
				1758	* @URI: URI for the dtd, or NULL
				1759	* @ExternalID: the external ID of the DTD, or NULL
				1760	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1761	* Creates a new HTML document without a DTD node if @URI and @ExternalID
				1762	* are NULL
				1763	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1764	* Returns a new document, do not initialize the DTD if not provided
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1765	*/
				1766	htmlDocPtr
				1767	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1768	xmlDocPtr cur;
				1769
				1770	/*
				1771	* Allocate a new document and fill the fields.
				1772	*/
				1773	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1774	if (cur == NULL) {
				1775	xmlGenericError(xmlGenericErrorContext,
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1776	"htmlNewDocNoDtD : malloc failed\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1777	return(NULL);
				1778	}
				1779	memset(cur, 0, sizeof(xmlDoc));
				1780
				1781	cur->type = XML_HTML_DOCUMENT_NODE;
				1782	cur->version = NULL;
				1783	cur->intSubset = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1784	cur->doc = cur;
				1785	cur->name = NULL;
				1786	cur->children = NULL;
				1787	cur->extSubset = NULL;
				1788	cur->oldNs = NULL;
				1789	cur->encoding = NULL;
				1790	cur->standalone = 1;
				1791	cur->compression = 0;
				1792	cur->ids = NULL;
				1793	cur->refs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1794	cur->_private = NULL;
Daniel Veillard	b6b0fd8	2001-10-22 12:31:11 +0000	[diff] [blame]	1795	if ((ExternalID != NULL) \|\|
				1796	(URI != NULL))
Daniel Veillard	5151c06	2001-10-23 13:10:19 +0000	[diff] [blame]	1797	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1798	return(cur);
				1799	}
				1800
				1801	/**
				1802	* htmlNewDoc:
				1803	* @URI: URI for the dtd, or NULL
				1804	* @ExternalID: the external ID of the DTD, or NULL
				1805	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1806	* Creates a new HTML document
				1807	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1808	* Returns a new document
				1809	*/
				1810	htmlDocPtr
				1811	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1812	if ((URI == NULL) && (ExternalID == NULL))
				1813	return(htmlNewDocNoDtD(
Daniel Veillard	6426935	2001-05-04 17:52:34 +0000	[diff] [blame]	1814	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				1815	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1816
				1817	return(htmlNewDocNoDtD(URI, ExternalID));
				1818	}
				1819
				1820
				1821	/************************************************************************
				1822	* *
				1823	* The parser itself *
				1824	* Relates to http://www.w3.org/TR/html40 *
				1825	* *
				1826	************************************************************************/
				1827
				1828	/************************************************************************
				1829	* *
				1830	* The parser itself *
				1831	* *
				1832	************************************************************************/
				1833
				1834	/**
				1835	* htmlParseHTMLName:
				1836	* @ctxt: an HTML parser context
				1837	*
				1838	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1839	* since HTML names are not case-sensitive.
				1840	*
				1841	* Returns the Tag Name parsed or NULL
				1842	*/
				1843
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1844	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1845	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1846	xmlChar *ret = NULL;
				1847	int i = 0;
				1848	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1849
				1850	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1851	(CUR != ':')) return(NULL);
				1852
				1853	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1854	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1855	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1856	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1857	else loc[i] = CUR;
				1858	i++;
				1859
				1860	NEXT;
				1861	}
				1862
				1863	ret = xmlStrndup(loc, i);
				1864
				1865	return(ret);
				1866	}
				1867
				1868	/**
				1869	* htmlParseName:
				1870	* @ctxt: an HTML parser context
				1871	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1872	* parse an HTML name, this routine is case sensitive.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1873	*
				1874	* Returns the Name parsed or NULL
				1875	*/
				1876
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1877	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1878	htmlParseName(htmlParserCtxtPtr ctxt) {
				1879	xmlChar buf[HTML_MAX_NAMELEN];
				1880	int len = 0;
				1881
				1882	GROW;
				1883	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1884	return(NULL);
				1885	}
				1886
				1887	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1888	(CUR == '.') \|\| (CUR == '-') \|\|
				1889	(CUR == '_') \|\| (CUR == ':') \|\|
				1890	(IS_COMBINING(CUR)) \|\|
				1891	(IS_EXTENDER(CUR))) {
				1892	buf[len++] = CUR;
				1893	NEXT;
				1894	if (len >= HTML_MAX_NAMELEN) {
				1895	xmlGenericError(xmlGenericErrorContext,
				1896	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1897	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1898	(CUR == '.') \|\| (CUR == '-') \|\|
				1899	(CUR == '_') \|\| (CUR == ':') \|\|
				1900	(IS_COMBINING(CUR)) \|\|
				1901	(IS_EXTENDER(CUR)))
				1902	NEXT;
				1903	break;
				1904	}
				1905	}
				1906	return(xmlStrndup(buf, len));
				1907	}
				1908
				1909	/**
				1910	* htmlParseHTMLAttribute:
				1911	* @ctxt: an HTML parser context
				1912	* @stop: a char stop value
				1913	*
				1914	* parse an HTML attribute value till the stop (quote), if
				1915	* stop is 0 then it stops at the first space
				1916	*
				1917	* Returns the attribute parsed or NULL
				1918	*/
				1919
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1920	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1921	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1922	xmlChar *buffer = NULL;
				1923	int buffer_size = 0;
				1924	xmlChar *out = NULL;
				1925	xmlChar *name = NULL;
				1926
				1927	xmlChar *cur = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1928	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1929
				1930	/*
				1931	* allocate a translation buffer.
				1932	*/
				1933	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1934	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1935	if (buffer == NULL) {
Daniel Veillard	3487c8d	2002-09-05 11:33:25 +0000	[diff] [blame]	1936	xmlGenericError(xmlGenericErrorContext,
				1937	"htmlParseHTMLAttribute: malloc failed\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1938	return(NULL);
				1939	}
				1940	out = buffer;
				1941
				1942	/*
				1943	* Ok loop until we reach one of the ending chars
				1944	*/
Daniel Veillard	957fdcf	2001-11-06 22:50:19 +0000	[diff] [blame]	1945	while ((CUR != 0) && (CUR != stop)) {
				1946	if ((stop == 0) && (CUR == '>')) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1947	if ((stop == 0) && (IS_BLANK(CUR))) break;
				1948	if (CUR == '&') {
				1949	if (NXT(1) == '#') {
				1950	unsigned int c;
				1951	int bits;
				1952
				1953	c = htmlParseCharRef(ctxt);
				1954	if (c < 0x80)
				1955	{ *out++ = c; bits= -6; }
				1956	else if (c < 0x800)
				1957	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1958	else if (c < 0x10000)
				1959	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1960	else
				1961	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1962
				1963	for ( ; bits >= 0; bits-= 6) {
				1964	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				1965	}
Daniel Veillard	ce02dbc	2002-10-22 19:14:58 +0000	[diff] [blame]	1966
				1967	if (out - buffer > buffer_size - 100) {
				1968	int indx = out - buffer;
				1969
				1970	growBuffer(buffer);
				1971	out = &buffer[indx];
				1972	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1973	} else {
				1974	ent = htmlParseEntityRef(ctxt, &name);
				1975	if (name == NULL) {
				1976	*out++ = '&';
				1977	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1978	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1979
				1980	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1981	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1982	}
				1983	} else if (ent == NULL) {
				1984	*out++ = '&';
				1985	cur = name;
				1986	while (*cur != 0) {
				1987	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1988	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1989
				1990	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1991	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1992	}
				1993	out++ = cur++;
				1994	}
				1995	xmlFree(name);
				1996	} else {
				1997	unsigned int c;
				1998	int bits;
				1999
				2000	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2001	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2002
				2003	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2004	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2005	}
				2006	c = (xmlChar)ent->value;
				2007	if (c < 0x80)
				2008	{ *out++ = c; bits= -6; }
				2009	else if (c < 0x800)
				2010	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2011	else if (c < 0x10000)
				2012	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2013	else
				2014	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2015
				2016	for ( ; bits >= 0; bits-= 6) {
				2017	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2018	}
				2019	xmlFree(name);
				2020	}
				2021	}
				2022	} else {
				2023	unsigned int c;
				2024	int bits, l;
				2025
				2026	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2027	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2028
				2029	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2030	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2031	}
				2032	c = CUR_CHAR(l);
				2033	if (c < 0x80)
				2034	{ *out++ = c; bits= -6; }
				2035	else if (c < 0x800)
				2036	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2037	else if (c < 0x10000)
				2038	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2039	else
				2040	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2041
				2042	for ( ; bits >= 0; bits-= 6) {
				2043	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2044	}
				2045	NEXT;
				2046	}
				2047	}
				2048	*out++ = 0;
				2049	return(buffer);
				2050	}
				2051
				2052	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2053	* htmlParseEntityRef:
				2054	* @ctxt: an HTML parser context
				2055	* @str: location to store the entity name
				2056	*
				2057	* parse an HTML ENTITY references
				2058	*
				2059	* [68] EntityRef ::= '&' Name ';'
				2060	*
				2061	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2062	* if non-NULL *str will have to be freed by the caller.
				2063	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2064	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2065	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2066	xmlChar *name;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2067	const htmlEntityDesc * ent = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2068	*str = NULL;
				2069
				2070	if (CUR == '&') {
				2071	NEXT;
				2072	name = htmlParseName(ctxt);
				2073	if (name == NULL) {
				2074	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2075	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2076	ctxt->wellFormed = 0;
				2077	} else {
				2078	GROW;
				2079	if (CUR == ';') {
				2080	*str = name;
				2081
				2082	/*
				2083	* Lookup the entity in the table.
				2084	*/
				2085	ent = htmlEntityLookup(name);
				2086	if (ent != NULL) /* OK that's ugly !!! */
				2087	NEXT;
				2088	} else {
				2089	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2090	ctxt->sax->error(ctxt->userData,
				2091	"htmlParseEntityRef: expecting ';'\n");
				2092	*str = name;
				2093	}
				2094	}
				2095	}
				2096	return(ent);
				2097	}
				2098
				2099	/**
				2100	* htmlParseAttValue:
				2101	* @ctxt: an HTML parser context
				2102	*
				2103	* parse a value for an attribute
				2104	* Note: the parser won't do substitution of entities here, this
				2105	* will be handled later in xmlStringGetNodeList, unless it was
				2106	* asked for ctxt->replaceEntities != 0
				2107	*
				2108	* Returns the AttValue parsed or NULL.
				2109	*/
				2110
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2111	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2112	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2113	xmlChar *ret = NULL;
				2114
				2115	if (CUR == '"') {
				2116	NEXT;
				2117	ret = htmlParseHTMLAttribute(ctxt, '"');
				2118	if (CUR != '"') {
				2119	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2120	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2121	ctxt->wellFormed = 0;
				2122	} else
				2123	NEXT;
				2124	} else if (CUR == '\'') {
				2125	NEXT;
				2126	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2127	if (CUR != '\'') {
				2128	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2129	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2130	ctxt->wellFormed = 0;
				2131	} else
				2132	NEXT;
				2133	} else {
				2134	/*
				2135	* That's an HTMLism, the attribute value may not be quoted
				2136	*/
				2137	ret = htmlParseHTMLAttribute(ctxt, 0);
				2138	if (ret == NULL) {
				2139	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2140	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2141	ctxt->wellFormed = 0;
				2142	}
				2143	}
				2144	return(ret);
				2145	}
				2146
				2147	/**
				2148	* htmlParseSystemLiteral:
				2149	* @ctxt: an HTML parser context
				2150	*
				2151	* parse an HTML Literal
				2152	*
				2153	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2154	*
				2155	* Returns the SystemLiteral parsed or NULL
				2156	*/
				2157
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2158	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2159	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2160	const xmlChar *q;
				2161	xmlChar *ret = NULL;
				2162
				2163	if (CUR == '"') {
				2164	NEXT;
				2165	q = CUR_PTR;
				2166	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2167	NEXT;
				2168	if (!IS_CHAR(CUR)) {
				2169	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2170	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2171	ctxt->wellFormed = 0;
				2172	} else {
				2173	ret = xmlStrndup(q, CUR_PTR - q);
				2174	NEXT;
				2175	}
				2176	} else if (CUR == '\'') {
				2177	NEXT;
				2178	q = CUR_PTR;
				2179	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2180	NEXT;
				2181	if (!IS_CHAR(CUR)) {
				2182	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2183	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2184	ctxt->wellFormed = 0;
				2185	} else {
				2186	ret = xmlStrndup(q, CUR_PTR - q);
				2187	NEXT;
				2188	}
				2189	} else {
				2190	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2191	ctxt->sax->error(ctxt->userData,
				2192	"SystemLiteral \" or ' expected\n");
				2193	ctxt->wellFormed = 0;
				2194	}
				2195
				2196	return(ret);
				2197	}
				2198
				2199	/**
				2200	* htmlParsePubidLiteral:
				2201	* @ctxt: an HTML parser context
				2202	*
				2203	* parse an HTML public literal
				2204	*
				2205	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2206	*
				2207	* Returns the PubidLiteral parsed or NULL.
				2208	*/
				2209
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2210	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2211	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2212	const xmlChar *q;
				2213	xmlChar *ret = NULL;
				2214	/*
				2215	* Name ::= (Letter \| '_') (NameChar)*
				2216	*/
				2217	if (CUR == '"') {
				2218	NEXT;
				2219	q = CUR_PTR;
				2220	while (IS_PUBIDCHAR(CUR)) NEXT;
				2221	if (CUR != '"') {
				2222	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2223	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2224	ctxt->wellFormed = 0;
				2225	} else {
				2226	ret = xmlStrndup(q, CUR_PTR - q);
				2227	NEXT;
				2228	}
				2229	} else if (CUR == '\'') {
				2230	NEXT;
				2231	q = CUR_PTR;
				2232	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2233	NEXT;
				2234	if (!IS_LETTER(CUR)) {
				2235	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2236	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2237	ctxt->wellFormed = 0;
				2238	} else {
				2239	ret = xmlStrndup(q, CUR_PTR - q);
				2240	NEXT;
				2241	}
				2242	} else {
				2243	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2244	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2245	ctxt->wellFormed = 0;
				2246	}
				2247
				2248	return(ret);
				2249	}
				2250
				2251	/**
				2252	* htmlParseScript:
				2253	* @ctxt: an HTML parser context
				2254	*
				2255	* parse the content of an HTML SCRIPT or STYLE element
				2256	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2257	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2258	* http://www.w3.org/TR/html4/types.html#type-script
				2259	* http://www.w3.org/TR/html4/types.html#h-6.15
				2260	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2261	*
				2262	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2263	* element and the value of intrinsic event attributes. User agents must
				2264	* not evaluate script data as HTML markup but instead must pass it on as
				2265	* data to a script engine.
				2266	* NOTES:
				2267	* - The content is passed like CDATA
				2268	* - the attributes for style and scripting "onXXX" are also described
				2269	* as CDATA but SGML allows entities references in attributes so their
				2270	* processing is identical as other attributes
				2271	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2272	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2273	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2274	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2275	int nbchar = 0;
				2276	xmlChar cur;
				2277
				2278	SHRINK;
				2279	cur = CUR;
				2280	while (IS_CHAR(cur)) {
Daniel Veillard	c1f7834	2001-11-10 11:43:05 +0000	[diff] [blame]	2281	if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
				2282	(NXT(3) == '-')) {
				2283	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2284	if (ctxt->sax->cdataBlock!= NULL) {
				2285	/*
				2286	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2287	*/
				2288	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2289	}
				2290	}
				2291	nbchar = 0;
				2292	htmlParseComment(ctxt);
				2293	cur = CUR;
				2294	continue;
				2295	} else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2296	/*
				2297	* One should break here, the specification is clear:
				2298	* Authors should therefore escape "</" within the content.
				2299	* Escape mechanisms are specific to each scripting or
				2300	* style sheet language.
				2301	*/
				2302	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2303	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2304	break; /* while */
				2305	}
				2306	buf[nbchar++] = cur;
				2307	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2308	if (ctxt->sax->cdataBlock!= NULL) {
				2309	/*
				2310	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2311	*/
				2312	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2313	}
				2314	nbchar = 0;
				2315	}
				2316	NEXT;
				2317	cur = CUR;
				2318	}
				2319	if (!(IS_CHAR(cur))) {
				2320	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2321	ctxt->sax->error(ctxt->userData,
				2322	"Invalid char in CDATA 0x%X\n", cur);
				2323	ctxt->wellFormed = 0;
				2324	NEXT;
				2325	}
				2326
				2327	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2328	if (ctxt->sax->cdataBlock!= NULL) {
				2329	/*
				2330	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2331	*/
				2332	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2333	}
				2334	}
				2335	}
				2336
				2337
				2338	/**
				2339	* htmlParseCharData:
				2340	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2341	*
				2342	* parse a CharData section.
				2343	* if we are within a CDATA section ']]>' marks an end of section.
				2344	*
				2345	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2346	*/
				2347
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2348	static void
				2349	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2350	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2351	int nbchar = 0;
				2352	int cur, l;
				2353
				2354	SHRINK;
				2355	cur = CUR_CHAR(l);
				2356	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2357	((cur != '&') \|\| (ctxt->token == '&')) &&
				2358	(IS_CHAR(cur))) {
				2359	COPY_BUF(l,buf,nbchar,cur);
				2360	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2361	/*
				2362	* Ok the segment is to be consumed as chars.
				2363	*/
				2364	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2365	if (areBlanks(ctxt, buf, nbchar)) {
				2366	if (ctxt->sax->ignorableWhitespace != NULL)
				2367	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2368	buf, nbchar);
				2369	} else {
				2370	htmlCheckParagraph(ctxt);
				2371	if (ctxt->sax->characters != NULL)
				2372	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2373	}
				2374	}
				2375	nbchar = 0;
				2376	}
				2377	NEXTL(l);
				2378	cur = CUR_CHAR(l);
				2379	}
				2380	if (nbchar != 0) {
				2381	/*
				2382	* Ok the segment is to be consumed as chars.
				2383	*/
				2384	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2385	if (areBlanks(ctxt, buf, nbchar)) {
				2386	if (ctxt->sax->ignorableWhitespace != NULL)
				2387	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2388	} else {
				2389	htmlCheckParagraph(ctxt);
				2390	if (ctxt->sax->characters != NULL)
				2391	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2392	}
				2393	}
Daniel Veillard	7cc95c0	2001-10-17 15:45:12 +0000	[diff] [blame]	2394	} else {
				2395	/*
				2396	* Loop detection
				2397	*/
				2398	if (cur == 0)
				2399	ctxt->instate = XML_PARSER_EOF;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2400	}
				2401	}
				2402
				2403	/**
				2404	* htmlParseExternalID:
				2405	* @ctxt: an HTML parser context
				2406	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2407	*
				2408	* Parse an External ID or a Public ID
				2409	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2410	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2411	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2412	*
				2413	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2414	*
				2415	* Returns the function returns SystemLiteral and in the second
				2416	* case publicID receives PubidLiteral, is strict is off
				2417	* it is possible to return NULL and have publicID set.
				2418	*/
				2419
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2420	static xmlChar *
				2421	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2422	xmlChar *URI = NULL;
				2423
				2424	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2425	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2426	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2427	SKIP(6);
				2428	if (!IS_BLANK(CUR)) {
				2429	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2430	ctxt->sax->error(ctxt->userData,
				2431	"Space required after 'SYSTEM'\n");
				2432	ctxt->wellFormed = 0;
				2433	}
				2434	SKIP_BLANKS;
				2435	URI = htmlParseSystemLiteral(ctxt);
				2436	if (URI == NULL) {
				2437	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2438	ctxt->sax->error(ctxt->userData,
				2439	"htmlParseExternalID: SYSTEM, no URI\n");
				2440	ctxt->wellFormed = 0;
				2441	}
				2442	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2443	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2444	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2445	SKIP(6);
				2446	if (!IS_BLANK(CUR)) {
				2447	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2448	ctxt->sax->error(ctxt->userData,
				2449	"Space required after 'PUBLIC'\n");
				2450	ctxt->wellFormed = 0;
				2451	}
				2452	SKIP_BLANKS;
				2453	*publicID = htmlParsePubidLiteral(ctxt);
				2454	if (*publicID == NULL) {
				2455	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2456	ctxt->sax->error(ctxt->userData,
				2457	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2458	ctxt->wellFormed = 0;
				2459	}
				2460	SKIP_BLANKS;
				2461	if ((CUR == '"') \|\| (CUR == '\'')) {
				2462	URI = htmlParseSystemLiteral(ctxt);
				2463	}
				2464	}
				2465	return(URI);
				2466	}
				2467
				2468	/**
				2469	* htmlParseComment:
				2470	* @ctxt: an HTML parser context
				2471	*
				2472	* Parse an XML (SGML) comment <!-- .... -->
				2473	*
				2474	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2475	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2476	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2477	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2478	xmlChar *buf = NULL;
				2479	int len;
				2480	int size = HTML_PARSER_BUFFER_SIZE;
				2481	int q, ql;
				2482	int r, rl;
				2483	int cur, l;
				2484	xmlParserInputState state;
				2485
				2486	/*
				2487	* Check that there is a comment right here.
				2488	*/
				2489	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2490	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2491
				2492	state = ctxt->instate;
				2493	ctxt->instate = XML_PARSER_COMMENT;
				2494	SHRINK;
				2495	SKIP(4);
				2496	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2497	if (buf == NULL) {
				2498	xmlGenericError(xmlGenericErrorContext,
				2499	"malloc of %d byte failed\n", size);
				2500	ctxt->instate = state;
				2501	return;
				2502	}
				2503	q = CUR_CHAR(ql);
				2504	NEXTL(ql);
				2505	r = CUR_CHAR(rl);
				2506	NEXTL(rl);
				2507	cur = CUR_CHAR(l);
				2508	len = 0;
				2509	while (IS_CHAR(cur) &&
				2510	((cur != '>') \|\|
				2511	(r != '-') \|\| (q != '-'))) {
				2512	if (len + 5 >= size) {
				2513	size *= 2;
				2514	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2515	if (buf == NULL) {
				2516	xmlGenericError(xmlGenericErrorContext,
				2517	"realloc of %d byte failed\n", size);
				2518	ctxt->instate = state;
				2519	return;
				2520	}
				2521	}
				2522	COPY_BUF(ql,buf,len,q);
				2523	q = r;
				2524	ql = rl;
				2525	r = cur;
				2526	rl = l;
				2527	NEXTL(l);
				2528	cur = CUR_CHAR(l);
				2529	if (cur == 0) {
				2530	SHRINK;
				2531	GROW;
				2532	cur = CUR_CHAR(l);
				2533	}
				2534	}
				2535	buf[len] = 0;
				2536	if (!IS_CHAR(cur)) {
				2537	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2538	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2539	ctxt->sax->error(ctxt->userData,
				2540	"Comment not terminated \n<!--%.50s\n", buf);
				2541	ctxt->wellFormed = 0;
				2542	xmlFree(buf);
				2543	} else {
				2544	NEXT;
				2545	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2546	(!ctxt->disableSAX))
				2547	ctxt->sax->comment(ctxt->userData, buf);
				2548	xmlFree(buf);
				2549	}
				2550	ctxt->instate = state;
				2551	}
				2552
				2553	/**
				2554	* htmlParseCharRef:
				2555	* @ctxt: an HTML parser context
				2556	*
				2557	* parse Reference declarations
				2558	*
				2559	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2560	* '&#x' [0-9a-fA-F]+ ';'
				2561	*
				2562	* Returns the value parsed (as an int)
				2563	*/
				2564	int
				2565	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2566	int val = 0;
				2567
				2568	if ((CUR == '&') && (NXT(1) == '#') &&
				2569	(NXT(2) == 'x')) {
				2570	SKIP(3);
				2571	while (CUR != ';') {
				2572	if ((CUR >= '0') && (CUR <= '9'))
				2573	val = val * 16 + (CUR - '0');
				2574	else if ((CUR >= 'a') && (CUR <= 'f'))
				2575	val = val * 16 + (CUR - 'a') + 10;
				2576	else if ((CUR >= 'A') && (CUR <= 'F'))
				2577	val = val * 16 + (CUR - 'A') + 10;
				2578	else {
				2579	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2580	ctxt->sax->error(ctxt->userData,
				2581	"htmlParseCharRef: invalid hexadecimal value\n");
				2582	ctxt->wellFormed = 0;
				2583	return(0);
				2584	}
				2585	NEXT;
				2586	}
				2587	if (CUR == ';')
				2588	NEXT;
				2589	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2590	SKIP(2);
				2591	while (CUR != ';') {
				2592	if ((CUR >= '0') && (CUR <= '9'))
				2593	val = val * 10 + (CUR - '0');
				2594	else {
				2595	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2596	ctxt->sax->error(ctxt->userData,
				2597	"htmlParseCharRef: invalid decimal value\n");
				2598	ctxt->wellFormed = 0;
				2599	return(0);
				2600	}
				2601	NEXT;
				2602	}
				2603	if (CUR == ';')
				2604	NEXT;
				2605	} else {
				2606	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2607	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2608	ctxt->wellFormed = 0;
				2609	}
				2610	/*
				2611	* Check the value IS_CHAR ...
				2612	*/
				2613	if (IS_CHAR(val)) {
				2614	return(val);
				2615	} else {
				2616	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2617	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2618	val);
				2619	ctxt->wellFormed = 0;
				2620	}
				2621	return(0);
				2622	}
				2623
				2624
				2625	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame^]	2626	* htmlParseDocTypeDecl:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2627	* @ctxt: an HTML parser context
				2628	*
				2629	* parse a DOCTYPE declaration
				2630	*
				2631	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2632	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2633	*/
				2634
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2635	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2636	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2637	xmlChar *name;
				2638	xmlChar *ExternalID = NULL;
				2639	xmlChar *URI = NULL;
				2640
				2641	/*
				2642	* We know that '<!DOCTYPE' has been detected.
				2643	*/
				2644	SKIP(9);
				2645
				2646	SKIP_BLANKS;
				2647
				2648	/*
				2649	* Parse the DOCTYPE name.
				2650	*/
				2651	name = htmlParseName(ctxt);
				2652	if (name == NULL) {
				2653	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2654	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2655	ctxt->wellFormed = 0;
				2656	}
				2657	/*
				2658	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2659	*/
				2660
				2661	SKIP_BLANKS;
				2662
				2663	/*
				2664	* Check for SystemID and ExternalID
				2665	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2666	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2667	SKIP_BLANKS;
				2668
				2669	/*
				2670	* We should be at the end of the DOCTYPE declaration.
				2671	*/
				2672	if (CUR != '>') {
				2673	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard	f6ed8bc	2001-10-02 09:22:47 +0000	[diff] [blame]	2674	ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2675	ctxt->wellFormed = 0;
				2676	/* We shouldn't try to resynchronize ... */
				2677	}
				2678	NEXT;
				2679
				2680	/*
				2681	* Create or update the document accordingly to the DOCTYPE
				2682	*/
				2683	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2684	(!ctxt->disableSAX))
				2685	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2686
				2687	/*
				2688	* Cleanup, since we don't use all those identifiers
				2689	*/
				2690	if (URI != NULL) xmlFree(URI);
				2691	if (ExternalID != NULL) xmlFree(ExternalID);
				2692	if (name != NULL) xmlFree(name);
				2693	}
				2694
				2695	/**
				2696	* htmlParseAttribute:
				2697	* @ctxt: an HTML parser context
				2698	* @value: a xmlChar ** used to store the value of the attribute
				2699	*
				2700	* parse an attribute
				2701	*
				2702	* [41] Attribute ::= Name Eq AttValue
				2703	*
				2704	* [25] Eq ::= S? '=' S?
				2705	*
				2706	* With namespace:
				2707	*
				2708	* [NS 11] Attribute ::= QName Eq AttValue
				2709	*
				2710	* Also the case QName == xmlns:??? is handled independently as a namespace
				2711	* definition.
				2712	*
				2713	* Returns the attribute name, and the value in *value.
				2714	*/
				2715
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2716	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2717	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2718	xmlChar name, val = NULL;
				2719
				2720	*value = NULL;
				2721	name = htmlParseHTMLName(ctxt);
				2722	if (name == NULL) {
				2723	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2724	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2725	ctxt->wellFormed = 0;
				2726	return(NULL);
				2727	}
				2728
				2729	/*
				2730	* read the value
				2731	*/
				2732	SKIP_BLANKS;
				2733	if (CUR == '=') {
				2734	NEXT;
				2735	SKIP_BLANKS;
				2736	val = htmlParseAttValue(ctxt);
				2737	/******
				2738	} else {
				2739	* TODO : some attribute must have values, some may not
				2740	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2741	ctxt->sax->warning(ctxt->userData,
				2742	"No value for attribute %s\n", name); */
				2743	}
				2744
				2745	*value = val;
				2746	return(name);
				2747	}
				2748
				2749	/**
				2750	* htmlCheckEncoding:
				2751	* @ctxt: an HTML parser context
				2752	* @attvalue: the attribute value
				2753	*
				2754	* Checks an http-equiv attribute from a Meta tag to detect
				2755	* the encoding
				2756	* If a new encoding is detected the parser is switched to decode
				2757	* it and pass UTF8
				2758	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2759	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2760	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2761	const xmlChar *encoding;
				2762
				2763	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2764	return;
				2765
				2766	/* do not change encoding */
				2767	if (ctxt->input->encoding != NULL)
				2768	return;
				2769
				2770	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2771	if (encoding != NULL) {
				2772	encoding += 8;
				2773	} else {
				2774	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2775	if (encoding != NULL)
				2776	encoding += 9;
				2777	}
				2778	if (encoding != NULL) {
				2779	xmlCharEncoding enc;
				2780	xmlCharEncodingHandlerPtr handler;
				2781
				2782	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2783
				2784	if (ctxt->input->encoding != NULL)
				2785	xmlFree((xmlChar *) ctxt->input->encoding);
				2786	ctxt->input->encoding = xmlStrdup(encoding);
				2787
				2788	enc = xmlParseCharEncoding((const char *) encoding);
				2789	/*
				2790	* registered set of known encodings
				2791	*/
				2792	if (enc != XML_CHAR_ENCODING_ERROR) {
				2793	xmlSwitchEncoding(ctxt, enc);
				2794	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2795	} else {
				2796	/*
				2797	* fallback for unknown encodings
				2798	*/
				2799	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2800	if (handler != NULL) {
				2801	xmlSwitchToEncoding(ctxt, handler);
				2802	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2803	} else {
				2804	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2805	}
				2806	}
				2807
				2808	if ((ctxt->input->buf != NULL) &&
				2809	(ctxt->input->buf->encoder != NULL) &&
				2810	(ctxt->input->buf->raw != NULL) &&
				2811	(ctxt->input->buf->buffer != NULL)) {
				2812	int nbchars;
				2813	int processed;
				2814
				2815	/*
				2816	* convert as much as possible to the parser reading buffer.
				2817	*/
				2818	processed = ctxt->input->cur - ctxt->input->base;
				2819	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2820	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2821	ctxt->input->buf->buffer,
				2822	ctxt->input->buf->raw);
				2823	if (nbchars < 0) {
				2824	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2825	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2826	ctxt->sax->error(ctxt->userData,
				2827	"htmlCheckEncoding: encoder error\n");
				2828	}
				2829	ctxt->input->base =
				2830	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2831	}
				2832	}
				2833	}
				2834
				2835	/**
				2836	* htmlCheckMeta:
				2837	* @ctxt: an HTML parser context
				2838	* @atts: the attributes values
				2839	*
				2840	* Checks an attributes from a Meta tag
				2841	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2842	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2843	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2844	int i;
				2845	const xmlChar att, value;
				2846	int http = 0;
				2847	const xmlChar *content = NULL;
				2848
				2849	if ((ctxt == NULL) \|\| (atts == NULL))
				2850	return;
				2851
				2852	i = 0;
				2853	att = atts[i++];
				2854	while (att != NULL) {
				2855	value = atts[i++];
				2856	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2857	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2858	http = 1;
				2859	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2860	content = value;
				2861	att = atts[i++];
				2862	}
				2863	if ((http) && (content != NULL))
				2864	htmlCheckEncoding(ctxt, content);
				2865
				2866	}
				2867
				2868	/**
				2869	* htmlParseStartTag:
				2870	* @ctxt: an HTML parser context
				2871	*
				2872	* parse a start of tag either for rule element or
				2873	* EmptyElement. In both case we don't parse the tag closing chars.
				2874	*
				2875	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2876	*
				2877	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2878	*
				2879	* With namespace:
				2880	*
				2881	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2882	*
				2883	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2884	*
				2885	*/
				2886
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2887	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2888	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2889	xmlChar *name;
				2890	xmlChar *attname;
				2891	xmlChar *attvalue;
				2892	const xmlChar **atts = NULL;
				2893	int nbatts = 0;
				2894	int maxatts = 0;
				2895	int meta = 0;
				2896	int i;
				2897
				2898	if (CUR != '<') return;
				2899	NEXT;
				2900
				2901	GROW;
				2902	name = htmlParseHTMLName(ctxt);
				2903	if (name == NULL) {
				2904	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2905	ctxt->sax->error(ctxt->userData,
				2906	"htmlParseStartTag: invalid element name\n");
				2907	ctxt->wellFormed = 0;
				2908	/* Dump the bogus tag like browsers do */
				2909	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2910	NEXT;
				2911	return;
				2912	}
				2913	if (xmlStrEqual(name, BAD_CAST"meta"))
				2914	meta = 1;
				2915
				2916	/*
				2917	* Check for auto-closure of HTML elements.
				2918	*/
				2919	htmlAutoClose(ctxt, name);
				2920
				2921	/*
				2922	* Check for implied HTML elements.
				2923	*/
				2924	htmlCheckImplied(ctxt, name);
				2925
				2926	/*
				2927	* Avoid html at any level > 0, head at any level != 1
				2928	* or any attempt to recurse body
				2929	*/
				2930	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2931	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2932	ctxt->sax->error(ctxt->userData,
				2933	"htmlParseStartTag: misplaced <html> tag\n");
				2934	ctxt->wellFormed = 0;
				2935	xmlFree(name);
				2936	return;
				2937	}
				2938	if ((ctxt->nameNr != 1) &&
				2939	(xmlStrEqual(name, BAD_CAST"head"))) {
				2940	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2941	ctxt->sax->error(ctxt->userData,
				2942	"htmlParseStartTag: misplaced <head> tag\n");
				2943	ctxt->wellFormed = 0;
				2944	xmlFree(name);
				2945	return;
				2946	}
				2947	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2948	int indx;
				2949	for (indx = 0;indx < ctxt->nameNr;indx++) {
				2950	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2951	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2952	ctxt->sax->error(ctxt->userData,
				2953	"htmlParseStartTag: misplaced <body> tag\n");
				2954	ctxt->wellFormed = 0;
				2955	xmlFree(name);
				2956	return;
				2957	}
				2958	}
				2959	}
				2960
				2961	/*
				2962	* Now parse the attributes, it ends up with the ending
				2963	*
				2964	* (S Attribute)* S?
				2965	*/
				2966	SKIP_BLANKS;
				2967	while ((IS_CHAR(CUR)) &&
				2968	(CUR != '>') &&
				2969	((CUR != '/') \|\| (NXT(1) != '>'))) {
				2970	long cons = ctxt->nbChars;
				2971
				2972	GROW;
				2973	attname = htmlParseAttribute(ctxt, &attvalue);
				2974	if (attname != NULL) {
				2975
				2976	/*
				2977	* Well formedness requires at most one declaration of an attribute
				2978	*/
				2979	for (i = 0; i < nbatts;i += 2) {
				2980	if (xmlStrEqual(atts[i], attname)) {
				2981	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2982	ctxt->sax->error(ctxt->userData,
				2983	"Attribute %s redefined\n",
				2984	attname);
				2985	ctxt->wellFormed = 0;
				2986	xmlFree(attname);
				2987	if (attvalue != NULL)
				2988	xmlFree(attvalue);
				2989	goto failed;
				2990	}
				2991	}
				2992
				2993	/*
				2994	* Add the pair to atts
				2995	*/
				2996	if (atts == NULL) {
				2997	maxatts = 10;
				2998	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				2999	if (atts == NULL) {
				3000	xmlGenericError(xmlGenericErrorContext,
				3001	"malloc of %ld byte failed\n",
				3002	maxatts * (long)sizeof(xmlChar *));
				3003	if (name != NULL) xmlFree(name);
				3004	return;
				3005	}
				3006	} else if (nbatts + 4 > maxatts) {
				3007	maxatts *= 2;
				3008	atts = (const xmlChar *) xmlRealloc((void ) atts,
				3009	maxatts * sizeof(xmlChar *));
				3010	if (atts == NULL) {
				3011	xmlGenericError(xmlGenericErrorContext,
				3012	"realloc of %ld byte failed\n",
				3013	maxatts * (long)sizeof(xmlChar *));
				3014	if (name != NULL) xmlFree(name);
				3015	return;
				3016	}
				3017	}
				3018	atts[nbatts++] = attname;
				3019	atts[nbatts++] = attvalue;
				3020	atts[nbatts] = NULL;
				3021	atts[nbatts + 1] = NULL;
				3022	}
				3023	else {
				3024	/* Dump the bogus attribute string up to the next blank or
				3025	* the end of the tag. */
Daniel Veillard	561b7f8	2002-03-20 21:55:57 +0000	[diff] [blame]	3026	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				3027	&& ((CUR != '/') \|\| (NXT(1) != '>')))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3028	NEXT;
				3029	}
				3030
				3031	failed:
				3032	SKIP_BLANKS;
				3033	if (cons == ctxt->nbChars) {
				3034	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3035	ctxt->sax->error(ctxt->userData,
				3036	"htmlParseStartTag: problem parsing attributes\n");
				3037	ctxt->wellFormed = 0;
				3038	break;
				3039	}
				3040	}
				3041
				3042	/*
				3043	* Handle specific association to the META tag
				3044	*/
				3045	if (meta)
				3046	htmlCheckMeta(ctxt, atts);
				3047
				3048	/*
				3049	* SAX: Start of Element !
				3050	*/
				3051	htmlnamePush(ctxt, xmlStrdup(name));
				3052	#ifdef DEBUG
				3053	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3054	#endif
				3055	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3056	ctxt->sax->startElement(ctxt->userData, name, atts);
				3057
				3058	if (atts != NULL) {
				3059	for (i = 0;i < nbatts;i++) {
				3060	if (atts[i] != NULL)
				3061	xmlFree((xmlChar *) atts[i]);
				3062	}
				3063	xmlFree((void *) atts);
				3064	}
				3065	if (name != NULL) xmlFree(name);
				3066	}
				3067
				3068	/**
				3069	* htmlParseEndTag:
				3070	* @ctxt: an HTML parser context
				3071	*
				3072	* parse an end of tag
				3073	*
				3074	* [42] ETag ::= '</' Name S? '>'
				3075	*
				3076	* With namespace
				3077	*
				3078	* [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3079	*
				3080	* Returns 1 if the current level should be closed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3081	*/
				3082
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3083	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3084	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3085	xmlChar *name;
				3086	xmlChar *oldname;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3087	int i, ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3088
				3089	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3090	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3091	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3092	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3093	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3094	}
				3095	SKIP(2);
				3096
				3097	name = htmlParseHTMLName(ctxt);
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3098	if (name == NULL) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3099
				3100	/*
				3101	* We should definitely be at the ending "S? '>'" part
				3102	*/
				3103	SKIP_BLANKS;
				3104	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3105	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3106	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3107	ctxt->wellFormed = 0;
				3108	} else
				3109	NEXT;
				3110
				3111	/*
				3112	* If the name read is not one of the element in the parsing stack
				3113	* then return, it's just an error.
				3114	*/
				3115	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3116	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3117	}
				3118	if (i < 0) {
				3119	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3120	ctxt->sax->error(ctxt->userData,
				3121	"Unexpected end tag : %s\n", name);
				3122	xmlFree(name);
				3123	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3124	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3125	}
				3126
				3127
				3128	/*
				3129	* Check for auto-closure of HTML elements.
				3130	*/
				3131
				3132	htmlAutoCloseOnClose(ctxt, name);
				3133
				3134	/*
				3135	* Well formedness constraints, opening and closing must match.
				3136	* With the exception that the autoclose may have popped stuff out
				3137	* of the stack.
				3138	*/
				3139	if (!xmlStrEqual(name, ctxt->name)) {
				3140	#ifdef DEBUG
				3141	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3142	#endif
				3143	if ((ctxt->name != NULL) &&
				3144	(!xmlStrEqual(ctxt->name, name))) {
				3145	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3146	ctxt->sax->error(ctxt->userData,
				3147	"Opening and ending tag mismatch: %s and %s\n",
				3148	name, ctxt->name);
				3149	ctxt->wellFormed = 0;
				3150	}
				3151	}
				3152
				3153	/*
				3154	* SAX: End of Tag
				3155	*/
				3156	oldname = ctxt->name;
				3157	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3158	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3159	ctxt->sax->endElement(ctxt->userData, name);
				3160	oldname = htmlnamePop(ctxt);
				3161	if (oldname != NULL) {
				3162	#ifdef DEBUG
				3163	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3164	#endif
				3165	xmlFree(oldname);
				3166	#ifdef DEBUG
				3167	} else {
				3168	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3169	#endif
				3170	}
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3171	ret = 1;
				3172	} else {
				3173	ret = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3174	}
				3175
				3176	if (name != NULL)
				3177	xmlFree(name);
				3178
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3179	return(ret);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3180	}
				3181
				3182
				3183	/**
				3184	* htmlParseReference:
				3185	* @ctxt: an HTML parser context
				3186	*
				3187	* parse and handle entity references in content,
				3188	* this will end-up in a call to character() since this is either a
				3189	* CharRef, or a predefined entity.
				3190	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3191	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3192	htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	3193	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3194	xmlChar out[6];
				3195	xmlChar *name;
				3196	if (CUR != '&') return;
				3197
				3198	if (NXT(1) == '#') {
				3199	unsigned int c;
				3200	int bits, i = 0;
				3201
				3202	c = htmlParseCharRef(ctxt);
				3203	if (c == 0)
				3204	return;
				3205
				3206	if (c < 0x80) { out[i++]= c; bits= -6; }
				3207	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3208	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3209	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3210
				3211	for ( ; bits >= 0; bits-= 6) {
				3212	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3213	}
				3214	out[i] = 0;
				3215
				3216	htmlCheckParagraph(ctxt);
				3217	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3218	ctxt->sax->characters(ctxt->userData, out, i);
				3219	} else {
				3220	ent = htmlParseEntityRef(ctxt, &name);
				3221	if (name == NULL) {
				3222	htmlCheckParagraph(ctxt);
				3223	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3224	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3225	return;
				3226	}
Daniel Veillard	e645e8c	2002-10-22 17:35:37 +0000	[diff] [blame]	3227	if ((ent == NULL) \|\| !(ent->value > 0)) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3228	htmlCheckParagraph(ctxt);
				3229	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3230	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3231	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3232	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3233	}
				3234	} else {
				3235	unsigned int c;
				3236	int bits, i = 0;
				3237
				3238	c = ent->value;
				3239	if (c < 0x80)
				3240	{ out[i++]= c; bits= -6; }
				3241	else if (c < 0x800)
				3242	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3243	else if (c < 0x10000)
				3244	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3245	else
				3246	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3247
				3248	for ( ; bits >= 0; bits-= 6) {
				3249	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3250	}
				3251	out[i] = 0;
				3252
				3253	htmlCheckParagraph(ctxt);
				3254	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3255	ctxt->sax->characters(ctxt->userData, out, i);
				3256	}
				3257	xmlFree(name);
				3258	}
				3259	}
				3260
				3261	/**
				3262	* htmlParseContent:
				3263	* @ctxt: an HTML parser context
				3264	* @name: the node name
				3265	*
				3266	* Parse a content: comment, sub-element, reference or text.
				3267	*
				3268	*/
				3269
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3270	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3271	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3272	xmlChar *currentNode;
				3273	int depth;
				3274
				3275	currentNode = xmlStrdup(ctxt->name);
				3276	depth = ctxt->nameNr;
				3277	while (1) {
				3278	long cons = ctxt->nbChars;
				3279
				3280	GROW;
				3281	/*
				3282	* Our tag or one of it's parent or children is ending.
				3283	*/
				3284	if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3285	if (htmlParseEndTag(ctxt) &&
				3286	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				3287	if (currentNode != NULL)
				3288	xmlFree(currentNode);
				3289	return;
				3290	}
				3291	continue; /* while */
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3292	}
				3293
				3294	/*
				3295	* Has this node been popped out during parsing of
				3296	* the next element
				3297	*/
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3298	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				3299	(!xmlStrEqual(currentNode, ctxt->name)))
				3300	{
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3301	if (currentNode != NULL) xmlFree(currentNode);
				3302	return;
				3303	}
				3304
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3305	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3306	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3307	/*
				3308	* Handle SCRIPT/STYLE separately
				3309	*/
				3310	htmlParseScript(ctxt);
				3311	} else {
				3312	/*
				3313	* Sometimes DOCTYPE arrives in the middle of the document
				3314	*/
				3315	if ((CUR == '<') && (NXT(1) == '!') &&
				3316	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3317	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3318	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3319	(UPP(8) == 'E')) {
				3320	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3321	ctxt->sax->error(ctxt->userData,
				3322	"Misplaced DOCTYPE declaration\n");
				3323	ctxt->wellFormed = 0;
				3324	htmlParseDocTypeDecl(ctxt);
				3325	}
				3326
				3327	/*
				3328	* First case : a comment
				3329	*/
				3330	if ((CUR == '<') && (NXT(1) == '!') &&
				3331	(NXT(2) == '-') && (NXT(3) == '-')) {
				3332	htmlParseComment(ctxt);
				3333	}
				3334
				3335	/*
				3336	* Second case : a sub-element.
				3337	*/
				3338	else if (CUR == '<') {
				3339	htmlParseElement(ctxt);
				3340	}
				3341
				3342	/*
				3343	* Third case : a reference. If if has not been resolved,
				3344	* parsing returns it's Name, create the node
				3345	*/
				3346	else if (CUR == '&') {
				3347	htmlParseReference(ctxt);
				3348	}
				3349
				3350	/*
				3351	* Fourth : end of the resource
				3352	*/
				3353	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3354	htmlAutoCloseOnEnd(ctxt);
				3355	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3356	}
				3357
				3358	/*
				3359	* Last case, text. Note that References are handled directly.
				3360	*/
				3361	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3362	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3363	}
				3364
				3365	if (cons == ctxt->nbChars) {
				3366	if (ctxt->node != NULL) {
				3367	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3368	ctxt->sax->error(ctxt->userData,
				3369	"detected an error in element content\n");
				3370	ctxt->wellFormed = 0;
				3371	}
				3372	break;
				3373	}
				3374	}
				3375	GROW;
				3376	}
				3377	if (currentNode != NULL) xmlFree(currentNode);
				3378	}
				3379
				3380	/**
				3381	* htmlParseElement:
				3382	* @ctxt: an HTML parser context
				3383	*
				3384	* parse an HTML element, this is highly recursive
				3385	*
				3386	* [39] element ::= EmptyElemTag \| STag content ETag
				3387	*
				3388	* [41] Attribute ::= Name Eq AttValue
				3389	*/
				3390
				3391	void
				3392	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3393	xmlChar *name;
				3394	xmlChar *currentNode = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	3395	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3396	htmlParserNodeInfo node_info;
				3397	xmlChar *oldname;
				3398	int depth = ctxt->nameNr;
Daniel Veillard	3fbe8e3	2001-10-06 13:30:33 +0000	[diff] [blame]	3399	const xmlChar *oldptr;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3400
				3401	/* Capture start position */
				3402	if (ctxt->record_info) {
				3403	node_info.begin_pos = ctxt->input->consumed +
				3404	(CUR_PTR - ctxt->input->base);
				3405	node_info.begin_line = ctxt->input->line;
				3406	}
				3407
				3408	oldname = xmlStrdup(ctxt->name);
				3409	htmlParseStartTag(ctxt);
				3410	name = ctxt->name;
				3411	#ifdef DEBUG
				3412	if (oldname == NULL)
				3413	xmlGenericError(xmlGenericErrorContext,
				3414	"Start of element %s\n", name);
				3415	else if (name == NULL)
				3416	xmlGenericError(xmlGenericErrorContext,
				3417	"Start of element failed, was %s\n", oldname);
				3418	else
				3419	xmlGenericError(xmlGenericErrorContext,
				3420	"Start of element %s, was %s\n", name, oldname);
				3421	#endif
				3422	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3423	(name == NULL)) {
				3424	if (CUR == '>')
				3425	NEXT;
				3426	if (oldname != NULL)
				3427	xmlFree(oldname);
				3428	return;
				3429	}
				3430	if (oldname != NULL)
				3431	xmlFree(oldname);
				3432
				3433	/*
				3434	* Lookup the info for that element.
				3435	*/
				3436	info = htmlTagLookup(name);
				3437	if (info == NULL) {
				3438	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3439	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3440	name);
				3441	ctxt->wellFormed = 0;
				3442	} else if (info->depr) {
				3443	/***************************
				3444	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3445	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3446	name);
				3447	***************************/
				3448	}
				3449
				3450	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	3451	* Check for an Empty Element labeled the XML/SGML way
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3452	*/
				3453	if ((CUR == '/') && (NXT(1) == '>')) {
				3454	SKIP(2);
				3455	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3456	ctxt->sax->endElement(ctxt->userData, name);
				3457	oldname = htmlnamePop(ctxt);
				3458	#ifdef DEBUG
				3459	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3460	#endif
				3461	if (oldname != NULL)
				3462	xmlFree(oldname);
				3463	return;
				3464	}
				3465
				3466	if (CUR == '>') {
				3467	NEXT;
				3468	} else {
				3469	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3470	ctxt->sax->error(ctxt->userData,
				3471	"Couldn't find end of Start Tag %s\n",
				3472	name);
				3473	ctxt->wellFormed = 0;
				3474
				3475	/*
				3476	* end of parsing of this node.
				3477	*/
				3478	if (xmlStrEqual(name, ctxt->name)) {
				3479	nodePop(ctxt);
				3480	oldname = htmlnamePop(ctxt);
				3481	#ifdef DEBUG
				3482	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3483	#endif
				3484	if (oldname != NULL)
				3485	xmlFree(oldname);
				3486	}
				3487
				3488	/*
				3489	* Capture end position and add node
				3490	*/
				3491	if ( currentNode != NULL && ctxt->record_info ) {
				3492	node_info.end_pos = ctxt->input->consumed +
				3493	(CUR_PTR - ctxt->input->base);
				3494	node_info.end_line = ctxt->input->line;
				3495	node_info.node = ctxt->node;
				3496	xmlParserAddNodeInfo(ctxt, &node_info);
				3497	}
				3498	return;
				3499	}
				3500
				3501	/*
				3502	* Check for an Empty Element from DTD definition
				3503	*/
				3504	if ((info != NULL) && (info->empty)) {
				3505	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3506	ctxt->sax->endElement(ctxt->userData, name);
				3507	oldname = htmlnamePop(ctxt);
				3508	#ifdef DEBUG
				3509	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3510	#endif
				3511	if (oldname != NULL)
				3512	xmlFree(oldname);
				3513	return;
				3514	}
				3515
				3516	/*
				3517	* Parse the content of the element:
				3518	*/
				3519	currentNode = xmlStrdup(ctxt->name);
				3520	depth = ctxt->nameNr;
				3521	while (IS_CHAR(CUR)) {
William M. Brack	d28e48a	2001-09-23 01:55:08 +0000	[diff] [blame]	3522	oldptr = ctxt->input->cur;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3523	htmlParseContent(ctxt);
William M. Brack	d28e48a	2001-09-23 01:55:08 +0000	[diff] [blame]	3524	if (oldptr==ctxt->input->cur) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3525	if (ctxt->nameNr < depth) break;
				3526	}
				3527
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3528	/*
				3529	* Capture end position and add node
				3530	*/
				3531	if ( currentNode != NULL && ctxt->record_info ) {
				3532	node_info.end_pos = ctxt->input->consumed +
				3533	(CUR_PTR - ctxt->input->base);
				3534	node_info.end_line = ctxt->input->line;
				3535	node_info.node = ctxt->node;
				3536	xmlParserAddNodeInfo(ctxt, &node_info);
				3537	}
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3538	if (!IS_CHAR(CUR)) {
				3539	htmlAutoCloseOnEnd(ctxt);
				3540	}
				3541
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3542	if (currentNode != NULL)
				3543	xmlFree(currentNode);
				3544	}
				3545
				3546	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame^]	3547	* htmlParseDocument:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3548	* @ctxt: an HTML parser context
				3549	*
				3550	* parse an HTML document (and build a tree if using the standard SAX
				3551	* interface).
				3552	*
				3553	* Returns 0, -1 in case of error. the parser context is augmented
				3554	* as a result of the parsing.
				3555	*/
				3556
Daniel Veillard	1b31e4a	2002-05-27 14:44:50 +0000	[diff] [blame]	3557	int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3558	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3559	xmlDtdPtr dtd;
				3560
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	3561	xmlInitParser();
				3562
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3563	htmlDefaultSAXHandlerInit();
				3564	ctxt->html = 1;
				3565
				3566	GROW;
				3567	/*
				3568	* SAX: beginning of the document processing.
				3569	*/
				3570	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3571	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3572
				3573	/*
				3574	* Wipe out everything which is before the first '<'
				3575	*/
				3576	SKIP_BLANKS;
				3577	if (CUR == 0) {
				3578	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3579	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3580	ctxt->wellFormed = 0;
				3581	}
				3582
				3583	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3584	ctxt->sax->startDocument(ctxt->userData);
				3585
				3586
				3587	/*
				3588	* Parse possible comments before any content
				3589	*/
				3590	while ((CUR == '<') && (NXT(1) == '!') &&
				3591	(NXT(2) == '-') && (NXT(3) == '-')) {
				3592	htmlParseComment(ctxt);
				3593	SKIP_BLANKS;
				3594	}
				3595
				3596
				3597	/*
				3598	* Then possibly doc type declaration(s) and more Misc
				3599	* (doctypedecl Misc*)?
				3600	*/
				3601	if ((CUR == '<') && (NXT(1) == '!') &&
				3602	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3603	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3604	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3605	(UPP(8) == 'E')) {
				3606	htmlParseDocTypeDecl(ctxt);
				3607	}
				3608	SKIP_BLANKS;
				3609
				3610	/*
				3611	* Parse possible comments before any content
				3612	*/
				3613	while ((CUR == '<') && (NXT(1) == '!') &&
				3614	(NXT(2) == '-') && (NXT(3) == '-')) {
				3615	htmlParseComment(ctxt);
				3616	SKIP_BLANKS;
				3617	}
				3618
				3619	/*
				3620	* Time to start parsing the tree itself
				3621	*/
				3622	htmlParseContent(ctxt);
				3623
				3624	/*
				3625	* autoclose
				3626	*/
				3627	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3628	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3629
				3630
				3631	/*
				3632	* SAX: end of the document processing.
				3633	*/
				3634	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3635	ctxt->sax->endDocument(ctxt->userData);
				3636
				3637	if (ctxt->myDoc != NULL) {
				3638	dtd = xmlGetIntSubset(ctxt->myDoc);
				3639	if (dtd == NULL)
				3640	ctxt->myDoc->intSubset =
				3641	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3642	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3643	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3644	}
				3645	if (! ctxt->wellFormed) return(-1);
				3646	return(0);
				3647	}
				3648
				3649
				3650	/************************************************************************
				3651	* *
				3652	* Parser contexts handling *
				3653	* *
				3654	************************************************************************/
				3655
				3656	/**
				3657	* xmlInitParserCtxt:
				3658	* @ctxt: an HTML parser context
				3659	*
				3660	* Initialize a parser context
				3661	*/
				3662
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3663	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3664	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3665	{
				3666	htmlSAXHandler *sax;
				3667
				3668	if (ctxt == NULL) return;
				3669	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3670
				3671	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3672	if (sax == NULL) {
				3673	xmlGenericError(xmlGenericErrorContext,
				3674	"htmlInitParserCtxt: out of memory\n");
				3675	}
				3676	else
				3677	memset(sax, 0, sizeof(htmlSAXHandler));
				3678
				3679	/* Allocate the Input stack */
				3680	ctxt->inputTab = (htmlParserInputPtr *)
				3681	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3682	if (ctxt->inputTab == NULL) {
				3683	xmlGenericError(xmlGenericErrorContext,
				3684	"htmlInitParserCtxt: out of memory\n");
				3685	ctxt->inputNr = 0;
				3686	ctxt->inputMax = 0;
				3687	ctxt->input = NULL;
				3688	return;
				3689	}
				3690	ctxt->inputNr = 0;
				3691	ctxt->inputMax = 5;
				3692	ctxt->input = NULL;
				3693	ctxt->version = NULL;
				3694	ctxt->encoding = NULL;
				3695	ctxt->standalone = -1;
				3696	ctxt->instate = XML_PARSER_START;
				3697
				3698	/* Allocate the Node stack */
				3699	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3700	if (ctxt->nodeTab == NULL) {
				3701	xmlGenericError(xmlGenericErrorContext,
				3702	"htmlInitParserCtxt: out of memory\n");
				3703	ctxt->nodeNr = 0;
				3704	ctxt->nodeMax = 0;
				3705	ctxt->node = NULL;
				3706	ctxt->inputNr = 0;
				3707	ctxt->inputMax = 0;
				3708	ctxt->input = NULL;
				3709	return;
				3710	}
				3711	ctxt->nodeNr = 0;
				3712	ctxt->nodeMax = 10;
				3713	ctxt->node = NULL;
				3714
				3715	/* Allocate the Name stack */
				3716	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3717	if (ctxt->nameTab == NULL) {
				3718	xmlGenericError(xmlGenericErrorContext,
				3719	"htmlInitParserCtxt: out of memory\n");
				3720	ctxt->nameNr = 0;
				3721	ctxt->nameMax = 10;
				3722	ctxt->name = NULL;
				3723	ctxt->nodeNr = 0;
				3724	ctxt->nodeMax = 0;
				3725	ctxt->node = NULL;
				3726	ctxt->inputNr = 0;
				3727	ctxt->inputMax = 0;
				3728	ctxt->input = NULL;
				3729	return;
				3730	}
				3731	ctxt->nameNr = 0;
				3732	ctxt->nameMax = 10;
				3733	ctxt->name = NULL;
				3734
				3735	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3736	else {
				3737	ctxt->sax = sax;
				3738	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3739	}
				3740	ctxt->userData = ctxt;
				3741	ctxt->myDoc = NULL;
				3742	ctxt->wellFormed = 1;
				3743	ctxt->replaceEntities = 0;
Daniel Veillard	635ef72	2001-10-29 11:48:19 +0000	[diff] [blame]	3744	ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3745	ctxt->html = 1;
				3746	ctxt->record_info = 0;
				3747	ctxt->validate = 0;
				3748	ctxt->nbChars = 0;
				3749	ctxt->checkIndex = 0;
Daniel Veillard	dc2cee2	2001-08-22 16:30:37 +0000	[diff] [blame]	3750	ctxt->catalogs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3751	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3752	}
				3753
				3754	/**
				3755	* htmlFreeParserCtxt:
				3756	* @ctxt: an HTML parser context
				3757	*
				3758	* Free all the memory used by a parser context. However the parsed
				3759	* document in ctxt->myDoc is not freed.
				3760	*/
				3761
				3762	void
				3763	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3764	{
				3765	xmlFreeParserCtxt(ctxt);
				3766	}
				3767
				3768	/**
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame]	3769	* htmlNewParserCtxt:
				3770	*
				3771	* Allocate and initialize a new parser context.
				3772	*
				3773	* Returns the xmlParserCtxtPtr or NULL
				3774	*/
				3775
				3776	static htmlParserCtxtPtr
				3777	htmlNewParserCtxt(void)
				3778	{
				3779	xmlParserCtxtPtr ctxt;
				3780
				3781	ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
				3782	if (ctxt == NULL) {
				3783	xmlGenericError(xmlGenericErrorContext,
				3784	"xmlNewParserCtxt : cannot allocate context\n");
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame]	3785	return(NULL);
				3786	}
				3787	memset(ctxt, 0, sizeof(xmlParserCtxt));
				3788	htmlInitParserCtxt(ctxt);
				3789	return(ctxt);
				3790	}
				3791
				3792	/**
				3793	* htmlCreateMemoryParserCtxt:
				3794	* @buffer: a pointer to a char array
				3795	* @size: the size of the array
				3796	*
				3797	* Create a parser context for an HTML in-memory document.
				3798	*
				3799	* Returns the new parser context or NULL
				3800	*/
				3801	static htmlParserCtxtPtr
				3802	htmlCreateMemoryParserCtxt(const char *buffer, int size) {
				3803	xmlParserCtxtPtr ctxt;
				3804	xmlParserInputPtr input;
				3805	xmlParserInputBufferPtr buf;
				3806
				3807	if (buffer == NULL)
				3808	return(NULL);
				3809	if (size <= 0)
				3810	return(NULL);
				3811
				3812	ctxt = htmlNewParserCtxt();
				3813	if (ctxt == NULL)
				3814	return(NULL);
				3815
				3816	buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
				3817	if (buf == NULL) return(NULL);
				3818
				3819	input = xmlNewInputStream(ctxt);
				3820	if (input == NULL) {
				3821	xmlFreeParserCtxt(ctxt);
				3822	return(NULL);
				3823	}
				3824
				3825	input->filename = NULL;
				3826	input->buf = buf;
				3827	input->base = input->buf->buffer->content;
				3828	input->cur = input->buf->buffer->content;
				3829	input->end = &input->buf->buffer->content[input->buf->buffer->use];
				3830
				3831	inputPush(ctxt, input);
				3832	return(ctxt);
				3833	}
				3834
				3835	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame^]	3836	* htmlCreateDocParserCtxt:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3837	* @cur: a pointer to an array of xmlChar
				3838	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3839	*
				3840	* Create a parser context for an HTML document.
				3841	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3842	* TODO: check the need to add encoding handling there
				3843	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3844	* Returns the new parser context or NULL
				3845	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3846	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3847	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame]	3848	int len;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3849
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame]	3850	if (cur == NULL)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3851	return(NULL);
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame]	3852	len = xmlStrlen(cur);
				3853	return(htmlCreateMemoryParserCtxt((char *)cur, len));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3854	}
				3855
				3856	/************************************************************************
				3857	* *
				3858	* Progressive parsing interfaces *
				3859	* *
				3860	************************************************************************/
				3861
				3862	/**
				3863	* htmlParseLookupSequence:
				3864	* @ctxt: an HTML parser context
				3865	* @first: the first char to lookup
				3866	* @next: the next char to lookup or zero
				3867	* @third: the next char to lookup or zero
				3868	*
				3869	* Try to find if a sequence (first, next, third) or just (first next) or
				3870	* (first) is available in the input stream.
				3871	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3872	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3873	* parser, do not use liberally.
				3874	* This is basically similar to xmlParseLookupSequence()
				3875	*
				3876	* Returns the index to the current parsing point if the full sequence
				3877	* is available, -1 otherwise.
				3878	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3879	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3880	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3881	xmlChar next, xmlChar third) {
				3882	int base, len;
				3883	htmlParserInputPtr in;
				3884	const xmlChar *buf;
Daniel Veillard	c1f7834	2001-11-10 11:43:05 +0000	[diff] [blame]	3885	int incomment = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3886
				3887	in = ctxt->input;
				3888	if (in == NULL) return(-1);
				3889	base = in->cur - in->base;
				3890	if (base < 0) return(-1);
				3891	if (ctxt->checkIndex > base)
				3892	base = ctxt->checkIndex;
				3893	if (in->buf == NULL) {
				3894	buf = in->base;
				3895	len = in->length;
				3896	} else {
				3897	buf = in->buf->buffer->content;
				3898	len = in->buf->buffer->use;
				3899	}
				3900	/* take into account the sequence length */
				3901	if (third) len -= 2;
				3902	else if (next) len --;
				3903	for (;base < len;base++) {
Daniel Veillard	c1f7834	2001-11-10 11:43:05 +0000	[diff] [blame]	3904	if (!incomment && (base + 4 < len)) {
				3905	if ((buf[base] == '<') && (buf[base + 1] == '!') &&
				3906	(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
				3907	incomment = 1;
				3908	}
				3909	/* do not increment base, some people use <!--> */
				3910	}
				3911	if (incomment) {
				3912	if (base + 3 < len)
				3913	return(-1);
				3914	if ((buf[base] == '-') && (buf[base + 1] == '-') &&
				3915	(buf[base + 2] == '>')) {
				3916	incomment = 0;
				3917	base += 2;
				3918	}
				3919	continue;
				3920	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3921	if (buf[base] == first) {
				3922	if (third != 0) {
				3923	if ((buf[base + 1] != next) \|\|
				3924	(buf[base + 2] != third)) continue;
				3925	} else if (next != 0) {
				3926	if (buf[base + 1] != next) continue;
				3927	}
				3928	ctxt->checkIndex = 0;
				3929	#ifdef DEBUG_PUSH
				3930	if (next == 0)
				3931	xmlGenericError(xmlGenericErrorContext,
				3932	"HPP: lookup '%c' found at %d\n",
				3933	first, base);
				3934	else if (third == 0)
				3935	xmlGenericError(xmlGenericErrorContext,
				3936	"HPP: lookup '%c%c' found at %d\n",
				3937	first, next, base);
				3938	else
				3939	xmlGenericError(xmlGenericErrorContext,
				3940	"HPP: lookup '%c%c%c' found at %d\n",
				3941	first, next, third, base);
				3942	#endif
				3943	return(base - (in->cur - in->base));
				3944	}
				3945	}
				3946	ctxt->checkIndex = base;
				3947	#ifdef DEBUG_PUSH
				3948	if (next == 0)
				3949	xmlGenericError(xmlGenericErrorContext,
				3950	"HPP: lookup '%c' failed\n", first);
				3951	else if (third == 0)
				3952	xmlGenericError(xmlGenericErrorContext,
				3953	"HPP: lookup '%c%c' failed\n", first, next);
				3954	else
				3955	xmlGenericError(xmlGenericErrorContext,
				3956	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				3957	#endif
				3958	return(-1);
				3959	}
				3960
				3961	/**
				3962	* htmlParseTryOrFinish:
				3963	* @ctxt: an HTML parser context
				3964	* @terminate: last chunk indicator
				3965	*
				3966	* Try to progress on parsing
				3967	*
				3968	* Returns zero if no parsing was possible
				3969	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3970	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3971	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				3972	int ret = 0;
				3973	htmlParserInputPtr in;
				3974	int avail = 0;
				3975	xmlChar cur, next;
				3976
				3977	#ifdef DEBUG_PUSH
				3978	switch (ctxt->instate) {
				3979	case XML_PARSER_EOF:
				3980	xmlGenericError(xmlGenericErrorContext,
				3981	"HPP: try EOF\n"); break;
				3982	case XML_PARSER_START:
				3983	xmlGenericError(xmlGenericErrorContext,
				3984	"HPP: try START\n"); break;
				3985	case XML_PARSER_MISC:
				3986	xmlGenericError(xmlGenericErrorContext,
				3987	"HPP: try MISC\n");break;
				3988	case XML_PARSER_COMMENT:
				3989	xmlGenericError(xmlGenericErrorContext,
				3990	"HPP: try COMMENT\n");break;
				3991	case XML_PARSER_PROLOG:
				3992	xmlGenericError(xmlGenericErrorContext,
				3993	"HPP: try PROLOG\n");break;
				3994	case XML_PARSER_START_TAG:
				3995	xmlGenericError(xmlGenericErrorContext,
				3996	"HPP: try START_TAG\n");break;
				3997	case XML_PARSER_CONTENT:
				3998	xmlGenericError(xmlGenericErrorContext,
				3999	"HPP: try CONTENT\n");break;
				4000	case XML_PARSER_CDATA_SECTION:
				4001	xmlGenericError(xmlGenericErrorContext,
				4002	"HPP: try CDATA_SECTION\n");break;
				4003	case XML_PARSER_END_TAG:
				4004	xmlGenericError(xmlGenericErrorContext,
				4005	"HPP: try END_TAG\n");break;
				4006	case XML_PARSER_ENTITY_DECL:
				4007	xmlGenericError(xmlGenericErrorContext,
				4008	"HPP: try ENTITY_DECL\n");break;
				4009	case XML_PARSER_ENTITY_VALUE:
				4010	xmlGenericError(xmlGenericErrorContext,
				4011	"HPP: try ENTITY_VALUE\n");break;
				4012	case XML_PARSER_ATTRIBUTE_VALUE:
				4013	xmlGenericError(xmlGenericErrorContext,
				4014	"HPP: try ATTRIBUTE_VALUE\n");break;
				4015	case XML_PARSER_DTD:
				4016	xmlGenericError(xmlGenericErrorContext,
				4017	"HPP: try DTD\n");break;
				4018	case XML_PARSER_EPILOG:
				4019	xmlGenericError(xmlGenericErrorContext,
				4020	"HPP: try EPILOG\n");break;
				4021	case XML_PARSER_PI:
				4022	xmlGenericError(xmlGenericErrorContext,
				4023	"HPP: try PI\n");break;
				4024	case XML_PARSER_SYSTEM_LITERAL:
				4025	xmlGenericError(xmlGenericErrorContext,
				4026	"HPP: try SYSTEM_LITERAL\n");break;
				4027	}
				4028	#endif
				4029
				4030	while (1) {
				4031
				4032	in = ctxt->input;
				4033	if (in == NULL) break;
				4034	if (in->buf == NULL)
				4035	avail = in->length - (in->cur - in->base);
				4036	else
				4037	avail = in->buf->buffer->use - (in->cur - in->base);
				4038	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4039	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4040	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4041	/*
				4042	* SAX: end of the document processing.
				4043	*/
				4044	ctxt->instate = XML_PARSER_EOF;
				4045	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4046	ctxt->sax->endDocument(ctxt->userData);
				4047	}
				4048	}
				4049	if (avail < 1)
				4050	goto done;
				4051	switch (ctxt->instate) {
				4052	case XML_PARSER_EOF:
				4053	/*
				4054	* Document parsing is done !
				4055	*/
				4056	goto done;
				4057	case XML_PARSER_START:
				4058	/*
				4059	* Very first chars read from the document flow.
				4060	*/
				4061	cur = in->cur[0];
				4062	if (IS_BLANK(cur)) {
				4063	SKIP_BLANKS;
				4064	if (in->buf == NULL)
				4065	avail = in->length - (in->cur - in->base);
				4066	else
				4067	avail = in->buf->buffer->use - (in->cur - in->base);
				4068	}
				4069	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				4070	ctxt->sax->setDocumentLocator(ctxt->userData,
				4071	&xmlDefaultSAXLocator);
				4072	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				4073	(!ctxt->disableSAX))
				4074	ctxt->sax->startDocument(ctxt->userData);
				4075
				4076	cur = in->cur[0];
				4077	next = in->cur[1];
				4078	if ((cur == '<') && (next == '!') &&
				4079	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4080	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4081	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4082	(UPP(8) == 'E')) {
				4083	if ((!terminate) &&
				4084	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4085	goto done;
				4086	#ifdef DEBUG_PUSH
				4087	xmlGenericError(xmlGenericErrorContext,
				4088	"HPP: Parsing internal subset\n");
				4089	#endif
				4090	htmlParseDocTypeDecl(ctxt);
				4091	ctxt->instate = XML_PARSER_PROLOG;
				4092	#ifdef DEBUG_PUSH
				4093	xmlGenericError(xmlGenericErrorContext,
				4094	"HPP: entering PROLOG\n");
				4095	#endif
				4096	} else {
				4097	ctxt->instate = XML_PARSER_MISC;
				4098	}
				4099	#ifdef DEBUG_PUSH
				4100	xmlGenericError(xmlGenericErrorContext,
				4101	"HPP: entering MISC\n");
				4102	#endif
				4103	break;
				4104	case XML_PARSER_MISC:
				4105	SKIP_BLANKS;
				4106	if (in->buf == NULL)
				4107	avail = in->length - (in->cur - in->base);
				4108	else
				4109	avail = in->buf->buffer->use - (in->cur - in->base);
				4110	if (avail < 2)
				4111	goto done;
				4112	cur = in->cur[0];
				4113	next = in->cur[1];
				4114	if ((cur == '<') && (next == '!') &&
				4115	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4116	if ((!terminate) &&
				4117	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4118	goto done;
				4119	#ifdef DEBUG_PUSH
				4120	xmlGenericError(xmlGenericErrorContext,
				4121	"HPP: Parsing Comment\n");
				4122	#endif
				4123	htmlParseComment(ctxt);
				4124	ctxt->instate = XML_PARSER_MISC;
				4125	} else if ((cur == '<') && (next == '!') &&
				4126	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4127	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4128	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4129	(UPP(8) == 'E')) {
				4130	if ((!terminate) &&
				4131	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4132	goto done;
				4133	#ifdef DEBUG_PUSH
				4134	xmlGenericError(xmlGenericErrorContext,
				4135	"HPP: Parsing internal subset\n");
				4136	#endif
				4137	htmlParseDocTypeDecl(ctxt);
				4138	ctxt->instate = XML_PARSER_PROLOG;
				4139	#ifdef DEBUG_PUSH
				4140	xmlGenericError(xmlGenericErrorContext,
				4141	"HPP: entering PROLOG\n");
				4142	#endif
				4143	} else if ((cur == '<') && (next == '!') &&
				4144	(avail < 9)) {
				4145	goto done;
				4146	} else {
				4147	ctxt->instate = XML_PARSER_START_TAG;
				4148	#ifdef DEBUG_PUSH
				4149	xmlGenericError(xmlGenericErrorContext,
				4150	"HPP: entering START_TAG\n");
				4151	#endif
				4152	}
				4153	break;
				4154	case XML_PARSER_PROLOG:
				4155	SKIP_BLANKS;
				4156	if (in->buf == NULL)
				4157	avail = in->length - (in->cur - in->base);
				4158	else
				4159	avail = in->buf->buffer->use - (in->cur - in->base);
				4160	if (avail < 2)
				4161	goto done;
				4162	cur = in->cur[0];
				4163	next = in->cur[1];
				4164	if ((cur == '<') && (next == '!') &&
				4165	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4166	if ((!terminate) &&
				4167	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4168	goto done;
				4169	#ifdef DEBUG_PUSH
				4170	xmlGenericError(xmlGenericErrorContext,
				4171	"HPP: Parsing Comment\n");
				4172	#endif
				4173	htmlParseComment(ctxt);
				4174	ctxt->instate = XML_PARSER_PROLOG;
				4175	} else if ((cur == '<') && (next == '!') &&
				4176	(avail < 4)) {
				4177	goto done;
				4178	} else {
				4179	ctxt->instate = XML_PARSER_START_TAG;
				4180	#ifdef DEBUG_PUSH
				4181	xmlGenericError(xmlGenericErrorContext,
				4182	"HPP: entering START_TAG\n");
				4183	#endif
				4184	}
				4185	break;
				4186	case XML_PARSER_EPILOG:
				4187	if (in->buf == NULL)
				4188	avail = in->length - (in->cur - in->base);
				4189	else
				4190	avail = in->buf->buffer->use - (in->cur - in->base);
				4191	if (avail < 1)
				4192	goto done;
				4193	cur = in->cur[0];
				4194	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4195	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4196	goto done;
				4197	}
				4198	if (avail < 2)
				4199	goto done;
				4200	next = in->cur[1];
				4201	if ((cur == '<') && (next == '!') &&
				4202	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4203	if ((!terminate) &&
				4204	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4205	goto done;
				4206	#ifdef DEBUG_PUSH
				4207	xmlGenericError(xmlGenericErrorContext,
				4208	"HPP: Parsing Comment\n");
				4209	#endif
				4210	htmlParseComment(ctxt);
				4211	ctxt->instate = XML_PARSER_EPILOG;
				4212	} else if ((cur == '<') && (next == '!') &&
				4213	(avail < 4)) {
				4214	goto done;
				4215	} else {
				4216	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4217	ctxt->wellFormed = 0;
				4218	ctxt->instate = XML_PARSER_EOF;
				4219	#ifdef DEBUG_PUSH
				4220	xmlGenericError(xmlGenericErrorContext,
				4221	"HPP: entering EOF\n");
				4222	#endif
				4223	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4224	ctxt->sax->endDocument(ctxt->userData);
				4225	goto done;
				4226	}
				4227	break;
				4228	case XML_PARSER_START_TAG: {
				4229	xmlChar name, oldname;
				4230	int depth = ctxt->nameNr;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	4231	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4232
				4233	if (avail < 2)
				4234	goto done;
				4235	cur = in->cur[0];
				4236	if (cur != '<') {
				4237	ctxt->instate = XML_PARSER_CONTENT;
				4238	#ifdef DEBUG_PUSH
				4239	xmlGenericError(xmlGenericErrorContext,
				4240	"HPP: entering CONTENT\n");
				4241	#endif
				4242	break;
				4243	}
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	4244	if (in->cur[1] == '/') {
				4245	ctxt->instate = XML_PARSER_END_TAG;
				4246	ctxt->checkIndex = 0;
				4247	#ifdef DEBUG_PUSH
				4248	xmlGenericError(xmlGenericErrorContext,
				4249	"HPP: entering END_TAG\n");
				4250	#endif
				4251	break;
				4252	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4253	if ((!terminate) &&
				4254	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4255	goto done;
				4256
				4257	oldname = xmlStrdup(ctxt->name);
				4258	htmlParseStartTag(ctxt);
				4259	name = ctxt->name;
				4260	#ifdef DEBUG
				4261	if (oldname == NULL)
				4262	xmlGenericError(xmlGenericErrorContext,
				4263	"Start of element %s\n", name);
				4264	else if (name == NULL)
				4265	xmlGenericError(xmlGenericErrorContext,
				4266	"Start of element failed, was %s\n",
				4267	oldname);
				4268	else
				4269	xmlGenericError(xmlGenericErrorContext,
				4270	"Start of element %s, was %s\n",
				4271	name, oldname);
				4272	#endif
				4273	if (((depth == ctxt->nameNr) &&
				4274	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4275	(name == NULL)) {
				4276	if (CUR == '>')
				4277	NEXT;
				4278	if (oldname != NULL)
				4279	xmlFree(oldname);
				4280	break;
				4281	}
				4282	if (oldname != NULL)
				4283	xmlFree(oldname);
				4284
				4285	/*
				4286	* Lookup the info for that element.
				4287	*/
				4288	info = htmlTagLookup(name);
				4289	if (info == NULL) {
				4290	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4291	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4292	name);
				4293	ctxt->wellFormed = 0;
				4294	} else if (info->depr) {
				4295	/***************************
				4296	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4297	ctxt->sax->warning(ctxt->userData,
				4298	"Tag %s is deprecated\n",
				4299	name);
				4300	***************************/
				4301	}
				4302
				4303	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	4304	* Check for an Empty Element labeled the XML/SGML way
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4305	*/
				4306	if ((CUR == '/') && (NXT(1) == '>')) {
				4307	SKIP(2);
				4308	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4309	ctxt->sax->endElement(ctxt->userData, name);
				4310	oldname = htmlnamePop(ctxt);
				4311	#ifdef DEBUG
				4312	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4313	oldname);
				4314	#endif
				4315	if (oldname != NULL)
				4316	xmlFree(oldname);
				4317	ctxt->instate = XML_PARSER_CONTENT;
				4318	#ifdef DEBUG_PUSH
				4319	xmlGenericError(xmlGenericErrorContext,
				4320	"HPP: entering CONTENT\n");
				4321	#endif
				4322	break;
				4323	}
				4324
				4325	if (CUR == '>') {
				4326	NEXT;
				4327	} else {
				4328	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4329	ctxt->sax->error(ctxt->userData,
				4330	"Couldn't find end of Start Tag %s\n",
				4331	name);
				4332	ctxt->wellFormed = 0;
				4333
				4334	/*
				4335	* end of parsing of this node.
				4336	*/
				4337	if (xmlStrEqual(name, ctxt->name)) {
				4338	nodePop(ctxt);
				4339	oldname = htmlnamePop(ctxt);
				4340	#ifdef DEBUG
				4341	xmlGenericError(xmlGenericErrorContext,
				4342	"End of start tag problem: popping out %s\n", oldname);
				4343	#endif
				4344	if (oldname != NULL)
				4345	xmlFree(oldname);
				4346	}
				4347
				4348	ctxt->instate = XML_PARSER_CONTENT;
				4349	#ifdef DEBUG_PUSH
				4350	xmlGenericError(xmlGenericErrorContext,
				4351	"HPP: entering CONTENT\n");
				4352	#endif
				4353	break;
				4354	}
				4355
				4356	/*
				4357	* Check for an Empty Element from DTD definition
				4358	*/
				4359	if ((info != NULL) && (info->empty)) {
				4360	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4361	ctxt->sax->endElement(ctxt->userData, name);
				4362	oldname = htmlnamePop(ctxt);
				4363	#ifdef DEBUG
				4364	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4365	#endif
				4366	if (oldname != NULL)
				4367	xmlFree(oldname);
				4368	}
				4369	ctxt->instate = XML_PARSER_CONTENT;
				4370	#ifdef DEBUG_PUSH
				4371	xmlGenericError(xmlGenericErrorContext,
				4372	"HPP: entering CONTENT\n");
				4373	#endif
				4374	break;
				4375	}
				4376	case XML_PARSER_CONTENT: {
				4377	long cons;
				4378	/*
				4379	* Handle preparsed entities and charRef
				4380	*/
				4381	if (ctxt->token != 0) {
				4382	xmlChar chr[2] = { 0 , 0 } ;
				4383
				4384	chr[0] = (xmlChar) ctxt->token;
				4385	htmlCheckParagraph(ctxt);
				4386	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4387	ctxt->sax->characters(ctxt->userData, chr, 1);
				4388	ctxt->token = 0;
				4389	ctxt->checkIndex = 0;
				4390	}
				4391	if ((avail == 1) && (terminate)) {
				4392	cur = in->cur[0];
				4393	if ((cur != '<') && (cur != '&')) {
				4394	if (ctxt->sax != NULL) {
				4395	if (IS_BLANK(cur)) {
				4396	if (ctxt->sax->ignorableWhitespace != NULL)
				4397	ctxt->sax->ignorableWhitespace(
				4398	ctxt->userData, &cur, 1);
				4399	} else {
				4400	htmlCheckParagraph(ctxt);
				4401	if (ctxt->sax->characters != NULL)
				4402	ctxt->sax->characters(
				4403	ctxt->userData, &cur, 1);
				4404	}
				4405	}
				4406	ctxt->token = 0;
				4407	ctxt->checkIndex = 0;
Daniel Veillard	bc6e1a3	2002-11-18 15:07:25 +0000	[diff] [blame]	4408	in->cur++;
William M. Brack	1633d18	2001-10-05 15:41:19 +0000	[diff] [blame]	4409	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4410	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4411	}
				4412	if (avail < 2)
				4413	goto done;
				4414	cur = in->cur[0];
				4415	next = in->cur[1];
				4416	cons = ctxt->nbChars;
				4417	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4418	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4419	/*
				4420	* Handle SCRIPT/STYLE separately
				4421	*/
				4422	if ((!terminate) &&
				4423	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4424	goto done;
				4425	htmlParseScript(ctxt);
				4426	if ((cur == '<') && (next == '/')) {
				4427	ctxt->instate = XML_PARSER_END_TAG;
				4428	ctxt->checkIndex = 0;
				4429	#ifdef DEBUG_PUSH
				4430	xmlGenericError(xmlGenericErrorContext,
				4431	"HPP: entering END_TAG\n");
				4432	#endif
				4433	break;
				4434	}
				4435	} else {
				4436	/*
				4437	* Sometimes DOCTYPE arrives in the middle of the document
				4438	*/
				4439	if ((cur == '<') && (next == '!') &&
				4440	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4441	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4442	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4443	(UPP(8) == 'E')) {
				4444	if ((!terminate) &&
				4445	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4446	goto done;
				4447	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4448	ctxt->sax->error(ctxt->userData,
				4449	"Misplaced DOCTYPE declaration\n");
				4450	ctxt->wellFormed = 0;
				4451	htmlParseDocTypeDecl(ctxt);
				4452	} else if ((cur == '<') && (next == '!') &&
				4453	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4454	if ((!terminate) &&
				4455	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4456	goto done;
				4457	#ifdef DEBUG_PUSH
				4458	xmlGenericError(xmlGenericErrorContext,
				4459	"HPP: Parsing Comment\n");
				4460	#endif
				4461	htmlParseComment(ctxt);
				4462	ctxt->instate = XML_PARSER_CONTENT;
				4463	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4464	goto done;
				4465	} else if ((cur == '<') && (next == '/')) {
				4466	ctxt->instate = XML_PARSER_END_TAG;
				4467	ctxt->checkIndex = 0;
				4468	#ifdef DEBUG_PUSH
				4469	xmlGenericError(xmlGenericErrorContext,
				4470	"HPP: entering END_TAG\n");
				4471	#endif
				4472	break;
				4473	} else if (cur == '<') {
				4474	ctxt->instate = XML_PARSER_START_TAG;
				4475	ctxt->checkIndex = 0;
				4476	#ifdef DEBUG_PUSH
				4477	xmlGenericError(xmlGenericErrorContext,
				4478	"HPP: entering START_TAG\n");
				4479	#endif
				4480	break;
				4481	} else if (cur == '&') {
				4482	if ((!terminate) &&
				4483	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4484	goto done;
				4485	#ifdef DEBUG_PUSH
				4486	xmlGenericError(xmlGenericErrorContext,
				4487	"HPP: Parsing Reference\n");
				4488	#endif
				4489	/* TODO: check generation of subtrees if noent !!! */
				4490	htmlParseReference(ctxt);
				4491	} else {
				4492	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4493	/*
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame^]	4494	* Goal of the following test is:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4495	* - minimize calls to the SAX 'character' callback
				4496	* when they are mergeable
				4497	*/
				4498	if ((ctxt->inputNr == 1) &&
				4499	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4500	if ((!terminate) &&
				4501	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4502	goto done;
				4503	}
				4504	ctxt->checkIndex = 0;
				4505	#ifdef DEBUG_PUSH
				4506	xmlGenericError(xmlGenericErrorContext,
				4507	"HPP: Parsing char data\n");
				4508	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4509	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4510	}
				4511	}
				4512	if (cons == ctxt->nbChars) {
				4513	if (ctxt->node != NULL) {
				4514	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4515	ctxt->sax->error(ctxt->userData,
				4516	"detected an error in element content\n");
				4517	ctxt->wellFormed = 0;
				4518	}
				4519	NEXT;
				4520	break;
				4521	}
				4522
				4523	break;
				4524	}
				4525	case XML_PARSER_END_TAG:
				4526	if (avail < 2)
				4527	goto done;
				4528	if ((!terminate) &&
				4529	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4530	goto done;
				4531	htmlParseEndTag(ctxt);
				4532	if (ctxt->nameNr == 0) {
				4533	ctxt->instate = XML_PARSER_EPILOG;
				4534	} else {
				4535	ctxt->instate = XML_PARSER_CONTENT;
				4536	}
				4537	ctxt->checkIndex = 0;
				4538	#ifdef DEBUG_PUSH
				4539	xmlGenericError(xmlGenericErrorContext,
				4540	"HPP: entering CONTENT\n");
				4541	#endif
				4542	break;
				4543	case XML_PARSER_CDATA_SECTION:
				4544	xmlGenericError(xmlGenericErrorContext,
				4545	"HPP: internal error, state == CDATA\n");
				4546	ctxt->instate = XML_PARSER_CONTENT;
				4547	ctxt->checkIndex = 0;
				4548	#ifdef DEBUG_PUSH
				4549	xmlGenericError(xmlGenericErrorContext,
				4550	"HPP: entering CONTENT\n");
				4551	#endif
				4552	break;
				4553	case XML_PARSER_DTD:
				4554	xmlGenericError(xmlGenericErrorContext,
				4555	"HPP: internal error, state == DTD\n");
				4556	ctxt->instate = XML_PARSER_CONTENT;
				4557	ctxt->checkIndex = 0;
				4558	#ifdef DEBUG_PUSH
				4559	xmlGenericError(xmlGenericErrorContext,
				4560	"HPP: entering CONTENT\n");
				4561	#endif
				4562	break;
				4563	case XML_PARSER_COMMENT:
				4564	xmlGenericError(xmlGenericErrorContext,
				4565	"HPP: internal error, state == COMMENT\n");
				4566	ctxt->instate = XML_PARSER_CONTENT;
				4567	ctxt->checkIndex = 0;
				4568	#ifdef DEBUG_PUSH
				4569	xmlGenericError(xmlGenericErrorContext,
				4570	"HPP: entering CONTENT\n");
				4571	#endif
				4572	break;
				4573	case XML_PARSER_PI:
				4574	xmlGenericError(xmlGenericErrorContext,
				4575	"HPP: internal error, state == PI\n");
				4576	ctxt->instate = XML_PARSER_CONTENT;
				4577	ctxt->checkIndex = 0;
				4578	#ifdef DEBUG_PUSH
				4579	xmlGenericError(xmlGenericErrorContext,
				4580	"HPP: entering CONTENT\n");
				4581	#endif
				4582	break;
				4583	case XML_PARSER_ENTITY_DECL:
				4584	xmlGenericError(xmlGenericErrorContext,
				4585	"HPP: internal error, state == ENTITY_DECL\n");
				4586	ctxt->instate = XML_PARSER_CONTENT;
				4587	ctxt->checkIndex = 0;
				4588	#ifdef DEBUG_PUSH
				4589	xmlGenericError(xmlGenericErrorContext,
				4590	"HPP: entering CONTENT\n");
				4591	#endif
				4592	break;
				4593	case XML_PARSER_ENTITY_VALUE:
				4594	xmlGenericError(xmlGenericErrorContext,
				4595	"HPP: internal error, state == ENTITY_VALUE\n");
				4596	ctxt->instate = XML_PARSER_CONTENT;
				4597	ctxt->checkIndex = 0;
				4598	#ifdef DEBUG_PUSH
				4599	xmlGenericError(xmlGenericErrorContext,
				4600	"HPP: entering DTD\n");
				4601	#endif
				4602	break;
				4603	case XML_PARSER_ATTRIBUTE_VALUE:
				4604	xmlGenericError(xmlGenericErrorContext,
				4605	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4606	ctxt->instate = XML_PARSER_START_TAG;
				4607	ctxt->checkIndex = 0;
				4608	#ifdef DEBUG_PUSH
				4609	xmlGenericError(xmlGenericErrorContext,
				4610	"HPP: entering START_TAG\n");
				4611	#endif
				4612	break;
				4613	case XML_PARSER_SYSTEM_LITERAL:
				4614	xmlGenericError(xmlGenericErrorContext,
				4615	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4616	ctxt->instate = XML_PARSER_CONTENT;
				4617	ctxt->checkIndex = 0;
				4618	#ifdef DEBUG_PUSH
				4619	xmlGenericError(xmlGenericErrorContext,
				4620	"HPP: entering CONTENT\n");
				4621	#endif
				4622	break;
				4623	case XML_PARSER_IGNORE:
				4624	xmlGenericError(xmlGenericErrorContext,
				4625	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4626	ctxt->instate = XML_PARSER_CONTENT;
				4627	ctxt->checkIndex = 0;
				4628	#ifdef DEBUG_PUSH
				4629	xmlGenericError(xmlGenericErrorContext,
				4630	"HPP: entering CONTENT\n");
				4631	#endif
				4632	break;
Daniel Veillard	044fc6b	2002-03-04 17:09:44 +0000	[diff] [blame]	4633	case XML_PARSER_PUBLIC_LITERAL:
				4634	xmlGenericError(xmlGenericErrorContext,
				4635	"HPP: internal error, state == XML_PARSER_LITERAL\n");
				4636	ctxt->instate = XML_PARSER_CONTENT;
				4637	ctxt->checkIndex = 0;
				4638	#ifdef DEBUG_PUSH
				4639	xmlGenericError(xmlGenericErrorContext,
				4640	"HPP: entering CONTENT\n");
				4641	#endif
				4642	break;
				4643
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4644	}
				4645	}
				4646	done:
				4647	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4648	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4649	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4650	/*
				4651	* SAX: end of the document processing.
				4652	*/
				4653	ctxt->instate = XML_PARSER_EOF;
				4654	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4655	ctxt->sax->endDocument(ctxt->userData);
				4656	}
				4657	}
				4658	if ((ctxt->myDoc != NULL) &&
				4659	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4660	(ctxt->instate == XML_PARSER_EPILOG))) {
				4661	xmlDtdPtr dtd;
				4662	dtd = xmlGetIntSubset(ctxt->myDoc);
				4663	if (dtd == NULL)
				4664	ctxt->myDoc->intSubset =
				4665	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4666	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4667	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4668	}
				4669	#ifdef DEBUG_PUSH
				4670	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4671	#endif
				4672	return(ret);
				4673	}
				4674
				4675	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4676	* htmlParseChunk:
				4677	* @ctxt: an XML parser context
				4678	* @chunk: an char array
				4679	* @size: the size in byte of the chunk
				4680	* @terminate: last chunk indicator
				4681	*
				4682	* Parse a Chunk of memory
				4683	*
				4684	* Returns zero if no error, the xmlParserErrors otherwise.
				4685	*/
				4686	int
				4687	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4688	int terminate) {
				4689	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4690	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4691	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4692	int cur = ctxt->input->cur - ctxt->input->base;
				4693
				4694	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4695	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4696	ctxt->input->cur = ctxt->input->base + cur;
				4697	#ifdef DEBUG_PUSH
				4698	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4699	#endif
				4700
				4701	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4702	htmlParseTryOrFinish(ctxt, terminate);
				4703	} else if (ctxt->instate != XML_PARSER_EOF) {
				4704	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4705	htmlParseTryOrFinish(ctxt, terminate);
				4706	}
				4707	if (terminate) {
				4708	if ((ctxt->instate != XML_PARSER_EOF) &&
				4709	(ctxt->instate != XML_PARSER_EPILOG) &&
				4710	(ctxt->instate != XML_PARSER_MISC)) {
				4711	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4712	ctxt->wellFormed = 0;
				4713	}
				4714	if (ctxt->instate != XML_PARSER_EOF) {
				4715	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4716	ctxt->sax->endDocument(ctxt->userData);
				4717	}
				4718	ctxt->instate = XML_PARSER_EOF;
				4719	}
				4720	return((xmlParserErrors) ctxt->errNo);
				4721	}
				4722
				4723	/************************************************************************
				4724	* *
				4725	* User entry points *
				4726	* *
				4727	************************************************************************/
				4728
				4729	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame^]	4730	* htmlCreatePushParserCtxt:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4731	* @sax: a SAX handler
				4732	* @user_data: The user data returned on SAX callbacks
				4733	* @chunk: a pointer to an array of chars
				4734	* @size: number of chars in the array
				4735	* @filename: an optional file name or URI
				4736	* @enc: an optional encoding
				4737	*
				4738	* Create a parser context for using the HTML parser in push mode
				4739	* To allow content encoding detection, @size should be >= 4
				4740	* The value of @filename is used for fetching external entities
				4741	* and error/warning reports.
				4742	*
				4743	* Returns the new parser context or NULL
				4744	*/
				4745	htmlParserCtxtPtr
				4746	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4747	const char chunk, int size, const char filename,
				4748	xmlCharEncoding enc) {
				4749	htmlParserCtxtPtr ctxt;
				4750	htmlParserInputPtr inputStream;
				4751	xmlParserInputBufferPtr buf;
				4752
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4753	xmlInitParser();
				4754
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4755	buf = xmlAllocParserInputBuffer(enc);
				4756	if (buf == NULL) return(NULL);
				4757
				4758	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4759	if (ctxt == NULL) {
				4760	xmlFree(buf);
				4761	return(NULL);
				4762	}
				4763	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4764	htmlInitParserCtxt(ctxt);
				4765	if (sax != NULL) {
				4766	if (ctxt->sax != &htmlDefaultSAXHandler)
				4767	xmlFree(ctxt->sax);
				4768	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4769	if (ctxt->sax == NULL) {
				4770	xmlFree(buf);
				4771	xmlFree(ctxt);
				4772	return(NULL);
				4773	}
				4774	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4775	if (user_data != NULL)
				4776	ctxt->userData = user_data;
				4777	}
				4778	if (filename == NULL) {
				4779	ctxt->directory = NULL;
				4780	} else {
				4781	ctxt->directory = xmlParserGetDirectory(filename);
				4782	}
				4783
				4784	inputStream = htmlNewInputStream(ctxt);
				4785	if (inputStream == NULL) {
				4786	xmlFreeParserCtxt(ctxt);
				4787	return(NULL);
				4788	}
				4789
				4790	if (filename == NULL)
				4791	inputStream->filename = NULL;
				4792	else
				4793	inputStream->filename = xmlMemStrdup(filename);
				4794	inputStream->buf = buf;
				4795	inputStream->base = inputStream->buf->buffer->content;
				4796	inputStream->cur = inputStream->buf->buffer->content;
				4797
				4798	inputPush(ctxt, inputStream);
				4799
				4800	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4801	(ctxt->input->buf != NULL)) {
				4802	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4803	#ifdef DEBUG_PUSH
				4804	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4805	#endif
				4806	}
				4807
				4808	return(ctxt);
				4809	}
				4810
				4811	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame^]	4812	* htmlSAXParseDoc:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4813	* @cur: a pointer to an array of xmlChar
				4814	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4815	* @sax: the SAX handler block
				4816	* @userData: if using SAX, this pointer will be provided on callbacks.
				4817	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4818	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
				4819	* to handle parse events. If sax is NULL, fallback to the default DOM
				4820	* behavior and return a tree.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4821	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4822	* Returns the resulting document tree unless SAX is NULL or the document is
				4823	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4824	*/
				4825
				4826	htmlDocPtr
				4827	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4828	htmlDocPtr ret;
				4829	htmlParserCtxtPtr ctxt;
				4830
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4831	xmlInitParser();
				4832
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4833	if (cur == NULL) return(NULL);
				4834
				4835
				4836	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4837	if (ctxt == NULL) return(NULL);
				4838	if (sax != NULL) {
				4839	ctxt->sax = sax;
				4840	ctxt->userData = userData;
				4841	}
				4842
				4843	htmlParseDocument(ctxt);
				4844	ret = ctxt->myDoc;
				4845	if (sax != NULL) {
				4846	ctxt->sax = NULL;
				4847	ctxt->userData = NULL;
				4848	}
				4849	htmlFreeParserCtxt(ctxt);
				4850
				4851	return(ret);
				4852	}
				4853
				4854	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame^]	4855	* htmlParseDoc:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4856	* @cur: a pointer to an array of xmlChar
				4857	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4858	*
				4859	* parse an HTML in-memory document and build a tree.
				4860	*
				4861	* Returns the resulting document tree
				4862	*/
				4863
				4864	htmlDocPtr
				4865	htmlParseDoc(xmlChar cur, const char encoding) {
				4866	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4867	}
				4868
				4869
				4870	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame^]	4871	* htmlCreateFileParserCtxt:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4872	* @filename: the filename
				4873	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4874	*
				4875	* Create a parser context for a file content.
				4876	* Automatic support for ZLIB/Compress compressed document is provided
				4877	* by default if found at compile-time.
				4878	*
				4879	* Returns the new parser context or NULL
				4880	*/
				4881	htmlParserCtxtPtr
				4882	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4883	{
				4884	htmlParserCtxtPtr ctxt;
				4885	htmlParserInputPtr inputStream;
				4886	xmlParserInputBufferPtr buf;
				4887	/* htmlCharEncoding enc; */
				4888	xmlChar content, content_line = (xmlChar *) "charset=";
				4889
				4890	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4891	if (buf == NULL) return(NULL);
				4892
				4893	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4894	if (ctxt == NULL) {
Daniel Veillard	3487c8d	2002-09-05 11:33:25 +0000	[diff] [blame]	4895	xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4896	return(NULL);
				4897	}
				4898	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4899	htmlInitParserCtxt(ctxt);
				4900	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4901	if (inputStream == NULL) {
Daniel Veillard	3487c8d	2002-09-05 11:33:25 +0000	[diff] [blame]	4902	xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4903	xmlFree(ctxt);
				4904	return(NULL);
				4905	}
				4906	memset(inputStream, 0, sizeof(htmlParserInput));
				4907
Daniel Veillard	a646cfd	2002-09-17 21:50:03 +0000	[diff] [blame]	4908	inputStream->filename = (char *)
				4909	xmlNormalizeWindowsPath((xmlChar *)filename);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4910	inputStream->line = 1;
				4911	inputStream->col = 1;
				4912	inputStream->buf = buf;
				4913	inputStream->directory = NULL;
				4914
				4915	inputStream->base = inputStream->buf->buffer->content;
				4916	inputStream->cur = inputStream->buf->buffer->content;
				4917	inputStream->free = NULL;
				4918
				4919	inputPush(ctxt, inputStream);
				4920
				4921	/* set encoding */
				4922	if (encoding) {
				4923	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4924	if (content) {
				4925	strcpy ((char )content, (char )content_line);
				4926	strcat ((char )content, (char )encoding);
				4927	htmlCheckEncoding (ctxt, content);
				4928	xmlFree (content);
				4929	}
				4930	}
				4931
				4932	return(ctxt);
				4933	}
				4934
				4935	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame^]	4936	* htmlSAXParseFile:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4937	* @filename: the filename
				4938	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4939	* @sax: the SAX handler block
				4940	* @userData: if using SAX, this pointer will be provided on callbacks.
				4941	*
				4942	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4943	* compressed document is provided by default if found at compile-time.
				4944	* It use the given SAX function block to handle the parsing callback.
				4945	* If sax is NULL, fallback to the default DOM tree building routines.
				4946	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4947	* Returns the resulting document tree unless SAX is NULL or the document is
				4948	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4949	*/
				4950
				4951	htmlDocPtr
				4952	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				4953	void *userData) {
				4954	htmlDocPtr ret;
				4955	htmlParserCtxtPtr ctxt;
				4956	htmlSAXHandlerPtr oldsax = NULL;
				4957
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4958	xmlInitParser();
				4959
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4960	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				4961	if (ctxt == NULL) return(NULL);
				4962	if (sax != NULL) {
				4963	oldsax = ctxt->sax;
				4964	ctxt->sax = sax;
				4965	ctxt->userData = userData;
				4966	}
				4967
				4968	htmlParseDocument(ctxt);
				4969
				4970	ret = ctxt->myDoc;
				4971	if (sax != NULL) {
				4972	ctxt->sax = oldsax;
				4973	ctxt->userData = NULL;
				4974	}
				4975	htmlFreeParserCtxt(ctxt);
				4976
				4977	return(ret);
				4978	}
				4979
				4980	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame^]	4981	* htmlParseFile:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4982	* @filename: the filename
				4983	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4984	*
				4985	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4986	* compressed document is provided by default if found at compile-time.
				4987	*
				4988	* Returns the resulting document tree
				4989	*/
				4990
				4991	htmlDocPtr
				4992	htmlParseFile(const char filename, const char encoding) {
				4993	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				4994	}
				4995
				4996	/**
				4997	* htmlHandleOmittedElem:
				4998	* @val: int 0 or 1
				4999	*
				5000	* Set and return the previous value for handling HTML omitted tags.
				5001	*
				5002	* Returns the last value for 0 for no handling, 1 for auto insertion.
				5003	*/
				5004
				5005	int
				5006	htmlHandleOmittedElem(int val) {
				5007	int old = htmlOmittedDefaultValue;
				5008
				5009	htmlOmittedDefaultValue = val;
				5010	return(old);
				5011	}
				5012
				5013	#endif /* LIBXML_HTML_ENABLED */