Blame - HTMLparser.c - fp2-dev/platform/external/libxml2

blob: c8e9cca6b5ef71a2eeddf1c8cdf150b0fd63ddee [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
Daniel Veillard	c5d6434	2001-06-24 12:13:24 +0000	[diff] [blame]	6	* daniel@veillard.com
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	7	*/
				8
Daniel Veillard	34ce8be	2002-03-18 19:37:11 +0000	[diff] [blame]	9	#define IN_LIBXML
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	10	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	11	#ifdef LIBXML_HTML_ENABLED
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	12
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	13	#include <string.h>
				14	#ifdef HAVE_CTYPE_H
				15	#include <ctype.h>
				16	#endif
				17	#ifdef HAVE_STDLIB_H
				18	#include <stdlib.h>
				19	#endif
				20	#ifdef HAVE_SYS_STAT_H
				21	#include <sys/stat.h>
				22	#endif
				23	#ifdef HAVE_FCNTL_H
				24	#include <fcntl.h>
				25	#endif
				26	#ifdef HAVE_UNISTD_H
				27	#include <unistd.h>
				28	#endif
				29	#ifdef HAVE_ZLIB_H
				30	#include <zlib.h>
				31	#endif
				32
				33	#include <libxml/xmlmemory.h>
				34	#include <libxml/tree.h>
				35	#include <libxml/parser.h>
				36	#include <libxml/parserInternals.h>
				37	#include <libxml/xmlerror.h>
				38	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	39	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	40	#include <libxml/entities.h>
				41	#include <libxml/encoding.h>
				42	#include <libxml/valid.h>
				43	#include <libxml/xmlIO.h>
Daniel Veillard	3c01b1d	2001-10-17 15:58:35 +0000	[diff] [blame]	44	#include <libxml/globals.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	45
				46	#define HTML_MAX_NAMELEN 1000
				47	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				48	#define HTML_PARSER_BUFFER_SIZE 100
				49
				50	/* #define DEBUG */
				51	/* #define DEBUG_PUSH */
				52
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	53	static int htmlOmittedDefaultValue = 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	54
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	55	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				56	xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillard	c1f7834	2001-11-10 11:43:05 +0000	[diff] [blame]	57	static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	58
				59	/************************************************************************
				60	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	61	* Parser stacks related functions and macros *
				62	* *
				63	************************************************************************/
				64
				65	/*
				66	* Generic function for accessing stacks in the Parser Context
				67	*/
				68
				69	#define PUSH_AND_POP(scope, type, name) \
				70	scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				71	if (ctxt->name##Nr >= ctxt->name##Max) { \
				72	ctxt->name##Max *= 2; \
				73	ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
				74	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				75	if (ctxt->name##Tab == NULL) { \
				76	xmlGenericError(xmlGenericErrorContext, \
				77	"realloc failed !\n"); \
				78	return(0); \
				79	} \
				80	} \
				81	ctxt->name##Tab[ctxt->name##Nr] = value; \
				82	ctxt->name = value; \
				83	return(ctxt->name##Nr++); \
				84	} \
				85	scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				86	type ret; \
				87	if (ctxt->name##Nr < 0) return(0); \
				88	ctxt->name##Nr--; \
				89	if (ctxt->name##Nr < 0) return(0); \
				90	if (ctxt->name##Nr > 0) \
				91	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				92	else \
				93	ctxt->name = NULL; \
				94	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				95	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				96	return(ret); \
				97	} \
				98
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	99	/* PUSH_AND_POP(static, xmlNodePtr, node) */
				100	PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	101
				102	/*
				103	* Macros for accessing the content. Those should be used only by the parser,
				104	* and not exported.
				105	*
				106	* Dirty macros, i.e. one need to make assumption on the context to use them
				107	*
				108	* CUR_PTR return the current pointer to the xmlChar to be parsed.
				109	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
				110	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				111	* in UNICODE mode. This should be used internally by the parser
				112	* only to compare to ASCII values otherwise it would break when
				113	* running with UTF-8 encoding.
				114	* NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
				115	* to compare on ASCII based substring.
				116	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
				117	* it should be used only to compare on ASCII based substring.
				118	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				119	* strings within the parser.
				120	*
				121	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				122	*
				123	* CURRENT Returns the current char value, with the full decoding of
				124	* UTF-8 if we are using this mode. It returns an int.
				125	* NEXT Skip to the next character, this does the proper decoding
				126	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
				127	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				128	*/
				129
				130	#define UPPER (toupper(*ctxt->input->cur))
				131
				132	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				133
				134	#define NXT(val) ctxt->input->cur[(val)]
				135
				136	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				137
				138	#define CUR_PTR ctxt->input->cur
				139
				140	#define SHRINK xmlParserInputShrink(ctxt->input)
				141
				142	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				143
				144	#define CURRENT ((int) (*ctxt->input->cur))
				145
				146	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				147
				148	/* Imported from XML */
				149
Daniel Veillard	561b7f8	2002-03-20 21:55:57 +0000	[diff] [blame]	150	/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
				151	#define CUR ((int) (*ctxt->input->cur))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	152	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				153
Daniel Veillard	561b7f8	2002-03-20 21:55:57 +0000	[diff] [blame]	154	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	155	#define NXT(val) ctxt->input->cur[(val)]
				156	#define CUR_PTR ctxt->input->cur
				157
				158
				159	#define NEXTL(l) do { \
				160	if (*(ctxt->input->cur) == '\n') { \
				161	ctxt->input->line++; ctxt->input->col = 1; \
				162	} else ctxt->input->col++; \
				163	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				164	} while (0)
				165
				166	/************
				167	\
				168	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				169	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				170	************/
				171
				172	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				173	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				174
				175	#define COPY_BUF(l,b,i,v) \
				176	if (l == 1) b[i++] = (xmlChar) v; \
				177	else i += xmlCopyChar(l,&b[i],v)
				178
				179	/**
				180	* htmlCurrentChar:
				181	* @ctxt: the HTML parser context
				182	* @len: pointer to the length of the char read
				183	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	184	* The current char value, if using UTF-8 this may actually span multiple
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	185	* bytes in the input buffer. Implement the end of line normalization:
				186	* 2.11 End-of-Line Handling
				187	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				188	* char, then the encoding converter is plugged in automatically.
				189	*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	190	* Returns the current char value and its length
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	191	*/
				192
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	193	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	194	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				195	if (ctxt->instate == XML_PARSER_EOF)
				196	return(0);
				197
				198	if (ctxt->token != 0) {
				199	*len = 0;
				200	return(ctxt->token);
				201	}
				202	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				203	/*
				204	* We are supposed to handle UTF8, check it's valid
				205	* From rfc2044: encoding of the Unicode values on UTF-8:
				206	*
				207	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				208	* 0000 0000-0000 007F 0xxxxxxx
				209	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				210	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				211	*
				212	* Check for the 0x110000 limit too
				213	*/
				214	const unsigned char *cur = ctxt->input->cur;
				215	unsigned char c;
				216	unsigned int val;
				217
				218	c = *cur;
				219	if (c & 0x80) {
				220	if (cur[1] == 0)
				221	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				222	if ((cur[1] & 0xc0) != 0x80)
				223	goto encoding_error;
				224	if ((c & 0xe0) == 0xe0) {
				225
				226	if (cur[2] == 0)
				227	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				228	if ((cur[2] & 0xc0) != 0x80)
				229	goto encoding_error;
				230	if ((c & 0xf0) == 0xf0) {
				231	if (cur[3] == 0)
				232	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				233	if (((c & 0xf8) != 0xf0) \|\|
				234	((cur[3] & 0xc0) != 0x80))
				235	goto encoding_error;
				236	/* 4-byte code */
				237	*len = 4;
				238	val = (cur[0] & 0x7) << 18;
				239	val \|= (cur[1] & 0x3f) << 12;
				240	val \|= (cur[2] & 0x3f) << 6;
				241	val \|= cur[3] & 0x3f;
				242	} else {
				243	/* 3-byte code */
				244	*len = 3;
				245	val = (cur[0] & 0xf) << 12;
				246	val \|= (cur[1] & 0x3f) << 6;
				247	val \|= cur[2] & 0x3f;
				248	}
				249	} else {
				250	/* 2-byte code */
				251	*len = 2;
				252	val = (cur[0] & 0x1f) << 6;
				253	val \|= cur[1] & 0x3f;
				254	}
				255	if (!IS_CHAR(val)) {
				256	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				257	if ((ctxt->sax != NULL) &&
				258	(ctxt->sax->error != NULL))
				259	ctxt->sax->error(ctxt->userData,
				260	"Char 0x%X out of allowed range\n", val);
				261	ctxt->wellFormed = 0;
				262	ctxt->disableSAX = 1;
				263	}
				264	return(val);
				265	} else {
				266	/* 1-byte code */
				267	*len = 1;
				268	return((int) *ctxt->input->cur);
				269	}
				270	}
				271	/*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	272	* Assume it's a fixed length encoding (1) with
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	273	* a compatible encoding for the ASCII set, since
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	274	* XML constructs only use < 128 chars
				275	*/
				276	*len = 1;
				277	if ((int) *ctxt->input->cur < 0x80)
				278	return((int) *ctxt->input->cur);
				279
				280	/*
				281	* Humm this is bad, do an automatic flow conversion
				282	*/
				283	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				284	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				285	return(xmlCurrentChar(ctxt, len));
				286
				287	encoding_error:
				288	/*
				289	* If we detect an UTF8 error that probably mean that the
				290	* input encoding didn't get properly advertized in the
				291	* declaration header. Report the error and switch the encoding
				292	* to ISO-Latin-1 (if you don't like this policy, just declare the
				293	* encoding !)
				294	*/
				295	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				296	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				297	ctxt->sax->error(ctxt->userData,
				298	"Input is not proper UTF-8, indicate encoding !\n");
				299	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				300	ctxt->input->cur[0], ctxt->input->cur[1],
				301	ctxt->input->cur[2], ctxt->input->cur[3]);
				302	}
				303
				304	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				305	*len = 1;
				306	return((int) *ctxt->input->cur);
				307	}
				308
				309	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	310	* htmlSkipBlankChars:
				311	* @ctxt: the HTML parser context
				312	*
				313	* skip all blanks character found at that point in the input streams.
				314	*
				315	* Returns the number of space chars skipped
				316	*/
				317
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	318	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	319	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				320	int res = 0;
				321
				322	while (IS_BLANK(*(ctxt->input->cur))) {
				323	if ((*ctxt->input->cur == 0) &&
				324	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				325	xmlPopInput(ctxt);
				326	} else {
				327	if (*(ctxt->input->cur) == '\n') {
				328	ctxt->input->line++; ctxt->input->col = 1;
				329	} else ctxt->input->col++;
				330	ctxt->input->cur++;
				331	ctxt->nbChars++;
				332	if (*ctxt->input->cur == 0)
				333	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				334	}
				335	res++;
				336	}
				337	return(res);
				338	}
				339
				340
				341
				342	/************************************************************************
				343	* *
				344	* The list of HTML elements and their properties *
				345	* *
				346	************************************************************************/
				347
				348	/*
				349	* Start Tag: 1 means the start tag can be omitted
				350	* End Tag: 1 means the end tag can be omitted
				351	* 2 means it's forbidden (empty elements)
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	352	* 3 means the tag is stylistic and should be closed easily
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	353	* Depr: this element is deprecated
				354	* DTD: 1 means that this element is valid only in the Loose DTD
				355	* 2 means that this element is valid only in the Frameset DTD
				356	*
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	357	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	358	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	359	static const htmlElemDesc
				360	html40ElementTable[] = {
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	361	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
				362	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
				363	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
				364	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
				365	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
				366	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
				367	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
				368	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
				369	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
				370	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
				371	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
				372	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
				373	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
				374	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
				375	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
				376	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
				377	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
				378	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
				379	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
				380	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
				381	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
				382	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
				383	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
				384	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
				385	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
				386	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
				387	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
				388	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
				389	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
				390	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
				391	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
				392	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
				393	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
				394	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
				395	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
				396	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
				397	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
				398	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
				399	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
				400	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
				401	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
				402	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
				403	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
				404	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
				405	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
				406	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
				407	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
				408	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
				409	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
				410	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
				411	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
				412	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
				413	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
				414	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
				415	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
				416	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
				417	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
				418	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
				419	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
				420	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
				421	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
				422	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
				423	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
				424	{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
				425	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
				426	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
				427	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
				428	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
				429	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
				430	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
				431	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
				432	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
				433	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
				434	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
				435	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
				436	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
				437	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
				438	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
				439	{ "table", 0, 0, 0, 0, 0, 0, 0, " " },
				440	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
				441	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
				442	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
				443	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
				444	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
				445	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
				446	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
				447	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
				448	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
				449	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
				450	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
				451	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	452	};
				453
				454	/*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	455	* start tags that imply the end of current element
				456	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	457	static const char *htmlStartClose[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	458	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				459	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				460	"listing", "xmp", "head", NULL,
				461	"head", "p", NULL,
				462	"title", "p", NULL,
				463	"body", "head", "style", "link", "title", "p", NULL,
				464	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				465	"pre", "listing", "xmp", "head", "li", NULL,
				466	"hr", "p", "head", NULL,
				467	"h1", "p", "head", NULL,
				468	"h2", "p", "head", NULL,
				469	"h3", "p", "head", NULL,
				470	"h4", "p", "head", NULL,
				471	"h5", "p", "head", NULL,
				472	"h6", "p", "head", NULL,
				473	"dir", "p", "head", NULL,
				474	"address", "p", "head", "ul", NULL,
				475	"pre", "p", "head", "ul", NULL,
				476	"listing", "p", "head", NULL,
				477	"xmp", "p", "head", NULL,
				478	"blockquote", "p", "head", NULL,
				479	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				480	"xmp", "head", NULL,
				481	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				482	"head", "dd", NULL,
				483	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				484	"head", "dt", NULL,
				485	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				486	"listing", "xmp", NULL,
				487	"ol", "p", "head", "ul", NULL,
				488	"menu", "p", "head", "ul", NULL,
				489	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
				490	"div", "p", "head", NULL,
				491	"noscript", "p", "head", NULL,
				492	"center", "font", "b", "i", "p", "head", NULL,
				493	"a", "a", NULL,
				494	"caption", "p", NULL,
				495	"colgroup", "caption", "colgroup", "col", "p", NULL,
				496	"col", "caption", "col", "p", NULL,
				497	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				498	"listing", "xmp", "a", NULL,
Daniel Veillard	43dadeb	2001-04-24 11:23:35 +0000	[diff] [blame]	499	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				500	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	501	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				502	"thead", "caption", "col", "colgroup", NULL,
				503	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				504	"tbody", "p", NULL,
				505	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				506	"tfoot", "tbody", "p", NULL,
				507	"optgroup", "option", NULL,
				508	"option", "option", NULL,
				509	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				510	"pre", "listing", "xmp", "a", NULL,
				511	NULL
				512	};
				513
				514	/*
				515	* The list of HTML elements which are supposed not to have
				516	* CDATA content and where a p element will be implied
				517	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	518	* TODO: extend that list by reading the HTML SGML DTD on
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	519	* implied paragraph
				520	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	521	static const char *htmlNoContentElements[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	522	"html",
				523	"head",
				524	"body",
				525	NULL
				526	};
				527
				528	/*
				529	* The list of HTML attributes which are of content %Script;
				530	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				531	* it assumes the name starts with 'on'
				532	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	533	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	534	"onclick",
				535	"ondblclick",
				536	"onmousedown",
				537	"onmouseup",
				538	"onmouseover",
				539	"onmousemove",
				540	"onmouseout",
				541	"onkeypress",
				542	"onkeydown",
				543	"onkeyup",
				544	"onload",
				545	"onunload",
				546	"onfocus",
				547	"onblur",
				548	"onsubmit",
				549	"onrest",
				550	"onchange",
				551	"onselect"
				552	};
				553
/*
 * End-tag priorities, used by the HTML parser to cope with broken pages
 * carrying stray end tags: an end tag may only close open elements whose
 * priority is lower than or equal to its own.
 */

/* one (element name, end-tag priority) association */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },

    /* table cells and rows */
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },

    /* table sections and the table itself */
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },

    /* document structure */
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },

    /* terminator: its priority is the default for unlisted elements */
    { NULL,    100 }
};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	581
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	582	static const char** htmlStartCloseIndex[100];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	583	static int htmlStartCloseIndexinitialized = 0;
				584
				585	/************************************************************************
				586	* *
				587	* functions to handle HTML specific data *
				588	* *
				589	************************************************************************/
				590
				591	/**
				592	* htmlInitAutoClose:
				593	*
				594	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				595	* This is not reentrant. Call xmlInitParser() once before processing in
				596	* case of use in multithreaded programs.
				597	*/
				598	void
				599	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	600	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	601
				602	if (htmlStartCloseIndexinitialized) return;
				603
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	604	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				605	indx = 0;
				606	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				607	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	608	while (htmlStartClose[i] != NULL) i++;
				609	i++;
				610	}
				611	htmlStartCloseIndexinitialized = 1;
				612	}
				613
				614	/**
				615	* htmlTagLookup:
				616	* @tag: The tag name in lowercase
				617	*
				618	* Lookup the HTML tag in the ElementTable
				619	*
				620	* Returns the related htmlElemDescPtr or NULL if not found.
				621	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	622	const htmlElemDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	623	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	624	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	625
				626	for (i = 0; i < (sizeof(html40ElementTable) /
				627	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	1ed3f88	2001-04-18 09:45:35 +0000	[diff] [blame]	628	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	629	return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	630	}
				631	return(NULL);
				632	}
				633
				634	/**
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	635	* htmlGetEndPriority:
				636	* @name: The name of the element to look up the priority for.
				637	*
				638	* Return value: The "endtag" priority.
				639	**/
				640	static int
				641	htmlGetEndPriority (const xmlChar *name) {
				642	int i = 0;
				643
				644	while ((htmlEndPriority[i].name != NULL) &&
				645	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				646	i++;
				647
				648	return(htmlEndPriority[i].priority);
				649	}
				650
				651	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	652	* htmlCheckAutoClose:
				653	* @newtag: The new tag name
				654	* @oldtag: The old tag name
				655	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	656	* Checks whether the new tag is one of the registered valid tags for
				657	* closing old.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	658	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				659	*
				660	* Returns 0 if no, 1 if yes.
				661	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	662	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	663	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	664	int i, indx;
				665	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	666
				667	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				668
				669	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	670	for (indx = 0; indx < 100;indx++) {
				671	closed = htmlStartCloseIndex[indx];
				672	if (closed == NULL) return(0);
				673	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	674	}
				675
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	676	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	677	i++;
				678	while (htmlStartClose[i] != NULL) {
				679	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				680	return(1);
				681	}
				682	i++;
				683	}
				684	return(0);
				685	}
				686
				687	/**
				688	* htmlAutoCloseOnClose:
				689	* @ctxt: an HTML parser context
				690	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	691	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	692	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	693	* The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	694	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	695	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	696	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	697	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	698	xmlChar *oldname;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	699	int i, priority;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	700
				701	#ifdef DEBUG
				702	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				703	for (i = 0;i < ctxt->nameNr;i++)
				704	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				705	#endif
				706
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	707	priority = htmlGetEndPriority (newtag);
				708
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	709	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	710
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	711	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	712	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	713	* A missplaced endtag can only close elements with lower
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	714	* or equal priority, so if we find an element with higher
				715	* priority before we find an element with
				716	* matching name, we just ignore this endtag
				717	*/
				718	if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	719	}
				720	if (i < 0) return;
				721
				722	while (!xmlStrEqual(newtag, ctxt->name)) {
				723	info = htmlTagLookup(ctxt->name);
				724	if ((info == NULL) \|\| (info->endTag == 1)) {
				725	#ifdef DEBUG
				726	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				727	#endif
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	728	} else if (info->endTag == 3) {
				729	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	730	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack	1633d18	2001-10-05 15:41:19 +0000	[diff] [blame]	731
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	732	#endif
				733	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				734	ctxt->sax->error(ctxt->userData,
				735	"Opening and ending tag mismatch: %s and %s\n",
				736	newtag, ctxt->name);
				737	ctxt->wellFormed = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	738	}
				739	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				740	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				741	oldname = htmlnamePop(ctxt);
				742	if (oldname != NULL) {
				743	#ifdef DEBUG
				744	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				745	#endif
				746	xmlFree(oldname);
				747	}
				748	}
				749	}
				750
				751	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	752	* htmlAutoCloseOnEnd:
				753	* @ctxt: an HTML parser context
				754	*
				755	* Close all remaining tags at the end of the stream
				756	*/
				757	static void
				758	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				759	xmlChar *oldname;
				760	int i;
				761
				762	if (ctxt->nameNr == 0)
				763	return;
				764	#ifdef DEBUG
				765	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				766	#endif
				767
				768	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				769	#ifdef DEBUG
				770	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				771	#endif
				772	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				773	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				774	oldname = htmlnamePop(ctxt);
				775	if (oldname != NULL) {
				776	#ifdef DEBUG
				777	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				778	#endif
				779	xmlFree(oldname);
				780	}
				781	}
				782	}
				783
				784	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	785	* htmlAutoClose:
				786	* @ctxt: an HTML parser context
				787	* @newtag: The new tag name or NULL
				788	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	789	* The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	790	* The list is kept in htmlStartClose array. This function is
				791	* called when a new tag has been detected and generates the
				792	* appropriates closes if possible/needed.
				793	* If newtag is NULL this mean we are at the end of the resource
				794	* and we should check
				795	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	796	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	797	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				798	xmlChar *oldname;
				799	while ((newtag != NULL) && (ctxt->name != NULL) &&
				800	(htmlCheckAutoClose(newtag, ctxt->name))) {
				801	#ifdef DEBUG
				802	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				803	#endif
				804	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				805	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				806	oldname = htmlnamePop(ctxt);
				807	if (oldname != NULL) {
				808	#ifdef DEBUG
				809	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				810	#endif
				811	xmlFree(oldname);
				812	}
				813	}
				814	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	815	htmlAutoCloseOnEnd(ctxt);
				816	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	817	}
				818	while ((newtag == NULL) && (ctxt->name != NULL) &&
				819	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				820	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				821	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				822	#ifdef DEBUG
				823	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				824	#endif
				825	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				826	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				827	oldname = htmlnamePop(ctxt);
				828	if (oldname != NULL) {
				829	#ifdef DEBUG
				830	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				831	#endif
				832	xmlFree(oldname);
				833	}
				834	}
				835
				836	}
				837
				838	/**
				839	* htmlAutoCloseTag:
				840	* @doc: the HTML document
				841	* @name: The tag name
				842	* @elem: the HTML element
				843	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	844	* The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	845	* The list is kept in htmlStartClose array. This function checks
				846	* if the element or one of it's children would autoclose the
				847	* given tag.
				848	*
				849	* Returns 1 if autoclose, 0 otherwise
				850	*/
				851	int
				852	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				853	htmlNodePtr child;
				854
				855	if (elem == NULL) return(1);
				856	if (xmlStrEqual(name, elem->name)) return(0);
				857	if (htmlCheckAutoClose(elem->name, name)) return(1);
				858	child = elem->children;
				859	while (child != NULL) {
				860	if (htmlAutoCloseTag(doc, name, child)) return(1);
				861	child = child->next;
				862	}
				863	return(0);
				864	}
				865
				866	/**
				867	* htmlIsAutoClosed:
				868	* @doc: the HTML document
				869	* @elem: the HTML element
				870	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	871	* The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	872	* The list is kept in htmlStartClose array. This function checks
				873	* if a tag is autoclosed by one of it's child
				874	*
				875	* Returns 1 if autoclosed, 0 otherwise
				876	*/
				877	int
				878	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				879	htmlNodePtr child;
				880
				881	if (elem == NULL) return(1);
				882	child = elem->children;
				883	while (child != NULL) {
				884	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				885	child = child->next;
				886	}
				887	return(0);
				888	}
				889
				890	/**
				891	* htmlCheckImplied:
				892	* @ctxt: an HTML parser context
				893	* @newtag: The new tag name
				894	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	895	* The HTML DTD allows a tag to exists only implicitly
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	896	* called when a new tag has been detected and generates the
				897	* appropriates implicit tags if missing
				898	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	899	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	900	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				901	if (!htmlOmittedDefaultValue)
				902	return;
				903	if (xmlStrEqual(newtag, BAD_CAST"html"))
				904	return;
				905	if (ctxt->nameNr <= 0) {
				906	#ifdef DEBUG
				907	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				908	#endif
				909	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				910	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				911	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				912	}
				913	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				914	return;
				915	if ((ctxt->nameNr <= 1) &&
				916	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				917	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				918	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				919	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				920	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				921	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				922	/*
				923	* dropped OBJECT ... i you put it first BODY will be
				924	* assumed !
				925	*/
				926	#ifdef DEBUG
				927	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				928	#endif
				929	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				930	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				931	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				932	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				933	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				934	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				935	int i;
				936	for (i = 0;i < ctxt->nameNr;i++) {
				937	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				938	return;
				939	}
				940	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				941	return;
				942	}
				943	}
				944
				945	#ifdef DEBUG
				946	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				947	#endif
				948	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				949	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				950	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				951	}
				952	}
				953
				954	/**
				955	* htmlCheckParagraph
				956	* @ctxt: an HTML parser context
				957	*
				958	* Check whether a p element need to be implied before inserting
				959	* characters in the current element.
				960	*
				961	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				962	* in case of error.
				963	*/
				964
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	965	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	966	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				967	const xmlChar *tag;
				968	int i;
				969
				970	if (ctxt == NULL)
				971	return(-1);
				972	tag = ctxt->name;
				973	if (tag == NULL) {
				974	htmlAutoClose(ctxt, BAD_CAST"p");
				975	htmlCheckImplied(ctxt, BAD_CAST"p");
				976	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				977	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				978	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				979	return(1);
				980	}
				981	if (!htmlOmittedDefaultValue)
				982	return(0);
				983	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				984	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				985	#ifdef DEBUG
				986	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				987	#endif
				988	htmlAutoClose(ctxt, BAD_CAST"p");
				989	htmlCheckImplied(ctxt, BAD_CAST"p");
				990	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				991	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				992	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				993	return(1);
				994	}
				995	}
				996	return(0);
				997	}
				998
				999	/**
				1000	* htmlIsScriptAttribute:
				1001	* @name: an attribute name
				1002	*
				1003	* Check if an attribute is of content type Script
				1004	*
				1005	* Returns 1 is the attribute is a script 0 otherwise
				1006	*/
				1007	int
				1008	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1009	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1010
				1011	if (name == NULL)
				1012	return(0);
				1013	/*
				1014	* all script attributes start with 'on'
				1015	*/
				1016	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1017	return(0);
				1018	for (i = 0;
				1019	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1020	i++) {
				1021	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1022	return(1);
				1023	}
				1024	return(0);
				1025	}
				1026
				1027	/************************************************************************
				1028	* *
				1029	* The list of HTML predefined entities *
				1030	* *
				1031	************************************************************************/
				1032
				1033
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1034	static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1035	/*
				1036	* the 4 absolute ones, plus apostrophe.
				1037	*/
				1038	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1039	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1040	{ 39, "apos", "single quote" },
				1041	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1042	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1043
				1044	/*
				1045	* A bunch still in the 128-255 range
				1046	* Replacing them depend really on the charset used.
				1047	*/
				1048	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1049	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1050	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1051	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1052	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1053	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1054	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1055	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1056	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1057	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1058	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1059	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1060	{ 172, "not", "not sign, U+00AC ISOnum" },
				1061	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1062	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1063	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1064	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1065	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1066	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1067	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1068	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1069	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1070	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1071	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1072	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1073	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1074	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1075	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1076	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1077	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1078	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1079	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1080	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1081	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1082	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1083	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1084	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1085	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1086	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1087	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1088	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1089	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1090	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1091	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1092	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1093	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1094	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1095	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1096	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1097	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1098	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1099	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1100	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1101	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1102	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1103	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1104	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1105	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1106	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1107	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1108	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1109	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1110	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1111	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1112	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1113	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1114	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1115	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1116	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1117	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1118	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1119	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1120	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1121	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1122	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1123	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1124	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1125	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1126	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1127	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1128	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1129	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1130	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1131	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1132	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1133	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1134	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1135	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1136	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1137	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1138	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1139	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1140	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1141	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1142	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1143	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1144
				1145	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1146	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1147	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1148	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1149	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1150
				1151	/*
				1152	* Anything below should really be kept as entities references
				1153	*/
				1154	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1155
				1156	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1157	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1158
				1159	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1160	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1161	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1162	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1163	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1164	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1165	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1166	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1167	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1168	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1169	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1170	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1171	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1172	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1173	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1174	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1175	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1176	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1177	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1178	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1179	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1180	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1181	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1182	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1183
				1184	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1185	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1186	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1187	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1188	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1189	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1190	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1191	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1192	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1193	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1194	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1195	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1196	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1197	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1198	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1199	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1200	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1201	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1202	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1203	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1204	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1205	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1206	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1207	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1208	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1209	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1210	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1211	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1212
				1213	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1214	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1215	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1216	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1217	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1218	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1219	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1220	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1221	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1222	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1223	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1224	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1225	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1226	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1227	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1228	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1229	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1230
				1231	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1232	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1233
				1234	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1235
				1236	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1237	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1238
				1239	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1240	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1241
				1242	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1243	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1244
				1245	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1246
				1247	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1248	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1249	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1250	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1251	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1252	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1253	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1254	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1255	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1256	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1257	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1258	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1259	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1260	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1261	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1262	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1263
				1264	{ 8704, "forall","for all, U+2200 ISOtech" },
				1265	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1266	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1267	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1268	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1269	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1270	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1271	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1272	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1273	{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1274	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1275	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1276	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1277	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1278	{ 8734, "infin","infinity, U+221E ISOtech" },
				1279	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1280	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1281	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1282	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1283	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1284	{ 8747, "int", "integral, U+222B ISOtech" },
				1285	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1286	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1287	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1288	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1289	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1290	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1291	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1292	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1293	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1294	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1295	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1296	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1297	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1298	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1299	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1300	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1301	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1302	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1303	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1304	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1305	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1306	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1307	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1308	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1309
				1310	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1311	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1312	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1313	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1314
				1315	};
				1316
				1317	/************************************************************************
				1318	* *
				1319	* Commodity functions to handle entities *
				1320	* *
				1321	************************************************************************/
				1322
				1323	/*
				1324	* Macro used to grow the current buffer.
				1325	*/
				1326	#define growBuffer(buffer) { \
				1327	buffer##_size *= 2; \
				1328	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1329	if (buffer == NULL) { \
				1330	perror("realloc failed"); \
				1331	return(NULL); \
				1332	} \
				1333	}
				1334
				1335	/**
				1336	* htmlEntityLookup:
				1337	* @name: the entity name
				1338	*
				1339	* Lookup the given entity in EntitiesTable
				1340	*
				1341	* TODO: the linear scan is really ugly, an hash table is really needed.
				1342	*
				1343	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1344	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1345	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1346	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1347	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1348
				1349	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1350	sizeof(html40EntitiesTable[0]));i++) {
				1351	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1352	#ifdef DEBUG
				1353	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1354	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1355	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1356	}
				1357	}
				1358	return(NULL);
				1359	}
				1360
				1361	/**
				1362	* htmlEntityValueLookup:
				1363	* @value: the entity's unicode value
				1364	*
				1365	* Lookup the given entity in EntitiesTable
				1366	*
				1367	* TODO: the linear scan is really ugly, an hash table is really needed.
				1368	*
				1369	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1370	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1371	const htmlEntityDesc *
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1372	htmlEntityValueLookup(unsigned int value) {
				1373	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1374	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	1375	unsigned int lv = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1376	#endif
				1377
				1378	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1379	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1380	if (html40EntitiesTable[i].value >= value) {
				1381	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1382	break;
				1383	#ifdef DEBUG
				1384	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1385	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1386	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1387	}
				1388	#ifdef DEBUG
				1389	if (lv > html40EntitiesTable[i].value) {
				1390	xmlGenericError(xmlGenericErrorContext,
				1391	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1392	lv, html40EntitiesTable[i].value);
				1393	}
				1394	lv = html40EntitiesTable[i].value;
				1395	#endif
				1396	}
				1397	return(NULL);
				1398	}
				1399
				1400	/**
				1401	* UTF8ToHtml:
				1402	* @out: a pointer to an array of bytes to store the result
				1403	* @outlen: the length of @out
				1404	* @in: a pointer to an array of UTF-8 chars
				1405	* @inlen: the length of @in
				1406	*
				1407	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1408	* plus HTML entities block of chars out.
				1409	*
				1410	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1411	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1412	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1413	* The value of @outlen after return is the number of octets consumed.
				1414	*/
				1415	int
				1416	UTF8ToHtml(unsigned char* out, int *outlen,
				1417	const unsigned char* in, int *inlen) {
				1418	const unsigned char* processed = in;
				1419	const unsigned char* outend;
				1420	const unsigned char* outstart = out;
				1421	const unsigned char* instart = in;
				1422	const unsigned char* inend;
				1423	unsigned int c, d;
				1424	int trailing;
				1425
				1426	if (in == NULL) {
				1427	/*
				1428	* initialization nothing to do
				1429	*/
				1430	*outlen = 0;
				1431	*inlen = 0;
				1432	return(0);
				1433	}
				1434	inend = in + (*inlen);
				1435	outend = out + (*outlen);
				1436	while (in < inend) {
				1437	d = *in++;
				1438	if (d < 0x80) { c= d; trailing= 0; }
				1439	else if (d < 0xC0) {
				1440	/* trailing byte in leading position */
				1441	*outlen = out - outstart;
				1442	*inlen = processed - instart;
				1443	return(-2);
				1444	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1445	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1446	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1447	else {
				1448	/* no chance for this in Ascii */
				1449	*outlen = out - outstart;
				1450	*inlen = processed - instart;
				1451	return(-2);
				1452	}
				1453
				1454	if (inend - in < trailing) {
				1455	break;
				1456	}
				1457
				1458	for ( ; trailing; trailing--) {
				1459	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1460	break;
				1461	c <<= 6;
				1462	c \|= d & 0x3F;
				1463	}
				1464
				1465	/* assertion: c is a single UTF-4 value */
				1466	if (c < 0x80) {
				1467	if (out + 1 >= outend)
				1468	break;
				1469	*out++ = c;
				1470	} else {
				1471	int len;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1472	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1473
				1474	/*
				1475	* Try to lookup a predefined HTML entity for it
				1476	*/
				1477
				1478	ent = htmlEntityValueLookup(c);
				1479	if (ent == NULL) {
				1480	/* no chance for this in Ascii */
				1481	*outlen = out - outstart;
				1482	*inlen = processed - instart;
				1483	return(-2);
				1484	}
				1485	len = strlen(ent->name);
				1486	if (out + 2 + len >= outend)
				1487	break;
				1488	*out++ = '&';
				1489	memcpy(out, ent->name, len);
				1490	out += len;
				1491	*out++ = ';';
				1492	}
				1493	processed = in;
				1494	}
				1495	*outlen = out - outstart;
				1496	*inlen = processed - instart;
				1497	return(0);
				1498	}
				1499
				1500	/**
				1501	* htmlEncodeEntities:
				1502	* @out: a pointer to an array of bytes to store the result
				1503	* @outlen: the length of @out
				1504	* @in: a pointer to an array of UTF-8 chars
				1505	* @inlen: the length of @in
				1506	* @quoteChar: the quote character to escape (' or ") or zero.
				1507	*
				1508	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1509	* plus HTML entities block of chars out.
				1510	*
				1511	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1512	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1513	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1514	* The value of @outlen after return is the number of octets consumed.
				1515	*/
				1516	int
				1517	htmlEncodeEntities(unsigned char* out, int *outlen,
				1518	const unsigned char* in, int *inlen, int quoteChar) {
				1519	const unsigned char* processed = in;
				1520	const unsigned char* outend = out + (*outlen);
				1521	const unsigned char* outstart = out;
				1522	const unsigned char* instart = in;
				1523	const unsigned char* inend = in + (*inlen);
				1524	unsigned int c, d;
				1525	int trailing;
				1526
				1527	while (in < inend) {
				1528	d = *in++;
				1529	if (d < 0x80) { c= d; trailing= 0; }
				1530	else if (d < 0xC0) {
				1531	/* trailing byte in leading position */
				1532	*outlen = out - outstart;
				1533	*inlen = processed - instart;
				1534	return(-2);
				1535	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1536	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1537	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1538	else {
				1539	/* no chance for this in Ascii */
				1540	*outlen = out - outstart;
				1541	*inlen = processed - instart;
				1542	return(-2);
				1543	}
				1544
				1545	if (inend - in < trailing)
				1546	break;
				1547
				1548	while (trailing--) {
				1549	if (((d= *in++) & 0xC0) != 0x80) {
				1550	*outlen = out - outstart;
				1551	*inlen = processed - instart;
				1552	return(-2);
				1553	}
				1554	c <<= 6;
				1555	c \|= d & 0x3F;
				1556	}
				1557
				1558	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1559	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1560	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1561	if (out >= outend)
				1562	break;
				1563	*out++ = c;
				1564	} else {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1565	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1566	const char *cp;
				1567	char nbuf[16];
				1568	int len;
				1569
				1570	/*
				1571	* Try to lookup a predefined HTML entity for it
				1572	*/
				1573	ent = htmlEntityValueLookup(c);
				1574	if (ent == NULL) {
Aleksey Sanin	49cc975	2002-06-14 17:07:10 +0000	[diff] [blame]	1575	snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1576	cp = nbuf;
				1577	}
				1578	else
				1579	cp = ent->name;
				1580	len = strlen(cp);
				1581	if (out + 2 + len > outend)
				1582	break;
				1583	*out++ = '&';
				1584	memcpy(out, cp, len);
				1585	out += len;
				1586	*out++ = ';';
				1587	}
				1588	processed = in;
				1589	}
				1590	*outlen = out - outstart;
				1591	*inlen = processed - instart;
				1592	return(0);
				1593	}
				1594
				1595	/**
				1596	* htmlDecodeEntities:
				1597	* @ctxt: the parser context
				1598	* @len: the len to decode (in bytes !), -1 for no size limit
				1599	* @end: an end marker xmlChar, 0 if none
				1600	* @end2: an end marker xmlChar, 0 if none
				1601	* @end3: an end marker xmlChar, 0 if none
				1602	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1603	* Substitute the HTML entities by their value
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1604	*
				1605	* DEPRECATED !!!!
				1606	*
				1607	* Returns A newly allocated string with the substitution done. The caller
				1608	* must deallocate it !
				1609	*/
				1610	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1611	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1612	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1613	static int deprecated = 0;
				1614	if (!deprecated) {
				1615	xmlGenericError(xmlGenericErrorContext,
				1616	"htmlDecodeEntities() deprecated function reached\n");
				1617	deprecated = 1;
				1618	}
				1619	return(NULL);
				1620	#if 0
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1621	xmlChar *name = NULL;
				1622	xmlChar *buffer = NULL;
				1623	unsigned int buffer_size = 0;
				1624	unsigned int nbchars = 0;
				1625	htmlEntityDescPtr ent;
				1626	unsigned int max = (unsigned int) len;
				1627	int c,l;
				1628
				1629	if (ctxt->depth > 40) {
				1630	ctxt->errNo = XML_ERR_ENTITY_LOOP;
				1631	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1632	ctxt->sax->error(ctxt->userData,
				1633	"Detected entity reference loop\n");
				1634	ctxt->wellFormed = 0;
				1635	ctxt->disableSAX = 1;
				1636	return(NULL);
				1637	}
				1638
				1639	/*
				1640	* allocate a translation buffer.
				1641	*/
				1642	buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
				1643	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1644	if (buffer == NULL) {
				1645	perror("xmlDecodeEntities: malloc failed");
				1646	return(NULL);
				1647	}
				1648
				1649	/*
				1650	* Ok loop until we reach one of the ending char or a size limit.
				1651	*/
				1652	c = CUR_CHAR(l);
				1653	while ((nbchars < max) && (c != end) &&
				1654	(c != end2) && (c != end3)) {
				1655
				1656	if (c == 0) break;
				1657	if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
				1658	int val = htmlParseCharRef(ctxt);
				1659	COPY_BUF(0,buffer,nbchars,val);
				1660	NEXTL(l);
				1661	} else if ((c == '&') && (ctxt->token != '&')) {
				1662	ent = htmlParseEntityRef(ctxt, &name);
				1663	if (name != NULL) {
				1664	if (ent != NULL) {
				1665	int val = ent->value;
				1666	COPY_BUF(0,buffer,nbchars,val);
				1667	NEXTL(l);
				1668	} else {
				1669	const xmlChar *cur = name;
				1670
				1671	buffer[nbchars++] = '&';
				1672	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1673	growBuffer(buffer);
				1674	}
				1675	while (*cur != 0) {
				1676	buffer[nbchars++] = *cur++;
				1677	}
				1678	buffer[nbchars++] = ';';
				1679	}
				1680	}
				1681	} else {
				1682	COPY_BUF(l,buffer,nbchars,c);
				1683	NEXTL(l);
				1684	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1685	growBuffer(buffer);
				1686	}
				1687	}
				1688	c = CUR_CHAR(l);
				1689	}
				1690	buffer[nbchars++] = 0;
				1691	return(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1692	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1693	}
				1694
				1695	/************************************************************************
				1696	* *
				1697	* Commodity functions to handle streams *
				1698	* *
				1699	************************************************************************/
				1700
				1701	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1702	* htmlNewInputStream:
				1703	* @ctxt: an HTML parser context
				1704	*
				1705	* Create a new input stream structure
				1706	* Returns the new input stream or NULL
				1707	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1708	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1709	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1710	htmlParserInputPtr input;
				1711
				1712	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1713	if (input == NULL) {
				1714	ctxt->errNo = XML_ERR_NO_MEMORY;
				1715	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1716	ctxt->sax->error(ctxt->userData,
				1717	"malloc: couldn't allocate a new input stream\n");
				1718	return(NULL);
				1719	}
				1720	memset(input, 0, sizeof(htmlParserInput));
				1721	input->filename = NULL;
				1722	input->directory = NULL;
				1723	input->base = NULL;
				1724	input->cur = NULL;
				1725	input->buf = NULL;
				1726	input->line = 1;
				1727	input->col = 1;
				1728	input->buf = NULL;
				1729	input->free = NULL;
				1730	input->version = NULL;
				1731	input->consumed = 0;
				1732	input->length = 0;
				1733	return(input);
				1734	}
				1735
				1736
				1737	/************************************************************************
				1738	* *
				1739	* Commodity functions, cleanup needed ? *
				1740	* *
				1741	************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * The table is read-only: both the strings and the array of pointers
 * are const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1756
				1757	/**
				1758	* areBlanks:
				1759	* @ctxt: an HTML parser context
				1760	* @str: a xmlChar *
				1761	* @len: the size of @str
				1762	*
				1763	* Is this a sequence of blank chars that one can ignore ?
				1764	*
				1765	* Returns 1 if ignorable 0 otherwise.
				1766	*/
				1767
				1768	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1769	unsigned int i;
				1770	int j;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1771	xmlNodePtr lastChild;
				1772
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1773	for (j = 0;j < len;j++)
				1774	if (!(IS_BLANK(str[j]))) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1775
				1776	if (CUR == 0) return(1);
				1777	if (CUR != '<') return(0);
				1778	if (ctxt->name == NULL)
				1779	return(1);
				1780	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1781	return(1);
				1782	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1783	return(1);
				1784	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1785	return(1);
				1786	if (ctxt->node == NULL) return(0);
				1787	lastChild = xmlGetLastChild(ctxt->node);
				1788	if (lastChild == NULL) {
Daniel Veillard	7db3773	2001-07-12 01:20:08 +0000	[diff] [blame]	1789	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
				1790	(ctxt->node->content != NULL)) return(0);
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1791	/* keep ws in constructs like ...<b> </b>...
				1792	for all tags "b" allowing PCDATA */
				1793	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
				1794	if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
				1795	return(0);
				1796	}
				1797	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1798	} else if (xmlNodeIsText(lastChild)) {
				1799	return(0);
Daniel Veillard	8c9872c	2002-07-05 18:17:10 +0000	[diff] [blame]	1800	} else {
				1801	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
				1802	for all tags "p" allowing PCDATA */
				1803	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
				1804	if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
				1805	return(0);
				1806	}
				1807	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1808	}
				1809	return(1);
				1810	}
				1811
				1812	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1813	* htmlNewDocNoDtD:
				1814	* @URI: URI for the dtd, or NULL
				1815	* @ExternalID: the external ID of the DTD, or NULL
				1816	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1817	* Creates a new HTML document without a DTD node if @URI and @ExternalID
				1818	* are NULL
				1819	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1820	* Returns a new document, do not initialize the DTD if not provided
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1821	*/
				1822	htmlDocPtr
				1823	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1824	xmlDocPtr cur;
				1825
				1826	/*
				1827	* Allocate a new document and fill the fields.
				1828	*/
				1829	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1830	if (cur == NULL) {
				1831	xmlGenericError(xmlGenericErrorContext,
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1832	"htmlNewDocNoDtD : malloc failed\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1833	return(NULL);
				1834	}
				1835	memset(cur, 0, sizeof(xmlDoc));
				1836
				1837	cur->type = XML_HTML_DOCUMENT_NODE;
				1838	cur->version = NULL;
				1839	cur->intSubset = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1840	cur->doc = cur;
				1841	cur->name = NULL;
				1842	cur->children = NULL;
				1843	cur->extSubset = NULL;
				1844	cur->oldNs = NULL;
				1845	cur->encoding = NULL;
				1846	cur->standalone = 1;
				1847	cur->compression = 0;
				1848	cur->ids = NULL;
				1849	cur->refs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1850	cur->_private = NULL;
Daniel Veillard	b6b0fd8	2001-10-22 12:31:11 +0000	[diff] [blame]	1851	if ((ExternalID != NULL) \|\|
				1852	(URI != NULL))
Daniel Veillard	5151c06	2001-10-23 13:10:19 +0000	[diff] [blame]	1853	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1854	return(cur);
				1855	}
				1856
				1857	/**
				1858	* htmlNewDoc:
				1859	* @URI: URI for the dtd, or NULL
				1860	* @ExternalID: the external ID of the DTD, or NULL
				1861	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1862	* Creates a new HTML document
				1863	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1864	* Returns a new document
				1865	*/
				1866	htmlDocPtr
				1867	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1868	if ((URI == NULL) && (ExternalID == NULL))
				1869	return(htmlNewDocNoDtD(
Daniel Veillard	6426935	2001-05-04 17:52:34 +0000	[diff] [blame]	1870	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				1871	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1872
				1873	return(htmlNewDocNoDtD(URI, ExternalID));
				1874	}
				1875
				1876
				1877	/************************************************************************
				1878	* *
				1879	* The parser itself *
				1880	* Relates to http://www.w3.org/TR/html40 *
				1881	* *
				1882	************************************************************************/
				1883
				1884	/************************************************************************
				1885	* *
				1886	* The parser itself *
				1887	* *
				1888	************************************************************************/
				1889
				1890	/**
				1891	* htmlParseHTMLName:
				1892	* @ctxt: an HTML parser context
				1893	*
				1894	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1895	* since HTML names are not case-sensitive.
				1896	*
				1897	* Returns the Tag Name parsed or NULL
				1898	*/
				1899
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1900	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1901	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1902	xmlChar *ret = NULL;
				1903	int i = 0;
				1904	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1905
				1906	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1907	(CUR != ':')) return(NULL);
				1908
				1909	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1910	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1911	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1912	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1913	else loc[i] = CUR;
				1914	i++;
				1915
				1916	NEXT;
				1917	}
				1918
				1919	ret = xmlStrndup(loc, i);
				1920
				1921	return(ret);
				1922	}
				1923
				1924	/**
				1925	* htmlParseName:
				1926	* @ctxt: an HTML parser context
				1927	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1928	* parse an HTML name, this routine is case sensitive.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1929	*
				1930	* Returns the Name parsed or NULL
				1931	*/
				1932
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1933	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1934	htmlParseName(htmlParserCtxtPtr ctxt) {
				1935	xmlChar buf[HTML_MAX_NAMELEN];
				1936	int len = 0;
				1937
				1938	GROW;
				1939	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1940	return(NULL);
				1941	}
				1942
				1943	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1944	(CUR == '.') \|\| (CUR == '-') \|\|
				1945	(CUR == '_') \|\| (CUR == ':') \|\|
				1946	(IS_COMBINING(CUR)) \|\|
				1947	(IS_EXTENDER(CUR))) {
				1948	buf[len++] = CUR;
				1949	NEXT;
				1950	if (len >= HTML_MAX_NAMELEN) {
				1951	xmlGenericError(xmlGenericErrorContext,
				1952	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1953	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1954	(CUR == '.') \|\| (CUR == '-') \|\|
				1955	(CUR == '_') \|\| (CUR == ':') \|\|
				1956	(IS_COMBINING(CUR)) \|\|
				1957	(IS_EXTENDER(CUR)))
				1958	NEXT;
				1959	break;
				1960	}
				1961	}
				1962	return(xmlStrndup(buf, len));
				1963	}
				1964
				1965	/**
				1966	* htmlParseHTMLAttribute:
				1967	* @ctxt: an HTML parser context
				1968	* @stop: a char stop value
				1969	*
				1970	* parse an HTML attribute value till the stop (quote), if
				1971	* stop is 0 then it stops at the first space
				1972	*
				1973	* Returns the attribute parsed or NULL
				1974	*/
				1975
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1976	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1977	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1978	xmlChar *buffer = NULL;
				1979	int buffer_size = 0;
				1980	xmlChar *out = NULL;
				1981	xmlChar *name = NULL;
				1982
				1983	xmlChar *cur = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1984	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1985
				1986	/*
				1987	* allocate a translation buffer.
				1988	*/
				1989	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1990	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1991	if (buffer == NULL) {
				1992	perror("htmlParseHTMLAttribute: malloc failed");
				1993	return(NULL);
				1994	}
				1995	out = buffer;
				1996
				1997	/*
				1998	* Ok loop until we reach one of the ending chars
				1999	*/
Daniel Veillard	957fdcf	2001-11-06 22:50:19 +0000	[diff] [blame]	2000	while ((CUR != 0) && (CUR != stop)) {
				2001	if ((stop == 0) && (CUR == '>')) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2002	if ((stop == 0) && (IS_BLANK(CUR))) break;
				2003	if (CUR == '&') {
				2004	if (NXT(1) == '#') {
				2005	unsigned int c;
				2006	int bits;
				2007
				2008	c = htmlParseCharRef(ctxt);
				2009	if (c < 0x80)
				2010	{ *out++ = c; bits= -6; }
				2011	else if (c < 0x800)
				2012	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2013	else if (c < 0x10000)
				2014	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2015	else
				2016	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2017
				2018	for ( ; bits >= 0; bits-= 6) {
				2019	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2020	}
				2021	} else {
				2022	ent = htmlParseEntityRef(ctxt, &name);
				2023	if (name == NULL) {
				2024	*out++ = '&';
				2025	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2026	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2027
				2028	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2029	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2030	}
				2031	} else if (ent == NULL) {
				2032	*out++ = '&';
				2033	cur = name;
				2034	while (*cur != 0) {
				2035	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2036	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2037
				2038	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2039	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2040	}
				2041	out++ = cur++;
				2042	}
				2043	xmlFree(name);
				2044	} else {
				2045	unsigned int c;
				2046	int bits;
				2047
				2048	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2049	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2050
				2051	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2052	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2053	}
				2054	c = (xmlChar)ent->value;
				2055	if (c < 0x80)
				2056	{ *out++ = c; bits= -6; }
				2057	else if (c < 0x800)
				2058	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2059	else if (c < 0x10000)
				2060	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2061	else
				2062	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2063
				2064	for ( ; bits >= 0; bits-= 6) {
				2065	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2066	}
				2067	xmlFree(name);
				2068	}
				2069	}
				2070	} else {
				2071	unsigned int c;
				2072	int bits, l;
				2073
				2074	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2075	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2076
				2077	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2078	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2079	}
				2080	c = CUR_CHAR(l);
				2081	if (c < 0x80)
				2082	{ *out++ = c; bits= -6; }
				2083	else if (c < 0x800)
				2084	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2085	else if (c < 0x10000)
				2086	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2087	else
				2088	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2089
				2090	for ( ; bits >= 0; bits-= 6) {
				2091	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2092	}
				2093	NEXT;
				2094	}
				2095	}
				2096	*out++ = 0;
				2097	return(buffer);
				2098	}
				2099
				2100	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2101	* htmlParseEntityRef:
				2102	* @ctxt: an HTML parser context
				2103	* @str: location to store the entity name
				2104	*
				2105	* parse an HTML ENTITY references
				2106	*
				2107	* [68] EntityRef ::= '&' Name ';'
				2108	*
				2109	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2110	* if non-NULL *str will have to be freed by the caller.
				2111	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2112	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2113	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2114	xmlChar *name;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2115	const htmlEntityDesc * ent = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2116	*str = NULL;
				2117
				2118	if (CUR == '&') {
				2119	NEXT;
				2120	name = htmlParseName(ctxt);
				2121	if (name == NULL) {
				2122	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2123	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2124	ctxt->wellFormed = 0;
				2125	} else {
				2126	GROW;
				2127	if (CUR == ';') {
				2128	*str = name;
				2129
				2130	/*
				2131	* Lookup the entity in the table.
				2132	*/
				2133	ent = htmlEntityLookup(name);
				2134	if (ent != NULL) /* OK that's ugly !!! */
				2135	NEXT;
				2136	} else {
				2137	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2138	ctxt->sax->error(ctxt->userData,
				2139	"htmlParseEntityRef: expecting ';'\n");
				2140	*str = name;
				2141	}
				2142	}
				2143	}
				2144	return(ent);
				2145	}
				2146
				2147	/**
				2148	* htmlParseAttValue:
				2149	* @ctxt: an HTML parser context
				2150	*
				2151	* parse a value for an attribute
				2152	* Note: the parser won't do substitution of entities here, this
				2153	* will be handled later in xmlStringGetNodeList, unless it was
				2154	* asked for ctxt->replaceEntities != 0
				2155	*
				2156	* Returns the AttValue parsed or NULL.
				2157	*/
				2158
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2159	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2160	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2161	xmlChar *ret = NULL;
				2162
				2163	if (CUR == '"') {
				2164	NEXT;
				2165	ret = htmlParseHTMLAttribute(ctxt, '"');
				2166	if (CUR != '"') {
				2167	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2168	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2169	ctxt->wellFormed = 0;
				2170	} else
				2171	NEXT;
				2172	} else if (CUR == '\'') {
				2173	NEXT;
				2174	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2175	if (CUR != '\'') {
				2176	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2177	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2178	ctxt->wellFormed = 0;
				2179	} else
				2180	NEXT;
				2181	} else {
				2182	/*
				2183	* That's an HTMLism, the attribute value may not be quoted
				2184	*/
				2185	ret = htmlParseHTMLAttribute(ctxt, 0);
				2186	if (ret == NULL) {
				2187	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2188	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2189	ctxt->wellFormed = 0;
				2190	}
				2191	}
				2192	return(ret);
				2193	}
				2194
				2195	/**
				2196	* htmlParseSystemLiteral:
				2197	* @ctxt: an HTML parser context
				2198	*
				2199	* parse an HTML Literal
				2200	*
				2201	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2202	*
				2203	* Returns the SystemLiteral parsed or NULL
				2204	*/
				2205
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2206	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2207	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2208	const xmlChar *q;
				2209	xmlChar *ret = NULL;
				2210
				2211	if (CUR == '"') {
				2212	NEXT;
				2213	q = CUR_PTR;
				2214	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2215	NEXT;
				2216	if (!IS_CHAR(CUR)) {
				2217	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2218	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2219	ctxt->wellFormed = 0;
				2220	} else {
				2221	ret = xmlStrndup(q, CUR_PTR - q);
				2222	NEXT;
				2223	}
				2224	} else if (CUR == '\'') {
				2225	NEXT;
				2226	q = CUR_PTR;
				2227	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2228	NEXT;
				2229	if (!IS_CHAR(CUR)) {
				2230	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2231	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2232	ctxt->wellFormed = 0;
				2233	} else {
				2234	ret = xmlStrndup(q, CUR_PTR - q);
				2235	NEXT;
				2236	}
				2237	} else {
				2238	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2239	ctxt->sax->error(ctxt->userData,
				2240	"SystemLiteral \" or ' expected\n");
				2241	ctxt->wellFormed = 0;
				2242	}
				2243
				2244	return(ret);
				2245	}
				2246
				2247	/**
				2248	* htmlParsePubidLiteral:
				2249	* @ctxt: an HTML parser context
				2250	*
				2251	* parse an HTML public literal
				2252	*
				2253	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2254	*
				2255	* Returns the PubidLiteral parsed or NULL.
				2256	*/
				2257
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2258	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2259	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2260	const xmlChar *q;
				2261	xmlChar *ret = NULL;
				2262	/*
				2263	* Name ::= (Letter \| '_') (NameChar)*
				2264	*/
				2265	if (CUR == '"') {
				2266	NEXT;
				2267	q = CUR_PTR;
				2268	while (IS_PUBIDCHAR(CUR)) NEXT;
				2269	if (CUR != '"') {
				2270	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2271	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2272	ctxt->wellFormed = 0;
				2273	} else {
				2274	ret = xmlStrndup(q, CUR_PTR - q);
				2275	NEXT;
				2276	}
				2277	} else if (CUR == '\'') {
				2278	NEXT;
				2279	q = CUR_PTR;
				2280	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2281	NEXT;
				2282	if (!IS_LETTER(CUR)) {
				2283	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2284	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2285	ctxt->wellFormed = 0;
				2286	} else {
				2287	ret = xmlStrndup(q, CUR_PTR - q);
				2288	NEXT;
				2289	}
				2290	} else {
				2291	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2292	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2293	ctxt->wellFormed = 0;
				2294	}
				2295
				2296	return(ret);
				2297	}
				2298
				2299	/**
				2300	* htmlParseScript:
				2301	* @ctxt: an HTML parser context
				2302	*
				2303	* parse the content of an HTML SCRIPT or STYLE element
				2304	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2305	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2306	* http://www.w3.org/TR/html4/types.html#type-script
				2307	* http://www.w3.org/TR/html4/types.html#h-6.15
				2308	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2309	*
				2310	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2311	* element and the value of intrinsic event attributes. User agents must
				2312	* not evaluate script data as HTML markup but instead must pass it on as
				2313	* data to a script engine.
				2314	* NOTES:
				2315	* - The content is passed like CDATA
				2316	* - the attributes for style and scripting "onXXX" are also described
				2317	* as CDATA but SGML allows entities references in attributes so their
				2318	* processing is identical as other attributes
				2319	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2320	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2321	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2322	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2323	int nbchar = 0;
				2324	xmlChar cur;
				2325
				2326	SHRINK;
				2327	cur = CUR;
				2328	while (IS_CHAR(cur)) {
Daniel Veillard	c1f7834	2001-11-10 11:43:05 +0000	[diff] [blame]	2329	if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
				2330	(NXT(3) == '-')) {
				2331	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2332	if (ctxt->sax->cdataBlock!= NULL) {
				2333	/*
				2334	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2335	*/
				2336	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2337	}
				2338	}
				2339	nbchar = 0;
				2340	htmlParseComment(ctxt);
				2341	cur = CUR;
				2342	continue;
				2343	} else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2344	/*
				2345	* One should break here, the specification is clear:
				2346	* Authors should therefore escape "</" within the content.
				2347	* Escape mechanisms are specific to each scripting or
				2348	* style sheet language.
				2349	*/
				2350	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2351	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2352	break; /* while */
				2353	}
				2354	buf[nbchar++] = cur;
				2355	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2356	if (ctxt->sax->cdataBlock!= NULL) {
				2357	/*
				2358	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2359	*/
				2360	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2361	}
				2362	nbchar = 0;
				2363	}
				2364	NEXT;
				2365	cur = CUR;
				2366	}
				2367	if (!(IS_CHAR(cur))) {
				2368	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2369	ctxt->sax->error(ctxt->userData,
				2370	"Invalid char in CDATA 0x%X\n", cur);
				2371	ctxt->wellFormed = 0;
				2372	NEXT;
				2373	}
				2374
				2375	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2376	if (ctxt->sax->cdataBlock!= NULL) {
				2377	/*
				2378	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2379	*/
				2380	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2381	}
				2382	}
				2383	}
				2384
				2385
				2386	/**
				2387	* htmlParseCharData:
				2388	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2389	*
				2390	* parse a CharData section.
				2391	* if we are within a CDATA section ']]>' marks an end of section.
				2392	*
				2393	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2394	*/
				2395
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2396	static void
				2397	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2398	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2399	int nbchar = 0;
				2400	int cur, l;
				2401
				2402	SHRINK;
				2403	cur = CUR_CHAR(l);
				2404	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2405	((cur != '&') \|\| (ctxt->token == '&')) &&
				2406	(IS_CHAR(cur))) {
				2407	COPY_BUF(l,buf,nbchar,cur);
				2408	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2409	/*
				2410	* Ok the segment is to be consumed as chars.
				2411	*/
				2412	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2413	if (areBlanks(ctxt, buf, nbchar)) {
				2414	if (ctxt->sax->ignorableWhitespace != NULL)
				2415	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2416	buf, nbchar);
				2417	} else {
				2418	htmlCheckParagraph(ctxt);
				2419	if (ctxt->sax->characters != NULL)
				2420	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2421	}
				2422	}
				2423	nbchar = 0;
				2424	}
				2425	NEXTL(l);
				2426	cur = CUR_CHAR(l);
				2427	}
				2428	if (nbchar != 0) {
				2429	/*
				2430	* Ok the segment is to be consumed as chars.
				2431	*/
				2432	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2433	if (areBlanks(ctxt, buf, nbchar)) {
				2434	if (ctxt->sax->ignorableWhitespace != NULL)
				2435	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2436	} else {
				2437	htmlCheckParagraph(ctxt);
				2438	if (ctxt->sax->characters != NULL)
				2439	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2440	}
				2441	}
Daniel Veillard	7cc95c0	2001-10-17 15:45:12 +0000	[diff] [blame]	2442	} else {
				2443	/*
				2444	* Loop detection
				2445	*/
				2446	if (cur == 0)
				2447	ctxt->instate = XML_PARSER_EOF;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2448	}
				2449	}
				2450
				2451	/**
				2452	* htmlParseExternalID:
				2453	* @ctxt: an HTML parser context
				2454	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2455	*
				2456	* Parse an External ID or a Public ID
				2457	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2458	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2459	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2460	*
				2461	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2462	*
				2463	* Returns the function returns SystemLiteral and in the second
				2464	* case publicID receives PubidLiteral, is strict is off
				2465	* it is possible to return NULL and have publicID set.
				2466	*/
				2467
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2468	static xmlChar *
				2469	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2470	xmlChar *URI = NULL;
				2471
				2472	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2473	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2474	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2475	SKIP(6);
				2476	if (!IS_BLANK(CUR)) {
				2477	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2478	ctxt->sax->error(ctxt->userData,
				2479	"Space required after 'SYSTEM'\n");
				2480	ctxt->wellFormed = 0;
				2481	}
				2482	SKIP_BLANKS;
				2483	URI = htmlParseSystemLiteral(ctxt);
				2484	if (URI == NULL) {
				2485	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2486	ctxt->sax->error(ctxt->userData,
				2487	"htmlParseExternalID: SYSTEM, no URI\n");
				2488	ctxt->wellFormed = 0;
				2489	}
				2490	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2491	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2492	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2493	SKIP(6);
				2494	if (!IS_BLANK(CUR)) {
				2495	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2496	ctxt->sax->error(ctxt->userData,
				2497	"Space required after 'PUBLIC'\n");
				2498	ctxt->wellFormed = 0;
				2499	}
				2500	SKIP_BLANKS;
				2501	*publicID = htmlParsePubidLiteral(ctxt);
				2502	if (*publicID == NULL) {
				2503	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2504	ctxt->sax->error(ctxt->userData,
				2505	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2506	ctxt->wellFormed = 0;
				2507	}
				2508	SKIP_BLANKS;
				2509	if ((CUR == '"') \|\| (CUR == '\'')) {
				2510	URI = htmlParseSystemLiteral(ctxt);
				2511	}
				2512	}
				2513	return(URI);
				2514	}
				2515
				2516	/**
				2517	* htmlParseComment:
				2518	* @ctxt: an HTML parser context
				2519	*
				2520	* Parse an XML (SGML) comment <!-- .... -->
				2521	*
				2522	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2523	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2524	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2525	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2526	xmlChar *buf = NULL;
				2527	int len;
				2528	int size = HTML_PARSER_BUFFER_SIZE;
				2529	int q, ql;
				2530	int r, rl;
				2531	int cur, l;
				2532	xmlParserInputState state;
				2533
				2534	/*
				2535	* Check that there is a comment right here.
				2536	*/
				2537	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2538	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2539
				2540	state = ctxt->instate;
				2541	ctxt->instate = XML_PARSER_COMMENT;
				2542	SHRINK;
				2543	SKIP(4);
				2544	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2545	if (buf == NULL) {
				2546	xmlGenericError(xmlGenericErrorContext,
				2547	"malloc of %d byte failed\n", size);
				2548	ctxt->instate = state;
				2549	return;
				2550	}
				2551	q = CUR_CHAR(ql);
				2552	NEXTL(ql);
				2553	r = CUR_CHAR(rl);
				2554	NEXTL(rl);
				2555	cur = CUR_CHAR(l);
				2556	len = 0;
				2557	while (IS_CHAR(cur) &&
				2558	((cur != '>') \|\|
				2559	(r != '-') \|\| (q != '-'))) {
				2560	if (len + 5 >= size) {
				2561	size *= 2;
				2562	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2563	if (buf == NULL) {
				2564	xmlGenericError(xmlGenericErrorContext,
				2565	"realloc of %d byte failed\n", size);
				2566	ctxt->instate = state;
				2567	return;
				2568	}
				2569	}
				2570	COPY_BUF(ql,buf,len,q);
				2571	q = r;
				2572	ql = rl;
				2573	r = cur;
				2574	rl = l;
				2575	NEXTL(l);
				2576	cur = CUR_CHAR(l);
				2577	if (cur == 0) {
				2578	SHRINK;
				2579	GROW;
				2580	cur = CUR_CHAR(l);
				2581	}
				2582	}
				2583	buf[len] = 0;
				2584	if (!IS_CHAR(cur)) {
				2585	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2586	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2587	ctxt->sax->error(ctxt->userData,
				2588	"Comment not terminated \n<!--%.50s\n", buf);
				2589	ctxt->wellFormed = 0;
				2590	xmlFree(buf);
				2591	} else {
				2592	NEXT;
				2593	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2594	(!ctxt->disableSAX))
				2595	ctxt->sax->comment(ctxt->userData, buf);
				2596	xmlFree(buf);
				2597	}
				2598	ctxt->instate = state;
				2599	}
				2600
				2601	/**
				2602	* htmlParseCharRef:
				2603	* @ctxt: an HTML parser context
				2604	*
				2605	* parse Reference declarations
				2606	*
				2607	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2608	* '&#x' [0-9a-fA-F]+ ';'
				2609	*
				2610	* Returns the value parsed (as an int)
				2611	*/
				2612	int
				2613	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2614	int val = 0;
				2615
				2616	if ((CUR == '&') && (NXT(1) == '#') &&
				2617	(NXT(2) == 'x')) {
				2618	SKIP(3);
				2619	while (CUR != ';') {
				2620	if ((CUR >= '0') && (CUR <= '9'))
				2621	val = val * 16 + (CUR - '0');
				2622	else if ((CUR >= 'a') && (CUR <= 'f'))
				2623	val = val * 16 + (CUR - 'a') + 10;
				2624	else if ((CUR >= 'A') && (CUR <= 'F'))
				2625	val = val * 16 + (CUR - 'A') + 10;
				2626	else {
				2627	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2628	ctxt->sax->error(ctxt->userData,
				2629	"htmlParseCharRef: invalid hexadecimal value\n");
				2630	ctxt->wellFormed = 0;
				2631	return(0);
				2632	}
				2633	NEXT;
				2634	}
				2635	if (CUR == ';')
				2636	NEXT;
				2637	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2638	SKIP(2);
				2639	while (CUR != ';') {
				2640	if ((CUR >= '0') && (CUR <= '9'))
				2641	val = val * 10 + (CUR - '0');
				2642	else {
				2643	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2644	ctxt->sax->error(ctxt->userData,
				2645	"htmlParseCharRef: invalid decimal value\n");
				2646	ctxt->wellFormed = 0;
				2647	return(0);
				2648	}
				2649	NEXT;
				2650	}
				2651	if (CUR == ';')
				2652	NEXT;
				2653	} else {
				2654	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2655	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2656	ctxt->wellFormed = 0;
				2657	}
				2658	/*
				2659	* Check the value IS_CHAR ...
				2660	*/
				2661	if (IS_CHAR(val)) {
				2662	return(val);
				2663	} else {
				2664	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2665	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2666	val);
				2667	ctxt->wellFormed = 0;
				2668	}
				2669	return(0);
				2670	}
				2671
				2672
				2673	/**
				2674	* htmlParseDocTypeDecl :
				2675	* @ctxt: an HTML parser context
				2676	*
				2677	* parse a DOCTYPE declaration
				2678	*
				2679	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2680	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2681	*/
				2682
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2683	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2684	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2685	xmlChar *name;
				2686	xmlChar *ExternalID = NULL;
				2687	xmlChar *URI = NULL;
				2688
				2689	/*
				2690	* We know that '<!DOCTYPE' has been detected.
				2691	*/
				2692	SKIP(9);
				2693
				2694	SKIP_BLANKS;
				2695
				2696	/*
				2697	* Parse the DOCTYPE name.
				2698	*/
				2699	name = htmlParseName(ctxt);
				2700	if (name == NULL) {
				2701	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2702	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2703	ctxt->wellFormed = 0;
				2704	}
				2705	/*
				2706	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2707	*/
				2708
				2709	SKIP_BLANKS;
				2710
				2711	/*
				2712	* Check for SystemID and ExternalID
				2713	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2714	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2715	SKIP_BLANKS;
				2716
				2717	/*
				2718	* We should be at the end of the DOCTYPE declaration.
				2719	*/
				2720	if (CUR != '>') {
				2721	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard	f6ed8bc	2001-10-02 09:22:47 +0000	[diff] [blame]	2722	ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2723	ctxt->wellFormed = 0;
				2724	/* We shouldn't try to resynchronize ... */
				2725	}
				2726	NEXT;
				2727
				2728	/*
				2729	* Create or update the document accordingly to the DOCTYPE
				2730	*/
				2731	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2732	(!ctxt->disableSAX))
				2733	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2734
				2735	/*
				2736	* Cleanup, since we don't use all those identifiers
				2737	*/
				2738	if (URI != NULL) xmlFree(URI);
				2739	if (ExternalID != NULL) xmlFree(ExternalID);
				2740	if (name != NULL) xmlFree(name);
				2741	}
				2742
				2743	/**
				2744	* htmlParseAttribute:
				2745	* @ctxt: an HTML parser context
				2746	* @value: a xmlChar ** used to store the value of the attribute
				2747	*
				2748	* parse an attribute
				2749	*
				2750	* [41] Attribute ::= Name Eq AttValue
				2751	*
				2752	* [25] Eq ::= S? '=' S?
				2753	*
				2754	* With namespace:
				2755	*
				2756	* [NS 11] Attribute ::= QName Eq AttValue
				2757	*
				2758	* Also the case QName == xmlns:??? is handled independently as a namespace
				2759	* definition.
				2760	*
				2761	* Returns the attribute name, and the value in *value.
				2762	*/
				2763
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2764	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2765	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2766	xmlChar name, val = NULL;
				2767
				2768	*value = NULL;
				2769	name = htmlParseHTMLName(ctxt);
				2770	if (name == NULL) {
				2771	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2772	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2773	ctxt->wellFormed = 0;
				2774	return(NULL);
				2775	}
				2776
				2777	/*
				2778	* read the value
				2779	*/
				2780	SKIP_BLANKS;
				2781	if (CUR == '=') {
				2782	NEXT;
				2783	SKIP_BLANKS;
				2784	val = htmlParseAttValue(ctxt);
				2785	/******
				2786	} else {
				2787	* TODO : some attribute must have values, some may not
				2788	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2789	ctxt->sax->warning(ctxt->userData,
				2790	"No value for attribute %s\n", name); */
				2791	}
				2792
				2793	*value = val;
				2794	return(name);
				2795	}
				2796
				2797	/**
				2798	* htmlCheckEncoding:
				2799	* @ctxt: an HTML parser context
				2800	* @attvalue: the attribute value
				2801	*
				2802	* Checks an http-equiv attribute from a Meta tag to detect
				2803	* the encoding
				2804	* If a new encoding is detected the parser is switched to decode
				2805	* it and pass UTF8
				2806	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2807	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2808	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2809	const xmlChar *encoding;
				2810
				2811	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2812	return;
				2813
				2814	/* do not change encoding */
				2815	if (ctxt->input->encoding != NULL)
				2816	return;
				2817
				2818	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2819	if (encoding != NULL) {
				2820	encoding += 8;
				2821	} else {
				2822	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2823	if (encoding != NULL)
				2824	encoding += 9;
				2825	}
				2826	if (encoding != NULL) {
				2827	xmlCharEncoding enc;
				2828	xmlCharEncodingHandlerPtr handler;
				2829
				2830	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2831
				2832	if (ctxt->input->encoding != NULL)
				2833	xmlFree((xmlChar *) ctxt->input->encoding);
				2834	ctxt->input->encoding = xmlStrdup(encoding);
				2835
				2836	enc = xmlParseCharEncoding((const char *) encoding);
				2837	/*
				2838	* registered set of known encodings
				2839	*/
				2840	if (enc != XML_CHAR_ENCODING_ERROR) {
				2841	xmlSwitchEncoding(ctxt, enc);
				2842	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2843	} else {
				2844	/*
				2845	* fallback for unknown encodings
				2846	*/
				2847	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2848	if (handler != NULL) {
				2849	xmlSwitchToEncoding(ctxt, handler);
				2850	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2851	} else {
				2852	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2853	}
				2854	}
				2855
				2856	if ((ctxt->input->buf != NULL) &&
				2857	(ctxt->input->buf->encoder != NULL) &&
				2858	(ctxt->input->buf->raw != NULL) &&
				2859	(ctxt->input->buf->buffer != NULL)) {
				2860	int nbchars;
				2861	int processed;
				2862
				2863	/*
				2864	* convert as much as possible to the parser reading buffer.
				2865	*/
				2866	processed = ctxt->input->cur - ctxt->input->base;
				2867	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2868	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2869	ctxt->input->buf->buffer,
				2870	ctxt->input->buf->raw);
				2871	if (nbchars < 0) {
				2872	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2873	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2874	ctxt->sax->error(ctxt->userData,
				2875	"htmlCheckEncoding: encoder error\n");
				2876	}
				2877	ctxt->input->base =
				2878	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2879	}
				2880	}
				2881	}
				2882
				2883	/**
				2884	* htmlCheckMeta:
				2885	* @ctxt: an HTML parser context
				2886	* @atts: the attributes values
				2887	*
				2888	* Checks an attributes from a Meta tag
				2889	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2890	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2891	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2892	int i;
				2893	const xmlChar att, value;
				2894	int http = 0;
				2895	const xmlChar *content = NULL;
				2896
				2897	if ((ctxt == NULL) \|\| (atts == NULL))
				2898	return;
				2899
				2900	i = 0;
				2901	att = atts[i++];
				2902	while (att != NULL) {
				2903	value = atts[i++];
				2904	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2905	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2906	http = 1;
				2907	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2908	content = value;
				2909	att = atts[i++];
				2910	}
				2911	if ((http) && (content != NULL))
				2912	htmlCheckEncoding(ctxt, content);
				2913
				2914	}
				2915
				2916	/**
				2917	* htmlParseStartTag:
				2918	* @ctxt: an HTML parser context
				2919	*
				2920	* parse a start of tag either for rule element or
				2921	* EmptyElement. In both case we don't parse the tag closing chars.
				2922	*
				2923	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2924	*
				2925	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2926	*
				2927	* With namespace:
				2928	*
				2929	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2930	*
				2931	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2932	*
				2933	*/
				2934
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2935	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2936	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2937	xmlChar *name;
				2938	xmlChar *attname;
				2939	xmlChar *attvalue;
				2940	const xmlChar **atts = NULL;
				2941	int nbatts = 0;
				2942	int maxatts = 0;
				2943	int meta = 0;
				2944	int i;
				2945
				2946	if (CUR != '<') return;
				2947	NEXT;
				2948
				2949	GROW;
				2950	name = htmlParseHTMLName(ctxt);
				2951	if (name == NULL) {
				2952	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2953	ctxt->sax->error(ctxt->userData,
				2954	"htmlParseStartTag: invalid element name\n");
				2955	ctxt->wellFormed = 0;
				2956	/* Dump the bogus tag like browsers do */
				2957	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2958	NEXT;
				2959	return;
				2960	}
				2961	if (xmlStrEqual(name, BAD_CAST"meta"))
				2962	meta = 1;
				2963
				2964	/*
				2965	* Check for auto-closure of HTML elements.
				2966	*/
				2967	htmlAutoClose(ctxt, name);
				2968
				2969	/*
				2970	* Check for implied HTML elements.
				2971	*/
				2972	htmlCheckImplied(ctxt, name);
				2973
				2974	/*
				2975	* Avoid html at any level > 0, head at any level != 1
				2976	* or any attempt to recurse body
				2977	*/
				2978	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2979	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2980	ctxt->sax->error(ctxt->userData,
				2981	"htmlParseStartTag: misplaced <html> tag\n");
				2982	ctxt->wellFormed = 0;
				2983	xmlFree(name);
				2984	return;
				2985	}
				2986	if ((ctxt->nameNr != 1) &&
				2987	(xmlStrEqual(name, BAD_CAST"head"))) {
				2988	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2989	ctxt->sax->error(ctxt->userData,
				2990	"htmlParseStartTag: misplaced <head> tag\n");
				2991	ctxt->wellFormed = 0;
				2992	xmlFree(name);
				2993	return;
				2994	}
				2995	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2996	int indx;
				2997	for (indx = 0;indx < ctxt->nameNr;indx++) {
				2998	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2999	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3000	ctxt->sax->error(ctxt->userData,
				3001	"htmlParseStartTag: misplaced <body> tag\n");
				3002	ctxt->wellFormed = 0;
				3003	xmlFree(name);
				3004	return;
				3005	}
				3006	}
				3007	}
				3008
				3009	/*
				3010	* Now parse the attributes, it ends up with the ending
				3011	*
				3012	* (S Attribute)* S?
				3013	*/
				3014	SKIP_BLANKS;
				3015	while ((IS_CHAR(CUR)) &&
				3016	(CUR != '>') &&
				3017	((CUR != '/') \|\| (NXT(1) != '>'))) {
				3018	long cons = ctxt->nbChars;
				3019
				3020	GROW;
				3021	attname = htmlParseAttribute(ctxt, &attvalue);
				3022	if (attname != NULL) {
				3023
				3024	/*
				3025	* Well formedness requires at most one declaration of an attribute
				3026	*/
				3027	for (i = 0; i < nbatts;i += 2) {
				3028	if (xmlStrEqual(atts[i], attname)) {
				3029	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3030	ctxt->sax->error(ctxt->userData,
				3031	"Attribute %s redefined\n",
				3032	attname);
				3033	ctxt->wellFormed = 0;
				3034	xmlFree(attname);
				3035	if (attvalue != NULL)
				3036	xmlFree(attvalue);
				3037	goto failed;
				3038	}
				3039	}
				3040
				3041	/*
				3042	* Add the pair to atts
				3043	*/
				3044	if (atts == NULL) {
				3045	maxatts = 10;
				3046	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				3047	if (atts == NULL) {
				3048	xmlGenericError(xmlGenericErrorContext,
				3049	"malloc of %ld byte failed\n",
				3050	maxatts * (long)sizeof(xmlChar *));
				3051	if (name != NULL) xmlFree(name);
				3052	return;
				3053	}
				3054	} else if (nbatts + 4 > maxatts) {
				3055	maxatts *= 2;
				3056	atts = (const xmlChar *) xmlRealloc((void ) atts,
				3057	maxatts * sizeof(xmlChar *));
				3058	if (atts == NULL) {
				3059	xmlGenericError(xmlGenericErrorContext,
				3060	"realloc of %ld byte failed\n",
				3061	maxatts * (long)sizeof(xmlChar *));
				3062	if (name != NULL) xmlFree(name);
				3063	return;
				3064	}
				3065	}
				3066	atts[nbatts++] = attname;
				3067	atts[nbatts++] = attvalue;
				3068	atts[nbatts] = NULL;
				3069	atts[nbatts + 1] = NULL;
				3070	}
				3071	else {
				3072	/* Dump the bogus attribute string up to the next blank or
				3073	* the end of the tag. */
Daniel Veillard	561b7f8	2002-03-20 21:55:57 +0000	[diff] [blame]	3074	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				3075	&& ((CUR != '/') \|\| (NXT(1) != '>')))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3076	NEXT;
				3077	}
				3078
				3079	failed:
				3080	SKIP_BLANKS;
				3081	if (cons == ctxt->nbChars) {
				3082	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3083	ctxt->sax->error(ctxt->userData,
				3084	"htmlParseStartTag: problem parsing attributes\n");
				3085	ctxt->wellFormed = 0;
				3086	break;
				3087	}
				3088	}
				3089
				3090	/*
				3091	* Handle specific association to the META tag
				3092	*/
				3093	if (meta)
				3094	htmlCheckMeta(ctxt, atts);
				3095
				3096	/*
				3097	* SAX: Start of Element !
				3098	*/
				3099	htmlnamePush(ctxt, xmlStrdup(name));
				3100	#ifdef DEBUG
				3101	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3102	#endif
				3103	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3104	ctxt->sax->startElement(ctxt->userData, name, atts);
				3105
				3106	if (atts != NULL) {
				3107	for (i = 0;i < nbatts;i++) {
				3108	if (atts[i] != NULL)
				3109	xmlFree((xmlChar *) atts[i]);
				3110	}
				3111	xmlFree((void *) atts);
				3112	}
				3113	if (name != NULL) xmlFree(name);
				3114	}
				3115
				3116	/**
				3117	* htmlParseEndTag:
				3118	* @ctxt: an HTML parser context
				3119	*
				3120	* parse an end of tag
				3121	*
				3122	* [42] ETag ::= '</' Name S? '>'
				3123	*
				3124	* With namespace
				3125	*
				3126	* [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3127	*
				3128	* Returns 1 if the current level should be closed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3129	*/
				3130
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3131	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3132	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3133	xmlChar *name;
				3134	xmlChar *oldname;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3135	int i, ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3136
				3137	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3138	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3139	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3140	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3141	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3142	}
				3143	SKIP(2);
				3144
				3145	name = htmlParseHTMLName(ctxt);
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3146	if (name == NULL) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3147
				3148	/*
				3149	* We should definitely be at the ending "S? '>'" part
				3150	*/
				3151	SKIP_BLANKS;
				3152	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3153	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3154	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3155	ctxt->wellFormed = 0;
				3156	} else
				3157	NEXT;
				3158
				3159	/*
				3160	* If the name read is not one of the element in the parsing stack
				3161	* then return, it's just an error.
				3162	*/
				3163	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3164	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3165	}
				3166	if (i < 0) {
				3167	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3168	ctxt->sax->error(ctxt->userData,
				3169	"Unexpected end tag : %s\n", name);
				3170	xmlFree(name);
				3171	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3172	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3173	}
				3174
				3175
				3176	/*
				3177	* Check for auto-closure of HTML elements.
				3178	*/
				3179
				3180	htmlAutoCloseOnClose(ctxt, name);
				3181
				3182	/*
				3183	* Well formedness constraints, opening and closing must match.
				3184	* With the exception that the autoclose may have popped stuff out
				3185	* of the stack.
				3186	*/
				3187	if (!xmlStrEqual(name, ctxt->name)) {
				3188	#ifdef DEBUG
				3189	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3190	#endif
				3191	if ((ctxt->name != NULL) &&
				3192	(!xmlStrEqual(ctxt->name, name))) {
				3193	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3194	ctxt->sax->error(ctxt->userData,
				3195	"Opening and ending tag mismatch: %s and %s\n",
				3196	name, ctxt->name);
				3197	ctxt->wellFormed = 0;
				3198	}
				3199	}
				3200
				3201	/*
				3202	* SAX: End of Tag
				3203	*/
				3204	oldname = ctxt->name;
				3205	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3206	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3207	ctxt->sax->endElement(ctxt->userData, name);
				3208	oldname = htmlnamePop(ctxt);
				3209	if (oldname != NULL) {
				3210	#ifdef DEBUG
				3211	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3212	#endif
				3213	xmlFree(oldname);
				3214	#ifdef DEBUG
				3215	} else {
				3216	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3217	#endif
				3218	}
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3219	ret = 1;
				3220	} else {
				3221	ret = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3222	}
				3223
				3224	if (name != NULL)
				3225	xmlFree(name);
				3226
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3227	return(ret);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3228	}
				3229
				3230
				3231	/**
				3232	* htmlParseReference:
				3233	* @ctxt: an HTML parser context
				3234	*
				3235	* parse and handle entity references in content,
				3236	* this will end-up in a call to character() since this is either a
				3237	* CharRef, or a predefined entity.
				3238	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3239	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3240	htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	3241	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3242	xmlChar out[6];
				3243	xmlChar *name;
				3244	if (CUR != '&') return;
				3245
				3246	if (NXT(1) == '#') {
				3247	unsigned int c;
				3248	int bits, i = 0;
				3249
				3250	c = htmlParseCharRef(ctxt);
				3251	if (c == 0)
				3252	return;
				3253
				3254	if (c < 0x80) { out[i++]= c; bits= -6; }
				3255	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3256	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3257	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3258
				3259	for ( ; bits >= 0; bits-= 6) {
				3260	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3261	}
				3262	out[i] = 0;
				3263
				3264	htmlCheckParagraph(ctxt);
				3265	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3266	ctxt->sax->characters(ctxt->userData, out, i);
				3267	} else {
				3268	ent = htmlParseEntityRef(ctxt, &name);
				3269	if (name == NULL) {
				3270	htmlCheckParagraph(ctxt);
				3271	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3272	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3273	return;
				3274	}
				3275	if ((ent == NULL) \|\| (ent->value <= 0)) {
				3276	htmlCheckParagraph(ctxt);
				3277	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3278	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3279	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3280	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3281	}
				3282	} else {
				3283	unsigned int c;
				3284	int bits, i = 0;
				3285
				3286	c = ent->value;
				3287	if (c < 0x80)
				3288	{ out[i++]= c; bits= -6; }
				3289	else if (c < 0x800)
				3290	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3291	else if (c < 0x10000)
				3292	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3293	else
				3294	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3295
				3296	for ( ; bits >= 0; bits-= 6) {
				3297	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3298	}
				3299	out[i] = 0;
				3300
				3301	htmlCheckParagraph(ctxt);
				3302	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3303	ctxt->sax->characters(ctxt->userData, out, i);
				3304	}
				3305	xmlFree(name);
				3306	}
				3307	}
				3308
				3309	/**
				3310	* htmlParseContent:
				3311	* @ctxt: an HTML parser context
				3312	* @name: the node name
				3313	*
				3314	* Parse a content: comment, sub-element, reference or text.
				3315	*
				3316	*/
				3317
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3318	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3319	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3320	xmlChar *currentNode;
				3321	int depth;
				3322
				3323	currentNode = xmlStrdup(ctxt->name);
				3324	depth = ctxt->nameNr;
				3325	while (1) {
				3326	long cons = ctxt->nbChars;
				3327
				3328	GROW;
				3329	/*
				3330	* Our tag or one of it's parent or children is ending.
				3331	*/
				3332	if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3333	if (htmlParseEndTag(ctxt) &&
				3334	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				3335	if (currentNode != NULL)
				3336	xmlFree(currentNode);
				3337	return;
				3338	}
				3339	continue; /* while */
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3340	}
				3341
				3342	/*
				3343	* Has this node been popped out during parsing of
				3344	* the next element
				3345	*/
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3346	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				3347	(!xmlStrEqual(currentNode, ctxt->name)))
				3348	{
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3349	if (currentNode != NULL) xmlFree(currentNode);
				3350	return;
				3351	}
				3352
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3353	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3354	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3355	/*
				3356	* Handle SCRIPT/STYLE separately
				3357	*/
				3358	htmlParseScript(ctxt);
				3359	} else {
				3360	/*
				3361	* Sometimes DOCTYPE arrives in the middle of the document
				3362	*/
				3363	if ((CUR == '<') && (NXT(1) == '!') &&
				3364	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3365	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3366	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3367	(UPP(8) == 'E')) {
				3368	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3369	ctxt->sax->error(ctxt->userData,
				3370	"Misplaced DOCTYPE declaration\n");
				3371	ctxt->wellFormed = 0;
				3372	htmlParseDocTypeDecl(ctxt);
				3373	}
				3374
				3375	/*
				3376	* First case : a comment
				3377	*/
				3378	if ((CUR == '<') && (NXT(1) == '!') &&
				3379	(NXT(2) == '-') && (NXT(3) == '-')) {
				3380	htmlParseComment(ctxt);
				3381	}
				3382
				3383	/*
				3384	* Second case : a sub-element.
				3385	*/
				3386	else if (CUR == '<') {
				3387	htmlParseElement(ctxt);
				3388	}
				3389
				3390	/*
				3391	* Third case : a reference. If if has not been resolved,
				3392	* parsing returns it's Name, create the node
				3393	*/
				3394	else if (CUR == '&') {
				3395	htmlParseReference(ctxt);
				3396	}
				3397
				3398	/*
				3399	* Fourth : end of the resource
				3400	*/
				3401	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3402	htmlAutoCloseOnEnd(ctxt);
				3403	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3404	}
				3405
				3406	/*
				3407	* Last case, text. Note that References are handled directly.
				3408	*/
				3409	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3410	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3411	}
				3412
				3413	if (cons == ctxt->nbChars) {
				3414	if (ctxt->node != NULL) {
				3415	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3416	ctxt->sax->error(ctxt->userData,
				3417	"detected an error in element content\n");
				3418	ctxt->wellFormed = 0;
				3419	}
				3420	break;
				3421	}
				3422	}
				3423	GROW;
				3424	}
				3425	if (currentNode != NULL) xmlFree(currentNode);
				3426	}
				3427
				3428	/**
				3429	* htmlParseElement:
				3430	* @ctxt: an HTML parser context
				3431	*
				3432	* parse an HTML element, this is highly recursive
				3433	*
				3434	* [39] element ::= EmptyElemTag \| STag content ETag
				3435	*
				3436	* [41] Attribute ::= Name Eq AttValue
				3437	*/
				3438
				3439	void
				3440	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3441	xmlChar *name;
				3442	xmlChar *currentNode = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	3443	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3444	htmlParserNodeInfo node_info;
				3445	xmlChar *oldname;
				3446	int depth = ctxt->nameNr;
Daniel Veillard	3fbe8e3	2001-10-06 13:30:33 +0000	[diff] [blame]	3447	const xmlChar *oldptr;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3448
				3449	/* Capture start position */
				3450	if (ctxt->record_info) {
				3451	node_info.begin_pos = ctxt->input->consumed +
				3452	(CUR_PTR - ctxt->input->base);
				3453	node_info.begin_line = ctxt->input->line;
				3454	}
				3455
				3456	oldname = xmlStrdup(ctxt->name);
				3457	htmlParseStartTag(ctxt);
				3458	name = ctxt->name;
				3459	#ifdef DEBUG
				3460	if (oldname == NULL)
				3461	xmlGenericError(xmlGenericErrorContext,
				3462	"Start of element %s\n", name);
				3463	else if (name == NULL)
				3464	xmlGenericError(xmlGenericErrorContext,
				3465	"Start of element failed, was %s\n", oldname);
				3466	else
				3467	xmlGenericError(xmlGenericErrorContext,
				3468	"Start of element %s, was %s\n", name, oldname);
				3469	#endif
				3470	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3471	(name == NULL)) {
				3472	if (CUR == '>')
				3473	NEXT;
				3474	if (oldname != NULL)
				3475	xmlFree(oldname);
				3476	return;
				3477	}
				3478	if (oldname != NULL)
				3479	xmlFree(oldname);
				3480
				3481	/*
				3482	* Lookup the info for that element.
				3483	*/
				3484	info = htmlTagLookup(name);
				3485	if (info == NULL) {
				3486	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3487	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3488	name);
				3489	ctxt->wellFormed = 0;
				3490	} else if (info->depr) {
				3491	/***************************
				3492	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3493	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3494	name);
				3495	***************************/
				3496	}
				3497
				3498	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	3499	* Check for an Empty Element labeled the XML/SGML way
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3500	*/
				3501	if ((CUR == '/') && (NXT(1) == '>')) {
				3502	SKIP(2);
				3503	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3504	ctxt->sax->endElement(ctxt->userData, name);
				3505	oldname = htmlnamePop(ctxt);
				3506	#ifdef DEBUG
				3507	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3508	#endif
				3509	if (oldname != NULL)
				3510	xmlFree(oldname);
				3511	return;
				3512	}
				3513
				3514	if (CUR == '>') {
				3515	NEXT;
				3516	} else {
				3517	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3518	ctxt->sax->error(ctxt->userData,
				3519	"Couldn't find end of Start Tag %s\n",
				3520	name);
				3521	ctxt->wellFormed = 0;
				3522
				3523	/*
				3524	* end of parsing of this node.
				3525	*/
				3526	if (xmlStrEqual(name, ctxt->name)) {
				3527	nodePop(ctxt);
				3528	oldname = htmlnamePop(ctxt);
				3529	#ifdef DEBUG
				3530	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3531	#endif
				3532	if (oldname != NULL)
				3533	xmlFree(oldname);
				3534	}
				3535
				3536	/*
				3537	* Capture end position and add node
				3538	*/
				3539	if ( currentNode != NULL && ctxt->record_info ) {
				3540	node_info.end_pos = ctxt->input->consumed +
				3541	(CUR_PTR - ctxt->input->base);
				3542	node_info.end_line = ctxt->input->line;
				3543	node_info.node = ctxt->node;
				3544	xmlParserAddNodeInfo(ctxt, &node_info);
				3545	}
				3546	return;
				3547	}
				3548
				3549	/*
				3550	* Check for an Empty Element from DTD definition
				3551	*/
				3552	if ((info != NULL) && (info->empty)) {
				3553	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3554	ctxt->sax->endElement(ctxt->userData, name);
				3555	oldname = htmlnamePop(ctxt);
				3556	#ifdef DEBUG
				3557	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3558	#endif
				3559	if (oldname != NULL)
				3560	xmlFree(oldname);
				3561	return;
				3562	}
				3563
				3564	/*
				3565	* Parse the content of the element:
				3566	*/
				3567	currentNode = xmlStrdup(ctxt->name);
				3568	depth = ctxt->nameNr;
				3569	while (IS_CHAR(CUR)) {
William M. Brack	d28e48a	2001-09-23 01:55:08 +0000	[diff] [blame]	3570	oldptr = ctxt->input->cur;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3571	htmlParseContent(ctxt);
William M. Brack	d28e48a	2001-09-23 01:55:08 +0000	[diff] [blame]	3572	if (oldptr==ctxt->input->cur) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3573	if (ctxt->nameNr < depth) break;
				3574	}
				3575
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3576	/*
				3577	* Capture end position and add node
				3578	*/
				3579	if ( currentNode != NULL && ctxt->record_info ) {
				3580	node_info.end_pos = ctxt->input->consumed +
				3581	(CUR_PTR - ctxt->input->base);
				3582	node_info.end_line = ctxt->input->line;
				3583	node_info.node = ctxt->node;
				3584	xmlParserAddNodeInfo(ctxt, &node_info);
				3585	}
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3586	if (!IS_CHAR(CUR)) {
				3587	htmlAutoCloseOnEnd(ctxt);
				3588	}
				3589
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3590	if (currentNode != NULL)
				3591	xmlFree(currentNode);
				3592	}
				3593
				3594	/**
				3595	* htmlParseDocument :
				3596	* @ctxt: an HTML parser context
				3597	*
				3598	* parse an HTML document (and build a tree if using the standard SAX
				3599	* interface).
				3600	*
				3601	* Returns 0, -1 in case of error. the parser context is augmented
				3602	* as a result of the parsing.
				3603	*/
				3604
Daniel Veillard	1b31e4a	2002-05-27 14:44:50 +0000	[diff] [blame]	3605	int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3606	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3607	xmlDtdPtr dtd;
				3608
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	3609	xmlInitParser();
				3610
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3611	htmlDefaultSAXHandlerInit();
				3612	ctxt->html = 1;
				3613
				3614	GROW;
				3615	/*
				3616	* SAX: beginning of the document processing.
				3617	*/
				3618	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3619	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3620
				3621	/*
				3622	* Wipe out everything which is before the first '<'
				3623	*/
				3624	SKIP_BLANKS;
				3625	if (CUR == 0) {
				3626	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3627	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3628	ctxt->wellFormed = 0;
				3629	}
				3630
				3631	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3632	ctxt->sax->startDocument(ctxt->userData);
				3633
				3634
				3635	/*
				3636	* Parse possible comments before any content
				3637	*/
				3638	while ((CUR == '<') && (NXT(1) == '!') &&
				3639	(NXT(2) == '-') && (NXT(3) == '-')) {
				3640	htmlParseComment(ctxt);
				3641	SKIP_BLANKS;
				3642	}
				3643
				3644
				3645	/*
				3646	* Then possibly doc type declaration(s) and more Misc
				3647	* (doctypedecl Misc*)?
				3648	*/
				3649	if ((CUR == '<') && (NXT(1) == '!') &&
				3650	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3651	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3652	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3653	(UPP(8) == 'E')) {
				3654	htmlParseDocTypeDecl(ctxt);
				3655	}
				3656	SKIP_BLANKS;
				3657
				3658	/*
				3659	* Parse possible comments before any content
				3660	*/
				3661	while ((CUR == '<') && (NXT(1) == '!') &&
				3662	(NXT(2) == '-') && (NXT(3) == '-')) {
				3663	htmlParseComment(ctxt);
				3664	SKIP_BLANKS;
				3665	}
				3666
				3667	/*
				3668	* Time to start parsing the tree itself
				3669	*/
				3670	htmlParseContent(ctxt);
				3671
				3672	/*
				3673	* autoclose
				3674	*/
				3675	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3676	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3677
				3678
				3679	/*
				3680	* SAX: end of the document processing.
				3681	*/
				3682	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3683	ctxt->sax->endDocument(ctxt->userData);
				3684
				3685	if (ctxt->myDoc != NULL) {
				3686	dtd = xmlGetIntSubset(ctxt->myDoc);
				3687	if (dtd == NULL)
				3688	ctxt->myDoc->intSubset =
				3689	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3690	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3691	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3692	}
				3693	if (! ctxt->wellFormed) return(-1);
				3694	return(0);
				3695	}
				3696
				3697
				3698	/************************************************************************
				3699	* *
				3700	* Parser contexts handling *
				3701	* *
				3702	************************************************************************/
				3703
				3704	/**
				3705	* xmlInitParserCtxt:
				3706	* @ctxt: an HTML parser context
				3707	*
				3708	* Initialize a parser context
				3709	*/
				3710
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3711	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3712	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3713	{
				3714	htmlSAXHandler *sax;
				3715
				3716	if (ctxt == NULL) return;
				3717	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3718
				3719	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3720	if (sax == NULL) {
				3721	xmlGenericError(xmlGenericErrorContext,
				3722	"htmlInitParserCtxt: out of memory\n");
				3723	}
				3724	else
				3725	memset(sax, 0, sizeof(htmlSAXHandler));
				3726
				3727	/* Allocate the Input stack */
				3728	ctxt->inputTab = (htmlParserInputPtr *)
				3729	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3730	if (ctxt->inputTab == NULL) {
				3731	xmlGenericError(xmlGenericErrorContext,
				3732	"htmlInitParserCtxt: out of memory\n");
				3733	ctxt->inputNr = 0;
				3734	ctxt->inputMax = 0;
				3735	ctxt->input = NULL;
				3736	return;
				3737	}
				3738	ctxt->inputNr = 0;
				3739	ctxt->inputMax = 5;
				3740	ctxt->input = NULL;
				3741	ctxt->version = NULL;
				3742	ctxt->encoding = NULL;
				3743	ctxt->standalone = -1;
				3744	ctxt->instate = XML_PARSER_START;
				3745
				3746	/* Allocate the Node stack */
				3747	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3748	if (ctxt->nodeTab == NULL) {
				3749	xmlGenericError(xmlGenericErrorContext,
				3750	"htmlInitParserCtxt: out of memory\n");
				3751	ctxt->nodeNr = 0;
				3752	ctxt->nodeMax = 0;
				3753	ctxt->node = NULL;
				3754	ctxt->inputNr = 0;
				3755	ctxt->inputMax = 0;
				3756	ctxt->input = NULL;
				3757	return;
				3758	}
				3759	ctxt->nodeNr = 0;
				3760	ctxt->nodeMax = 10;
				3761	ctxt->node = NULL;
				3762
				3763	/* Allocate the Name stack */
				3764	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3765	if (ctxt->nameTab == NULL) {
				3766	xmlGenericError(xmlGenericErrorContext,
				3767	"htmlInitParserCtxt: out of memory\n");
				3768	ctxt->nameNr = 0;
				3769	ctxt->nameMax = 10;
				3770	ctxt->name = NULL;
				3771	ctxt->nodeNr = 0;
				3772	ctxt->nodeMax = 0;
				3773	ctxt->node = NULL;
				3774	ctxt->inputNr = 0;
				3775	ctxt->inputMax = 0;
				3776	ctxt->input = NULL;
				3777	return;
				3778	}
				3779	ctxt->nameNr = 0;
				3780	ctxt->nameMax = 10;
				3781	ctxt->name = NULL;
				3782
				3783	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3784	else {
				3785	ctxt->sax = sax;
				3786	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3787	}
				3788	ctxt->userData = ctxt;
				3789	ctxt->myDoc = NULL;
				3790	ctxt->wellFormed = 1;
				3791	ctxt->replaceEntities = 0;
Daniel Veillard	635ef72	2001-10-29 11:48:19 +0000	[diff] [blame]	3792	ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3793	ctxt->html = 1;
				3794	ctxt->record_info = 0;
				3795	ctxt->validate = 0;
				3796	ctxt->nbChars = 0;
				3797	ctxt->checkIndex = 0;
Daniel Veillard	dc2cee2	2001-08-22 16:30:37 +0000	[diff] [blame]	3798	ctxt->catalogs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3799	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3800	}
				3801
				3802	/**
				3803	* htmlFreeParserCtxt:
				3804	* @ctxt: an HTML parser context
				3805	*
				3806	* Free all the memory used by a parser context. However the parsed
				3807	* document in ctxt->myDoc is not freed.
				3808	*/
				3809
				3810	void
				3811	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3812	{
				3813	xmlFreeParserCtxt(ctxt);
				3814	}
				3815
				3816	/**
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame^]	3817	* htmlNewParserCtxt:
				3818	*
				3819	* Allocate and initialize a new parser context.
				3820	*
				3821	* Returns the xmlParserCtxtPtr or NULL
				3822	*/
				3823
				3824	static htmlParserCtxtPtr
				3825	htmlNewParserCtxt(void)
				3826	{
				3827	xmlParserCtxtPtr ctxt;
				3828
				3829	ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
				3830	if (ctxt == NULL) {
				3831	xmlGenericError(xmlGenericErrorContext,
				3832	"xmlNewParserCtxt : cannot allocate context\n");
				3833	perror("malloc");
				3834	return(NULL);
				3835	}
				3836	memset(ctxt, 0, sizeof(xmlParserCtxt));
				3837	htmlInitParserCtxt(ctxt);
				3838	return(ctxt);
				3839	}
				3840
				3841	/**
				3842	* htmlCreateMemoryParserCtxt:
				3843	* @buffer: a pointer to a char array
				3844	* @size: the size of the array
				3845	*
				3846	* Create a parser context for an HTML in-memory document.
				3847	*
				3848	* Returns the new parser context or NULL
				3849	*/
				3850	static htmlParserCtxtPtr
				3851	htmlCreateMemoryParserCtxt(const char *buffer, int size) {
				3852	xmlParserCtxtPtr ctxt;
				3853	xmlParserInputPtr input;
				3854	xmlParserInputBufferPtr buf;
				3855
				3856	if (buffer == NULL)
				3857	return(NULL);
				3858	if (size <= 0)
				3859	return(NULL);
				3860
				3861	ctxt = htmlNewParserCtxt();
				3862	if (ctxt == NULL)
				3863	return(NULL);
				3864
				3865	buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
				3866	if (buf == NULL) return(NULL);
				3867
				3868	input = xmlNewInputStream(ctxt);
				3869	if (input == NULL) {
				3870	xmlFreeParserCtxt(ctxt);
				3871	return(NULL);
				3872	}
				3873
				3874	input->filename = NULL;
				3875	input->buf = buf;
				3876	input->base = input->buf->buffer->content;
				3877	input->cur = input->buf->buffer->content;
				3878	input->end = &input->buf->buffer->content[input->buf->buffer->use];
				3879
				3880	inputPush(ctxt, input);
				3881	return(ctxt);
				3882	}
				3883
				3884	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3885	* htmlCreateDocParserCtxt :
				3886	* @cur: a pointer to an array of xmlChar
				3887	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3888	*
				3889	* Create a parser context for an HTML document.
				3890	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3891	* TODO: check the need to add encoding handling there
				3892	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3893	* Returns the new parser context or NULL
				3894	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3895	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3896	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame^]	3897	int len;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3898
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame^]	3899	if (cur == NULL)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3900	return(NULL);
Daniel Veillard	1d99527	2002-07-22 16:43:32 +0000	[diff] [blame^]	3901	len = xmlStrlen(cur);
				3902	return(htmlCreateMemoryParserCtxt((char *)cur, len));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3903	}
				3904
				3905	/************************************************************************
				3906	* *
				3907	* Progressive parsing interfaces *
				3908	* *
				3909	************************************************************************/
				3910
				3911	/**
				3912	* htmlParseLookupSequence:
				3913	* @ctxt: an HTML parser context
				3914	* @first: the first char to lookup
				3915	* @next: the next char to lookup or zero
				3916	* @third: the next char to lookup or zero
				3917	*
				3918	* Try to find if a sequence (first, next, third) or just (first next) or
				3919	* (first) is available in the input stream.
				3920	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3921	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3922	* parser, do not use liberally.
				3923	* This is basically similar to xmlParseLookupSequence()
				3924	*
				3925	* Returns the index to the current parsing point if the full sequence
				3926	* is available, -1 otherwise.
				3927	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3928	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3929	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3930	xmlChar next, xmlChar third) {
				3931	int base, len;
				3932	htmlParserInputPtr in;
				3933	const xmlChar *buf;
Daniel Veillard	c1f7834	2001-11-10 11:43:05 +0000	[diff] [blame]	3934	int incomment = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3935
				3936	in = ctxt->input;
				3937	if (in == NULL) return(-1);
				3938	base = in->cur - in->base;
				3939	if (base < 0) return(-1);
				3940	if (ctxt->checkIndex > base)
				3941	base = ctxt->checkIndex;
				3942	if (in->buf == NULL) {
				3943	buf = in->base;
				3944	len = in->length;
				3945	} else {
				3946	buf = in->buf->buffer->content;
				3947	len = in->buf->buffer->use;
				3948	}
				3949	/* take into account the sequence length */
				3950	if (third) len -= 2;
				3951	else if (next) len --;
				3952	for (;base < len;base++) {
Daniel Veillard	c1f7834	2001-11-10 11:43:05 +0000	[diff] [blame]	3953	if (!incomment && (base + 4 < len)) {
				3954	if ((buf[base] == '<') && (buf[base + 1] == '!') &&
				3955	(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
				3956	incomment = 1;
				3957	}
				3958	/* do not increment base, some people use <!--> */
				3959	}
				3960	if (incomment) {
				3961	if (base + 3 < len)
				3962	return(-1);
				3963	if ((buf[base] == '-') && (buf[base + 1] == '-') &&
				3964	(buf[base + 2] == '>')) {
				3965	incomment = 0;
				3966	base += 2;
				3967	}
				3968	continue;
				3969	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3970	if (buf[base] == first) {
				3971	if (third != 0) {
				3972	if ((buf[base + 1] != next) \|\|
				3973	(buf[base + 2] != third)) continue;
				3974	} else if (next != 0) {
				3975	if (buf[base + 1] != next) continue;
				3976	}
				3977	ctxt->checkIndex = 0;
				3978	#ifdef DEBUG_PUSH
				3979	if (next == 0)
				3980	xmlGenericError(xmlGenericErrorContext,
				3981	"HPP: lookup '%c' found at %d\n",
				3982	first, base);
				3983	else if (third == 0)
				3984	xmlGenericError(xmlGenericErrorContext,
				3985	"HPP: lookup '%c%c' found at %d\n",
				3986	first, next, base);
				3987	else
				3988	xmlGenericError(xmlGenericErrorContext,
				3989	"HPP: lookup '%c%c%c' found at %d\n",
				3990	first, next, third, base);
				3991	#endif
				3992	return(base - (in->cur - in->base));
				3993	}
				3994	}
				3995	ctxt->checkIndex = base;
				3996	#ifdef DEBUG_PUSH
				3997	if (next == 0)
				3998	xmlGenericError(xmlGenericErrorContext,
				3999	"HPP: lookup '%c' failed\n", first);
				4000	else if (third == 0)
				4001	xmlGenericError(xmlGenericErrorContext,
				4002	"HPP: lookup '%c%c' failed\n", first, next);
				4003	else
				4004	xmlGenericError(xmlGenericErrorContext,
				4005	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				4006	#endif
				4007	return(-1);
				4008	}
				4009
				4010	/**
				4011	* htmlParseTryOrFinish:
				4012	* @ctxt: an HTML parser context
				4013	* @terminate: last chunk indicator
				4014	*
				4015	* Try to progress on parsing
				4016	*
				4017	* Returns zero if no parsing was possible
				4018	*/
/*
 * htmlParseTryOrFinish drives the push parser: it loops on ctxt->instate and
 * consumes the buffered input one construct at a time. When `terminate` is
 * zero it leaves the loop (goto done) as soon as htmlParseLookupSequence()
 * cannot find the end of the construct it is about to parse.
 * NOTE(review): `ret` is initialised to 0 and never assigned afterwards, so
 * the value returned by this function is always 0.
 */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4019	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4020	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				4021	int ret = 0;
				4022	htmlParserInputPtr in;
				4023	int avail = 0;
				4024	xmlChar cur, next;
				4025
				4026	#ifdef DEBUG_PUSH
				4027	switch (ctxt->instate) {
				4028	case XML_PARSER_EOF:
				4029	xmlGenericError(xmlGenericErrorContext,
				4030	"HPP: try EOF\n"); break;
				4031	case XML_PARSER_START:
				4032	xmlGenericError(xmlGenericErrorContext,
				4033	"HPP: try START\n"); break;
				4034	case XML_PARSER_MISC:
				4035	xmlGenericError(xmlGenericErrorContext,
				4036	"HPP: try MISC\n");break;
				4037	case XML_PARSER_COMMENT:
				4038	xmlGenericError(xmlGenericErrorContext,
				4039	"HPP: try COMMENT\n");break;
				4040	case XML_PARSER_PROLOG:
				4041	xmlGenericError(xmlGenericErrorContext,
				4042	"HPP: try PROLOG\n");break;
				4043	case XML_PARSER_START_TAG:
				4044	xmlGenericError(xmlGenericErrorContext,
				4045	"HPP: try START_TAG\n");break;
				4046	case XML_PARSER_CONTENT:
				4047	xmlGenericError(xmlGenericErrorContext,
				4048	"HPP: try CONTENT\n");break;
				4049	case XML_PARSER_CDATA_SECTION:
				4050	xmlGenericError(xmlGenericErrorContext,
				4051	"HPP: try CDATA_SECTION\n");break;
				4052	case XML_PARSER_END_TAG:
				4053	xmlGenericError(xmlGenericErrorContext,
				4054	"HPP: try END_TAG\n");break;
				4055	case XML_PARSER_ENTITY_DECL:
				4056	xmlGenericError(xmlGenericErrorContext,
				4057	"HPP: try ENTITY_DECL\n");break;
				4058	case XML_PARSER_ENTITY_VALUE:
				4059	xmlGenericError(xmlGenericErrorContext,
				4060	"HPP: try ENTITY_VALUE\n");break;
				4061	case XML_PARSER_ATTRIBUTE_VALUE:
				4062	xmlGenericError(xmlGenericErrorContext,
				4063	"HPP: try ATTRIBUTE_VALUE\n");break;
				4064	case XML_PARSER_DTD:
				4065	xmlGenericError(xmlGenericErrorContext,
				4066	"HPP: try DTD\n");break;
				4067	case XML_PARSER_EPILOG:
				4068	xmlGenericError(xmlGenericErrorContext,
				4069	"HPP: try EPILOG\n");break;
				4070	case XML_PARSER_PI:
				4071	xmlGenericError(xmlGenericErrorContext,
				4072	"HPP: try PI\n");break;
				4073	case XML_PARSER_SYSTEM_LITERAL:
				4074	xmlGenericError(xmlGenericErrorContext,
				4075	"HPP: try SYSTEM_LITERAL\n");break;
				4076	}
				4077	#endif
				4078
				4079	while (1) {
				4080
/* Recompute how many bytes of the current input are still unread. */
				4081	in = ctxt->input;
				4082	if (in == NULL) break;
				4083	if (in->buf == NULL)
				4084	avail = in->length - (in->cur - in->base);
				4085	else
				4086	avail = in->buf->buffer->use - (in->cur - in->base);
				4087	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4088	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4089	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4090	/*
				4091	* SAX: end of the document processing.
				4092	*/
				4093	ctxt->instate = XML_PARSER_EOF;
				4094	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4095	ctxt->sax->endDocument(ctxt->userData);
				4096	}
				4097	}
/* Nothing left to read: leave the loop. */
				4098	if (avail < 1)
				4099	goto done;
				4100	switch (ctxt->instate) {
				4101	case XML_PARSER_EOF:
				4102	/*
				4103	* Document parsing is done !
				4104	*/
				4105	goto done;
				4106	case XML_PARSER_START:
				4107	/*
				4108	* Very first chars read from the document flow.
				4109	*/
				4110	cur = in->cur[0];
				4111	if (IS_BLANK(cur)) {
				4112	SKIP_BLANKS;
				4113	if (in->buf == NULL)
				4114	avail = in->length - (in->cur - in->base);
				4115	else
				4116	avail = in->buf->buffer->use - (in->cur - in->base);
				4117	}
/*
 * NOTE(review): the two SAX callbacks below are issued before the DOCTYPE
 * lookup further down; when that lookup fails the state is still
 * XML_PARSER_START, so they look like they are issued again on the next
 * call - confirm.
 */
				4118	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				4119	ctxt->sax->setDocumentLocator(ctxt->userData,
				4120	&xmlDefaultSAXLocator);
				4121	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				4122	(!ctxt->disableSAX))
				4123	ctxt->sax->startDocument(ctxt->userData);
				4124
				4125	cur = in->cur[0];
				4126	next = in->cur[1];
				4127	if ((cur == '<') && (next == '!') &&
				4128	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4129	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4130	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4131	(UPP(8) == 'E')) {
				4132	if ((!terminate) &&
				4133	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4134	goto done;
				4135	#ifdef DEBUG_PUSH
				4136	xmlGenericError(xmlGenericErrorContext,
				4137	"HPP: Parsing internal subset\n");
				4138	#endif
				4139	htmlParseDocTypeDecl(ctxt);
				4140	ctxt->instate = XML_PARSER_PROLOG;
				4141	#ifdef DEBUG_PUSH
				4142	xmlGenericError(xmlGenericErrorContext,
				4143	"HPP: entering PROLOG\n");
				4144	#endif
				4145	} else {
				4146	ctxt->instate = XML_PARSER_MISC;
				4147	}
/* NOTE(review): this DEBUG_PUSH message is printed on both branches above, including the one that entered PROLOG. */
				4148	#ifdef DEBUG_PUSH
				4149	xmlGenericError(xmlGenericErrorContext,
				4150	"HPP: entering MISC\n");
				4151	#endif
				4152	break;
				4153	case XML_PARSER_MISC:
				4154	SKIP_BLANKS;
				4155	if (in->buf == NULL)
				4156	avail = in->length - (in->cur - in->base);
				4157	else
				4158	avail = in->buf->buffer->use - (in->cur - in->base);
				4159	if (avail < 2)
				4160	goto done;
				4161	cur = in->cur[0];
				4162	next = in->cur[1];
				4163	if ((cur == '<') && (next == '!') &&
				4164	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4165	if ((!terminate) &&
				4166	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4167	goto done;
				4168	#ifdef DEBUG_PUSH
				4169	xmlGenericError(xmlGenericErrorContext,
				4170	"HPP: Parsing Comment\n");
				4171	#endif
				4172	htmlParseComment(ctxt);
				4173	ctxt->instate = XML_PARSER_MISC;
				4174	} else if ((cur == '<') && (next == '!') &&
				4175	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4176	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4177	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4178	(UPP(8) == 'E')) {
				4179	if ((!terminate) &&
				4180	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4181	goto done;
				4182	#ifdef DEBUG_PUSH
				4183	xmlGenericError(xmlGenericErrorContext,
				4184	"HPP: Parsing internal subset\n");
				4185	#endif
				4186	htmlParseDocTypeDecl(ctxt);
				4187	ctxt->instate = XML_PARSER_PROLOG;
				4188	#ifdef DEBUG_PUSH
				4189	xmlGenericError(xmlGenericErrorContext,
				4190	"HPP: entering PROLOG\n");
				4191	#endif
/* "<!" seen but fewer than 9 bytes are buffered, which is too short to hold "<!DOCTYPE": stop here. */
				4192	} else if ((cur == '<') && (next == '!') &&
				4193	(avail < 9)) {
				4194	goto done;
				4195	} else {
				4196	ctxt->instate = XML_PARSER_START_TAG;
				4197	#ifdef DEBUG_PUSH
				4198	xmlGenericError(xmlGenericErrorContext,
				4199	"HPP: entering START_TAG\n");
				4200	#endif
				4201	}
				4202	break;
				4203	case XML_PARSER_PROLOG:
				4204	SKIP_BLANKS;
				4205	if (in->buf == NULL)
				4206	avail = in->length - (in->cur - in->base);
				4207	else
				4208	avail = in->buf->buffer->use - (in->cur - in->base);
				4209	if (avail < 2)
				4210	goto done;
				4211	cur = in->cur[0];
				4212	next = in->cur[1];
				4213	if ((cur == '<') && (next == '!') &&
				4214	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4215	if ((!terminate) &&
				4216	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4217	goto done;
				4218	#ifdef DEBUG_PUSH
				4219	xmlGenericError(xmlGenericErrorContext,
				4220	"HPP: Parsing Comment\n");
				4221	#endif
				4222	htmlParseComment(ctxt);
				4223	ctxt->instate = XML_PARSER_PROLOG;
/* "<!" seen but fewer than 4 bytes are buffered, which is too short to hold "<!--": stop here. */
				4224	} else if ((cur == '<') && (next == '!') &&
				4225	(avail < 4)) {
				4226	goto done;
				4227	} else {
				4228	ctxt->instate = XML_PARSER_START_TAG;
				4229	#ifdef DEBUG_PUSH
				4230	xmlGenericError(xmlGenericErrorContext,
				4231	"HPP: entering START_TAG\n");
				4232	#endif
				4233	}
				4234	break;
				4235	case XML_PARSER_EPILOG:
				4236	if (in->buf == NULL)
				4237	avail = in->length - (in->cur - in->base);
				4238	else
				4239	avail = in->buf->buffer->use - (in->cur - in->base);
				4240	if (avail < 1)
				4241	goto done;
				4242	cur = in->cur[0];
				4243	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4244	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4245	goto done;
				4246	}
				4247	if (avail < 2)
				4248	goto done;
				4249	next = in->cur[1];
				4250	if ((cur == '<') && (next == '!') &&
				4251	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4252	if ((!terminate) &&
				4253	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4254	goto done;
				4255	#ifdef DEBUG_PUSH
				4256	xmlGenericError(xmlGenericErrorContext,
				4257	"HPP: Parsing Comment\n");
				4258	#endif
				4259	htmlParseComment(ctxt);
				4260	ctxt->instate = XML_PARSER_EPILOG;
				4261	} else if ((cur == '<') && (next == '!') &&
				4262	(avail < 4)) {
				4263	goto done;
				4264	} else {
/* Anything other than blanks or a comment in the epilog is flagged as XML_ERR_DOCUMENT_END and ends the document. */
				4265	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4266	ctxt->wellFormed = 0;
				4267	ctxt->instate = XML_PARSER_EOF;
				4268	#ifdef DEBUG_PUSH
				4269	xmlGenericError(xmlGenericErrorContext,
				4270	"HPP: entering EOF\n");
				4271	#endif
				4272	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4273	ctxt->sax->endDocument(ctxt->userData);
				4274	goto done;
				4275	}
				4276	break;
				4277	case XML_PARSER_START_TAG: {
				4278	xmlChar name, oldname;
				4279	int depth = ctxt->nameNr;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	4280	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4281
				4282	if (avail < 2)
				4283	goto done;
				4284	cur = in->cur[0];
				4285	if (cur != '<') {
				4286	ctxt->instate = XML_PARSER_CONTENT;
				4287	#ifdef DEBUG_PUSH
				4288	xmlGenericError(xmlGenericErrorContext,
				4289	"HPP: entering CONTENT\n");
				4290	#endif
				4291	break;
				4292	}
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	4293	if (in->cur[1] == '/') {
				4294	ctxt->instate = XML_PARSER_END_TAG;
				4295	ctxt->checkIndex = 0;
				4296	#ifdef DEBUG_PUSH
				4297	xmlGenericError(xmlGenericErrorContext,
				4298	"HPP: entering END_TAG\n");
				4299	#endif
				4300	break;
				4301	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4302	if ((!terminate) &&
				4303	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4304	goto done;
				4305
/*
 * Keep a copy of the current element name: together with `depth` it is
 * used below to detect that the name stack is unchanged after
 * htmlParseStartTag().
 */
				4306	oldname = xmlStrdup(ctxt->name);
				4307	htmlParseStartTag(ctxt);
				4308	name = ctxt->name;
				4309	#ifdef DEBUG
				4310	if (oldname == NULL)
				4311	xmlGenericError(xmlGenericErrorContext,
				4312	"Start of element %s\n", name);
				4313	else if (name == NULL)
				4314	xmlGenericError(xmlGenericErrorContext,
				4315	"Start of element failed, was %s\n",
				4316	oldname);
				4317	else
				4318	xmlGenericError(xmlGenericErrorContext,
				4319	"Start of element %s, was %s\n",
				4320	name, oldname);
				4321	#endif
				4322	if (((depth == ctxt->nameNr) &&
				4323	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4324	(name == NULL)) {
				4325	if (CUR == '>')
				4326	NEXT;
				4327	if (oldname != NULL)
				4328	xmlFree(oldname);
				4329	break;
				4330	}
				4331	if (oldname != NULL)
				4332	xmlFree(oldname);
				4333
				4334	/*
				4335	* Lookup the info for that element.
				4336	*/
				4337	info = htmlTagLookup(name);
				4338	if (info == NULL) {
				4339	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4340	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4341	name);
				4342	ctxt->wellFormed = 0;
				4343	} else if (info->depr) {
				4344	/***************************
				4345	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4346	ctxt->sax->warning(ctxt->userData,
				4347	"Tag %s is deprecated\n",
				4348	name);
				4349	***************************/
				4350	}
				4351
				4352	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	4353	* Check for an Empty Element labeled the XML/SGML way
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4354	*/
				4355	if ((CUR == '/') && (NXT(1) == '>')) {
				4356	SKIP(2);
				4357	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4358	ctxt->sax->endElement(ctxt->userData, name);
				4359	oldname = htmlnamePop(ctxt);
				4360	#ifdef DEBUG
				4361	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4362	oldname);
				4363	#endif
				4364	if (oldname != NULL)
				4365	xmlFree(oldname);
				4366	ctxt->instate = XML_PARSER_CONTENT;
				4367	#ifdef DEBUG_PUSH
				4368	xmlGenericError(xmlGenericErrorContext,
				4369	"HPP: entering CONTENT\n");
				4370	#endif
				4371	break;
				4372	}
				4373
				4374	if (CUR == '>') {
				4375	NEXT;
				4376	} else {
				4377	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4378	ctxt->sax->error(ctxt->userData,
				4379	"Couldn't find end of Start Tag %s\n",
				4380	name);
				4381	ctxt->wellFormed = 0;
				4382
				4383	/*
				4384	* end of parsing of this node.
				4385	*/
				4386	if (xmlStrEqual(name, ctxt->name)) {
				4387	nodePop(ctxt);
				4388	oldname = htmlnamePop(ctxt);
				4389	#ifdef DEBUG
				4390	xmlGenericError(xmlGenericErrorContext,
				4391	"End of start tag problem: popping out %s\n", oldname);
				4392	#endif
				4393	if (oldname != NULL)
				4394	xmlFree(oldname);
				4395	}
				4396
				4397	ctxt->instate = XML_PARSER_CONTENT;
				4398	#ifdef DEBUG_PUSH
				4399	xmlGenericError(xmlGenericErrorContext,
				4400	"HPP: entering CONTENT\n");
				4401	#endif
				4402	break;
				4403	}
				4404
				4405	/*
				4406	* Check for an Empty Element from DTD definition
				4407	*/
				4408	if ((info != NULL) && (info->empty)) {
				4409	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4410	ctxt->sax->endElement(ctxt->userData, name);
				4411	oldname = htmlnamePop(ctxt);
				4412	#ifdef DEBUG
				4413	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4414	#endif
				4415	if (oldname != NULL)
				4416	xmlFree(oldname);
				4417	}
				4418	ctxt->instate = XML_PARSER_CONTENT;
				4419	#ifdef DEBUG_PUSH
				4420	xmlGenericError(xmlGenericErrorContext,
				4421	"HPP: entering CONTENT\n");
				4422	#endif
				4423	break;
				4424	}
				4425	case XML_PARSER_CONTENT: {
				4426	long cons;
				4427	/*
				4428	* Handle preparsed entities and charRef
				4429	*/
				4430	if (ctxt->token != 0) {
				4431	xmlChar chr[2] = { 0 , 0 } ;
				4432
				4433	chr[0] = (xmlChar) ctxt->token;
				4434	htmlCheckParagraph(ctxt);
				4435	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4436	ctxt->sax->characters(ctxt->userData, chr, 1);
				4437	ctxt->token = 0;
				4438	ctxt->checkIndex = 0;
				4439	}
/* Last byte of the document: if it is plain text, deliver it directly through SAX and consume it. */
				4440	if ((avail == 1) && (terminate)) {
				4441	cur = in->cur[0];
				4442	if ((cur != '<') && (cur != '&')) {
				4443	if (ctxt->sax != NULL) {
				4444	if (IS_BLANK(cur)) {
				4445	if (ctxt->sax->ignorableWhitespace != NULL)
				4446	ctxt->sax->ignorableWhitespace(
				4447	ctxt->userData, &cur, 1);
				4448	} else {
				4449	htmlCheckParagraph(ctxt);
				4450	if (ctxt->sax->characters != NULL)
				4451	ctxt->sax->characters(
				4452	ctxt->userData, &cur, 1);
				4453	}
				4454	}
				4455	ctxt->token = 0;
				4456	ctxt->checkIndex = 0;
				4457	NEXT;
William M. Brack	1633d18	2001-10-05 15:41:19 +0000	[diff] [blame]	4458	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4459	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4460	}
				4461	if (avail < 2)
				4462	goto done;
				4463	cur = in->cur[0];
				4464	next = in->cur[1];
/*
 * Snapshot ctxt->nbChars: if it is unchanged once the handlers below have
 * run, the code at the end of this case reports an error (when ctxt->node
 * is set) and skips one character with NEXT.
 */
				4465	cons = ctxt->nbChars;
				4466	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4467	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4468	/*
				4469	* Handle SCRIPT/STYLE separately
				4470	*/
				4471	if ((!terminate) &&
				4472	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4473	goto done;
				4474	htmlParseScript(ctxt);
/* cur and next still hold the bytes read before htmlParseScript() ran. */
				4475	if ((cur == '<') && (next == '/')) {
				4476	ctxt->instate = XML_PARSER_END_TAG;
				4477	ctxt->checkIndex = 0;
				4478	#ifdef DEBUG_PUSH
				4479	xmlGenericError(xmlGenericErrorContext,
				4480	"HPP: entering END_TAG\n");
				4481	#endif
				4482	break;
				4483	}
				4484	} else {
				4485	/*
				4486	* Sometimes DOCTYPE arrives in the middle of the document
				4487	*/
				4488	if ((cur == '<') && (next == '!') &&
				4489	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4490	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4491	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4492	(UPP(8) == 'E')) {
				4493	if ((!terminate) &&
				4494	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4495	goto done;
				4496	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4497	ctxt->sax->error(ctxt->userData,
				4498	"Misplaced DOCTYPE declaration\n");
				4499	ctxt->wellFormed = 0;
				4500	htmlParseDocTypeDecl(ctxt);
				4501	} else if ((cur == '<') && (next == '!') &&
				4502	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4503	if ((!terminate) &&
				4504	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4505	goto done;
				4506	#ifdef DEBUG_PUSH
				4507	xmlGenericError(xmlGenericErrorContext,
				4508	"HPP: Parsing Comment\n");
				4509	#endif
				4510	htmlParseComment(ctxt);
				4511	ctxt->instate = XML_PARSER_CONTENT;
				4512	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4513	goto done;
				4514	} else if ((cur == '<') && (next == '/')) {
				4515	ctxt->instate = XML_PARSER_END_TAG;
				4516	ctxt->checkIndex = 0;
				4517	#ifdef DEBUG_PUSH
				4518	xmlGenericError(xmlGenericErrorContext,
				4519	"HPP: entering END_TAG\n");
				4520	#endif
				4521	break;
				4522	} else if (cur == '<') {
				4523	ctxt->instate = XML_PARSER_START_TAG;
				4524	ctxt->checkIndex = 0;
				4525	#ifdef DEBUG_PUSH
				4526	xmlGenericError(xmlGenericErrorContext,
				4527	"HPP: entering START_TAG\n");
				4528	#endif
				4529	break;
				4530	} else if (cur == '&') {
				4531	if ((!terminate) &&
				4532	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4533	goto done;
				4534	#ifdef DEBUG_PUSH
				4535	xmlGenericError(xmlGenericErrorContext,
				4536	"HPP: Parsing Reference\n");
				4537	#endif
				4538	/* TODO: check generation of subtrees if noent !!! */
				4539	htmlParseReference(ctxt);
				4540	} else {
				4541	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4542	/*
				4543	* Goal of the following test is :
				4544	* - minimize calls to the SAX 'character' callback
				4545	* when they are mergeable
				4546	*/
				4547	if ((ctxt->inputNr == 1) &&
				4548	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4549	if ((!terminate) &&
				4550	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4551	goto done;
				4552	}
				4553	ctxt->checkIndex = 0;
				4554	#ifdef DEBUG_PUSH
				4555	xmlGenericError(xmlGenericErrorContext,
				4556	"HPP: Parsing char data\n");
				4557	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4558	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4559	}
				4560	}
				4561	if (cons == ctxt->nbChars) {
				4562	if (ctxt->node != NULL) {
				4563	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4564	ctxt->sax->error(ctxt->userData,
				4565	"detected an error in element content\n");
				4566	ctxt->wellFormed = 0;
				4567	}
				4568	NEXT;
				4569	break;
				4570	}
				4571
				4572	break;
				4573	}
				4574	case XML_PARSER_END_TAG:
				4575	if (avail < 2)
				4576	goto done;
				4577	if ((!terminate) &&
				4578	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4579	goto done;
				4580	htmlParseEndTag(ctxt);
/* An empty name stack after the end tag moves the parser to EPILOG; otherwise it goes back to CONTENT. */
				4581	if (ctxt->nameNr == 0) {
				4582	ctxt->instate = XML_PARSER_EPILOG;
				4583	} else {
				4584	ctxt->instate = XML_PARSER_CONTENT;
				4585	}
				4586	ctxt->checkIndex = 0;
				4587	#ifdef DEBUG_PUSH
				4588	xmlGenericError(xmlGenericErrorContext,
				4589	"HPP: entering CONTENT\n");
				4590	#endif
				4591	break;
/*
 * The remaining states are not expected in this function: each one is
 * reported through xmlGenericError and the state is reset (to CONTENT, or
 * to START_TAG for ATTRIBUTE_VALUE).
 */
				4592	case XML_PARSER_CDATA_SECTION:
				4593	xmlGenericError(xmlGenericErrorContext,
				4594	"HPP: internal error, state == CDATA\n");
				4595	ctxt->instate = XML_PARSER_CONTENT;
				4596	ctxt->checkIndex = 0;
				4597	#ifdef DEBUG_PUSH
				4598	xmlGenericError(xmlGenericErrorContext,
				4599	"HPP: entering CONTENT\n");
				4600	#endif
				4601	break;
				4602	case XML_PARSER_DTD:
				4603	xmlGenericError(xmlGenericErrorContext,
				4604	"HPP: internal error, state == DTD\n");
				4605	ctxt->instate = XML_PARSER_CONTENT;
				4606	ctxt->checkIndex = 0;
				4607	#ifdef DEBUG_PUSH
				4608	xmlGenericError(xmlGenericErrorContext,
				4609	"HPP: entering CONTENT\n");
				4610	#endif
				4611	break;
				4612	case XML_PARSER_COMMENT:
				4613	xmlGenericError(xmlGenericErrorContext,
				4614	"HPP: internal error, state == COMMENT\n");
				4615	ctxt->instate = XML_PARSER_CONTENT;
				4616	ctxt->checkIndex = 0;
				4617	#ifdef DEBUG_PUSH
				4618	xmlGenericError(xmlGenericErrorContext,
				4619	"HPP: entering CONTENT\n");
				4620	#endif
				4621	break;
				4622	case XML_PARSER_PI:
				4623	xmlGenericError(xmlGenericErrorContext,
				4624	"HPP: internal error, state == PI\n");
				4625	ctxt->instate = XML_PARSER_CONTENT;
				4626	ctxt->checkIndex = 0;
				4627	#ifdef DEBUG_PUSH
				4628	xmlGenericError(xmlGenericErrorContext,
				4629	"HPP: entering CONTENT\n");
				4630	#endif
				4631	break;
				4632	case XML_PARSER_ENTITY_DECL:
				4633	xmlGenericError(xmlGenericErrorContext,
				4634	"HPP: internal error, state == ENTITY_DECL\n");
				4635	ctxt->instate = XML_PARSER_CONTENT;
				4636	ctxt->checkIndex = 0;
				4637	#ifdef DEBUG_PUSH
				4638	xmlGenericError(xmlGenericErrorContext,
				4639	"HPP: entering CONTENT\n");
				4640	#endif
				4641	break;
				4642	case XML_PARSER_ENTITY_VALUE:
				4643	xmlGenericError(xmlGenericErrorContext,
				4644	"HPP: internal error, state == ENTITY_VALUE\n");
				4645	ctxt->instate = XML_PARSER_CONTENT;
				4646	ctxt->checkIndex = 0;
/* NOTE(review): the DEBUG_PUSH message below says "entering DTD" although the state set above is XML_PARSER_CONTENT. */
				4647	#ifdef DEBUG_PUSH
				4648	xmlGenericError(xmlGenericErrorContext,
				4649	"HPP: entering DTD\n");
				4650	#endif
				4651	break;
				4652	case XML_PARSER_ATTRIBUTE_VALUE:
				4653	xmlGenericError(xmlGenericErrorContext,
				4654	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4655	ctxt->instate = XML_PARSER_START_TAG;
				4656	ctxt->checkIndex = 0;
				4657	#ifdef DEBUG_PUSH
				4658	xmlGenericError(xmlGenericErrorContext,
				4659	"HPP: entering START_TAG\n");
				4660	#endif
				4661	break;
				4662	case XML_PARSER_SYSTEM_LITERAL:
				4663	xmlGenericError(xmlGenericErrorContext,
				4664	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4665	ctxt->instate = XML_PARSER_CONTENT;
				4666	ctxt->checkIndex = 0;
				4667	#ifdef DEBUG_PUSH
				4668	xmlGenericError(xmlGenericErrorContext,
				4669	"HPP: entering CONTENT\n");
				4670	#endif
				4671	break;
				4672	case XML_PARSER_IGNORE:
				4673	xmlGenericError(xmlGenericErrorContext,
				4674	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4675	ctxt->instate = XML_PARSER_CONTENT;
				4676	ctxt->checkIndex = 0;
				4677	#ifdef DEBUG_PUSH
				4678	xmlGenericError(xmlGenericErrorContext,
				4679	"HPP: entering CONTENT\n");
				4680	#endif
				4681	break;
Daniel Veillard	044fc6b	2002-03-04 17:09:44 +0000	[diff] [blame]	4682	case XML_PARSER_PUBLIC_LITERAL:
				4683	xmlGenericError(xmlGenericErrorContext,
				4684	"HPP: internal error, state == XML_PARSER_LITERAL\n");
				4685	ctxt->instate = XML_PARSER_CONTENT;
				4686	ctxt->checkIndex = 0;
				4687	#ifdef DEBUG_PUSH
				4688	xmlGenericError(xmlGenericErrorContext,
				4689	"HPP: entering CONTENT\n");
				4690	#endif
				4691	break;
				4692
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4693	}
				4694	}
				4695	done:
/* Same end-of-input handling as at the top of the loop, repeated for paths that reach `done` directly. */
				4696	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4697	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4698	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4699	/*
				4700	* SAX: end of the document processing.
				4701	*/
				4702	ctxt->instate = XML_PARSER_EOF;
				4703	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4704	ctxt->sax->endDocument(ctxt->userData);
				4705	}
				4706	}
/*
 * When a document exists and parsing is terminating (or the state is EOF
 * or EPILOG), attach the HTML 4.0 Transitional internal subset if the
 * document has none.
 */
				4707	if ((ctxt->myDoc != NULL) &&
				4708	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4709	(ctxt->instate == XML_PARSER_EPILOG))) {
				4710	xmlDtdPtr dtd;
				4711	dtd = xmlGetIntSubset(ctxt->myDoc);
				4712	if (dtd == NULL)
				4713	ctxt->myDoc->intSubset =
				4714	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4715	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4716	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4717	}
				4718	#ifdef DEBUG_PUSH
				4719	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4720	#endif
				4721	return(ret);
				4722	}
				4723
				4724	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4725	* htmlParseChunk:
				4726	* @ctxt: an XML parser context
				4727	* @chunk: an char array
				4728	* @size: the size in byte of the chunk
				4729	* @terminate: last chunk indicator
				4730	*
				4731	* Parse a Chunk of memory
				4732	*
				4733	* Returns zero if no error, the xmlParserErrors otherwise.
				4734	*/
				4735	int
				4736	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4737	int terminate) {
				4738	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4739	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4740	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4741	int cur = ctxt->input->cur - ctxt->input->base;
				4742
				4743	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4744	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4745	ctxt->input->cur = ctxt->input->base + cur;
				4746	#ifdef DEBUG_PUSH
				4747	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4748	#endif
				4749
				4750	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4751	htmlParseTryOrFinish(ctxt, terminate);
				4752	} else if (ctxt->instate != XML_PARSER_EOF) {
				4753	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4754	htmlParseTryOrFinish(ctxt, terminate);
				4755	}
				4756	if (terminate) {
				4757	if ((ctxt->instate != XML_PARSER_EOF) &&
				4758	(ctxt->instate != XML_PARSER_EPILOG) &&
				4759	(ctxt->instate != XML_PARSER_MISC)) {
				4760	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4761	ctxt->wellFormed = 0;
				4762	}
				4763	if (ctxt->instate != XML_PARSER_EOF) {
				4764	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4765	ctxt->sax->endDocument(ctxt->userData);
				4766	}
				4767	ctxt->instate = XML_PARSER_EOF;
				4768	}
				4769	return((xmlParserErrors) ctxt->errNo);
				4770	}
				4771
				4772	/************************************************************************
				4773	* *
				4774	* User entry points *
				4775	* *
				4776	************************************************************************/
				4777
				4778	/**
				4779	* htmlCreatePushParserCtxt :
				4780	* @sax: a SAX handler
				4781	* @user_data: The user data returned on SAX callbacks
				4782	* @chunk: a pointer to an array of chars
				4783	* @size: number of chars in the array
				4784	* @filename: an optional file name or URI
				4785	* @enc: an optional encoding
				4786	*
				4787	* Create a parser context for using the HTML parser in push mode
				4788	* To allow content encoding detection, @size should be >= 4
				4789	* The value of @filename is used for fetching external entities
				4790	* and error/warning reports.
				4791	*
				4792	* Returns the new parser context or NULL
				4793	*/
				4794	htmlParserCtxtPtr
				4795	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4796	const char chunk, int size, const char filename,
				4797	xmlCharEncoding enc) {
				4798	htmlParserCtxtPtr ctxt;
				4799	htmlParserInputPtr inputStream;
				4800	xmlParserInputBufferPtr buf;
				4801
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4802	xmlInitParser();
				4803
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4804	buf = xmlAllocParserInputBuffer(enc);
				4805	if (buf == NULL) return(NULL);
				4806
				4807	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4808	if (ctxt == NULL) {
				4809	xmlFree(buf);
				4810	return(NULL);
				4811	}
				4812	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4813	htmlInitParserCtxt(ctxt);
				4814	if (sax != NULL) {
				4815	if (ctxt->sax != &htmlDefaultSAXHandler)
				4816	xmlFree(ctxt->sax);
				4817	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4818	if (ctxt->sax == NULL) {
				4819	xmlFree(buf);
				4820	xmlFree(ctxt);
				4821	return(NULL);
				4822	}
				4823	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4824	if (user_data != NULL)
				4825	ctxt->userData = user_data;
				4826	}
				4827	if (filename == NULL) {
				4828	ctxt->directory = NULL;
				4829	} else {
				4830	ctxt->directory = xmlParserGetDirectory(filename);
				4831	}
				4832
				4833	inputStream = htmlNewInputStream(ctxt);
				4834	if (inputStream == NULL) {
				4835	xmlFreeParserCtxt(ctxt);
				4836	return(NULL);
				4837	}
				4838
				4839	if (filename == NULL)
				4840	inputStream->filename = NULL;
				4841	else
				4842	inputStream->filename = xmlMemStrdup(filename);
				4843	inputStream->buf = buf;
				4844	inputStream->base = inputStream->buf->buffer->content;
				4845	inputStream->cur = inputStream->buf->buffer->content;
				4846
				4847	inputPush(ctxt, inputStream);
				4848
				4849	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4850	(ctxt->input->buf != NULL)) {
				4851	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4852	#ifdef DEBUG_PUSH
				4853	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4854	#endif
				4855	}
				4856
				4857	return(ctxt);
				4858	}
				4859
				4860	/**
				4861	* htmlSAXParseDoc :
				4862	* @cur: a pointer to an array of xmlChar
				4863	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4864	* @sax: the SAX handler block
				4865	* @userData: if using SAX, this pointer will be provided on callbacks.
				4866	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4867	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
				4868	* to handle parse events. If sax is NULL, fallback to the default DOM
				4869	* behavior and return a tree.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4870	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4871	* Returns the resulting document tree unless SAX is NULL or the document is
				4872	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4873	*/
				4874
				4875	htmlDocPtr
				4876	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4877	htmlDocPtr ret;
				4878	htmlParserCtxtPtr ctxt;
				4879
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	4880	xmlInitParser();
				4881
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4882	if (cur == NULL) return(NULL);
				4883
				4884
				4885	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4886	if (ctxt == NULL) return(NULL);
				4887	if (sax != NULL) {
				4888	ctxt->sax = sax;
				4889	ctxt->userData = userData;
				4890	}
				4891
				4892	htmlParseDocument(ctxt);
				4893	ret = ctxt->myDoc;
				4894	if (sax != NULL) {
				4895	ctxt->sax = NULL;
				4896	ctxt->userData = NULL;
				4897	}
				4898	htmlFreeParserCtxt(ctxt);
				4899
				4900	return(ret);
				4901	}
				4902
				4903	/**
				4904	* htmlParseDoc :
				4905	* @cur: a pointer to an array of xmlChar
				4906	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4907	*
				4908	* parse an HTML in-memory document and build a tree.
				4909	*
				4910	* Returns the resulting document tree
				4911	*/
				4912
				4913	htmlDocPtr
				4914	htmlParseDoc(xmlChar cur, const char encoding) {
				4915	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4916	}
				4917
				4918
				4919	/**
				4920	* htmlCreateFileParserCtxt :
				4921	* @filename: the filename
				4922	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4923	*
				4924	* Create a parser context for a file content.
				4925	* Automatic support for ZLIB/Compress compressed document is provided
				4926	* by default if found at compile-time.
				4927	*
				4928	* Returns the new parser context or NULL
				4929	*/
				4930	htmlParserCtxtPtr
				4931	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4932	{
				4933	htmlParserCtxtPtr ctxt;
				4934	htmlParserInputPtr inputStream;
				4935	xmlParserInputBufferPtr buf;
				4936	/* htmlCharEncoding enc; */
				4937	xmlChar content, content_line = (xmlChar *) "charset=";
				4938
				4939	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4940	if (buf == NULL) return(NULL);
				4941
				4942	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4943	if (ctxt == NULL) {
				4944	perror("malloc");
				4945	return(NULL);
				4946	}
				4947	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4948	htmlInitParserCtxt(ctxt);
				4949	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4950	if (inputStream == NULL) {
				4951	perror("malloc");
				4952	xmlFree(ctxt);
				4953	return(NULL);
				4954	}
				4955	memset(inputStream, 0, sizeof(htmlParserInput));
				4956
				4957	inputStream->filename = xmlMemStrdup(filename);
				4958	inputStream->line = 1;
				4959	inputStream->col = 1;
				4960	inputStream->buf = buf;
				4961	inputStream->directory = NULL;
				4962
				4963	inputStream->base = inputStream->buf->buffer->content;
				4964	inputStream->cur = inputStream->buf->buffer->content;
				4965	inputStream->free = NULL;
				4966
				4967	inputPush(ctxt, inputStream);
				4968
				4969	/* set encoding */
				4970	if (encoding) {
				4971	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4972	if (content) {
				4973	strcpy ((char )content, (char )content_line);
				4974	strcat ((char )content, (char )encoding);
				4975	htmlCheckEncoding (ctxt, content);
				4976	xmlFree (content);
				4977	}
				4978	}
				4979
				4980	return(ctxt);
				4981	}
				4982
				4983	/**
				4984	* htmlSAXParseFile :
				4985	* @filename: the filename
				4986	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4987	* @sax: the SAX handler block
				4988	* @userData: if using SAX, this pointer will be provided on callbacks.
				4989	*
				4990	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4991	* compressed document is provided by default if found at compile-time.
				4992	* It use the given SAX function block to handle the parsing callback.
				4993	* If sax is NULL, fallback to the default DOM tree building routines.
				4994	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4995	* Returns the resulting document tree unless SAX is NULL or the document is
				4996	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4997	*/
				4998
				4999	htmlDocPtr
				5000	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				5001	void *userData) {
				5002	htmlDocPtr ret;
				5003	htmlParserCtxtPtr ctxt;
				5004	htmlSAXHandlerPtr oldsax = NULL;
				5005
Daniel Veillard	d046356	2001-10-13 09:15:48 +0000	[diff] [blame]	5006	xmlInitParser();
				5007
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	5008	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				5009	if (ctxt == NULL) return(NULL);
				5010	if (sax != NULL) {
				5011	oldsax = ctxt->sax;
				5012	ctxt->sax = sax;
				5013	ctxt->userData = userData;
				5014	}
				5015
				5016	htmlParseDocument(ctxt);
				5017
				5018	ret = ctxt->myDoc;
				5019	if (sax != NULL) {
				5020	ctxt->sax = oldsax;
				5021	ctxt->userData = NULL;
				5022	}
				5023	htmlFreeParserCtxt(ctxt);
				5024
				5025	return(ret);
				5026	}
				5027
				5028	/**
				5029	* htmlParseFile :
				5030	* @filename: the filename
				5031	* @encoding: a free form C string describing the HTML document encoding, or NULL
				5032	*
				5033	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				5034	* compressed document is provided by default if found at compile-time.
				5035	*
				5036	* Returns the resulting document tree
				5037	*/
				5038
				5039	htmlDocPtr
				5040	htmlParseFile(const char filename, const char encoding) {
				5041	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				5042	}
				5043
				5044	/**
				5045	* htmlHandleOmittedElem:
				5046	* @val: int 0 or 1
				5047	*
				5048	* Set and return the previous value for handling HTML omitted tags.
				5049	*
				5050	* Returns the last value for 0 for no handling, 1 for auto insertion.
				5051	*/
				5052
				5053	int
				5054	htmlHandleOmittedElem(int val) {
				5055	int old = htmlOmittedDefaultValue;
				5056
				5057	htmlOmittedDefaultValue = val;
				5058	return(old);
				5059	}
				5060
				5061	#endif /* LIBXML_HTML_ENABLED */