Blame - HTMLparser.c - fp2-dev/platform/external/libxml2

blob: b096d82e8482ccc8a19da2f6eab7a83d3704ff0d [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
				6	* Daniel.Veillard@w3.org
				7	*/
				8
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	9	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	10	#ifdef LIBXML_HTML_ENABLED
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	11
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	12	#include <string.h>
				13	#ifdef HAVE_CTYPE_H
				14	#include <ctype.h>
				15	#endif
				16	#ifdef HAVE_STDLIB_H
				17	#include <stdlib.h>
				18	#endif
				19	#ifdef HAVE_SYS_STAT_H
				20	#include <sys/stat.h>
				21	#endif
				22	#ifdef HAVE_FCNTL_H
				23	#include <fcntl.h>
				24	#endif
				25	#ifdef HAVE_UNISTD_H
				26	#include <unistd.h>
				27	#endif
				28	#ifdef HAVE_ZLIB_H
				29	#include <zlib.h>
				30	#endif
				31
				32	#include <libxml/xmlmemory.h>
				33	#include <libxml/tree.h>
				34	#include <libxml/parser.h>
				35	#include <libxml/parserInternals.h>
				36	#include <libxml/xmlerror.h>
				37	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	38	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	39	#include <libxml/entities.h>
				40	#include <libxml/encoding.h>
				41	#include <libxml/valid.h>
				42	#include <libxml/xmlIO.h>
				43
				44	#define HTML_MAX_NAMELEN 1000
				45	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				46	#define HTML_PARSER_BUFFER_SIZE 100
				47
				48	/* #define DEBUG */
				49	/* #define DEBUG_PUSH */
				50
				51	int htmlOmittedDefaultValue = 1;
				52
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	53	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				54	xmlChar end, xmlChar end2, xmlChar end3);
				55
				56	/************************************************************************
				57	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	58	* Parser stacks related functions and macros *
				59	* *
				60	************************************************************************/
				61
				62	/*
				63	* Generic function for accessing stacks in the Parser Context
				64	*/
				65
				66	#define PUSH_AND_POP(scope, type, name) \
				67	scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				68	if (ctxt->name##Nr >= ctxt->name##Max) { \
				69	ctxt->name##Max *= 2; \
				70	ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
				71	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				72	if (ctxt->name##Tab == NULL) { \
				73	xmlGenericError(xmlGenericErrorContext, \
				74	"realloc failed !\n"); \
				75	return(0); \
				76	} \
				77	} \
				78	ctxt->name##Tab[ctxt->name##Nr] = value; \
				79	ctxt->name = value; \
				80	return(ctxt->name##Nr++); \
				81	} \
				82	scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				83	type ret; \
				84	if (ctxt->name##Nr < 0) return(0); \
				85	ctxt->name##Nr--; \
				86	if (ctxt->name##Nr < 0) return(0); \
				87	if (ctxt->name##Nr > 0) \
				88	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				89	else \
				90	ctxt->name = NULL; \
				91	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				92	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				93	return(ret); \
				94	} \
				95
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	96	/* PUSH_AND_POP(static, xmlNodePtr, node) */
				97	PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	98
				99	/*
				100	* Macros for accessing the content. Those should be used only by the parser,
				101	* and not exported.
				102	*
				103	* Dirty macros, i.e. one need to make assumption on the context to use them
				104	*
				105	* CUR_PTR return the current pointer to the xmlChar to be parsed.
				106	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
				107	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				108	* in UNICODE mode. This should be used internally by the parser
				109	* only to compare to ASCII values otherwise it would break when
				110	* running with UTF-8 encoding.
				111	* NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
				112	* to compare on ASCII based substring.
				113	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
				114	* it should be used only to compare on ASCII based substring.
				115	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				116	* strings within the parser.
				117	*
				118	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				119	*
				120	* CURRENT Returns the current char value, with the full decoding of
				121	* UTF-8 if we are using this mode. It returns an int.
				122	* NEXT Skip to the next character, this does the proper decoding
				123	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
				124	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				125	*/
				126
				127	#define UPPER (toupper(*ctxt->input->cur))
				128
				129	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				130
				131	#define NXT(val) ctxt->input->cur[(val)]
				132
				133	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				134
				135	#define CUR_PTR ctxt->input->cur
				136
				137	#define SHRINK xmlParserInputShrink(ctxt->input)
				138
				139	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				140
				141	#define CURRENT ((int) (*ctxt->input->cur))
				142
				143	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				144
				145	/* Imported from XML */
				146
				147	/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
				148	#define CUR ((int) (*ctxt->input->cur))
				149	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				150
				151	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
				152	#define NXT(val) ctxt->input->cur[(val)]
				153	#define CUR_PTR ctxt->input->cur
				154
				155
				156	#define NEXTL(l) do { \
				157	if (*(ctxt->input->cur) == '\n') { \
				158	ctxt->input->line++; ctxt->input->col = 1; \
				159	} else ctxt->input->col++; \
				160	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				161	} while (0)
				162
				163	/************
				164	\
				165	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				166	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				167	************/
				168
				169	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				170	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				171
				172	#define COPY_BUF(l,b,i,v) \
				173	if (l == 1) b[i++] = (xmlChar) v; \
				174	else i += xmlCopyChar(l,&b[i],v)
				175
				176	/**
				177	* htmlCurrentChar:
				178	* @ctxt: the HTML parser context
				179	* @len: pointer to the length of the char read
				180	*
				181	* The current char value, if using UTF-8 this may actaully span multiple
				182	* bytes in the input buffer. Implement the end of line normalization:
				183	* 2.11 End-of-Line Handling
				184	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				185	* char, then the encoding converter is plugged in automatically.
				186	*
				187	* Returns the current char value and its lenght
				188	*/
				189
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	190	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	191	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				192	if (ctxt->instate == XML_PARSER_EOF)
				193	return(0);
				194
				195	if (ctxt->token != 0) {
				196	*len = 0;
				197	return(ctxt->token);
				198	}
				199	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				200	/*
				201	* We are supposed to handle UTF8, check it's valid
				202	* From rfc2044: encoding of the Unicode values on UTF-8:
				203	*
				204	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				205	* 0000 0000-0000 007F 0xxxxxxx
				206	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				207	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				208	*
				209	* Check for the 0x110000 limit too
				210	*/
				211	const unsigned char *cur = ctxt->input->cur;
				212	unsigned char c;
				213	unsigned int val;
				214
				215	c = *cur;
				216	if (c & 0x80) {
				217	if (cur[1] == 0)
				218	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				219	if ((cur[1] & 0xc0) != 0x80)
				220	goto encoding_error;
				221	if ((c & 0xe0) == 0xe0) {
				222
				223	if (cur[2] == 0)
				224	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				225	if ((cur[2] & 0xc0) != 0x80)
				226	goto encoding_error;
				227	if ((c & 0xf0) == 0xf0) {
				228	if (cur[3] == 0)
				229	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				230	if (((c & 0xf8) != 0xf0) \|\|
				231	((cur[3] & 0xc0) != 0x80))
				232	goto encoding_error;
				233	/* 4-byte code */
				234	*len = 4;
				235	val = (cur[0] & 0x7) << 18;
				236	val \|= (cur[1] & 0x3f) << 12;
				237	val \|= (cur[2] & 0x3f) << 6;
				238	val \|= cur[3] & 0x3f;
				239	} else {
				240	/* 3-byte code */
				241	*len = 3;
				242	val = (cur[0] & 0xf) << 12;
				243	val \|= (cur[1] & 0x3f) << 6;
				244	val \|= cur[2] & 0x3f;
				245	}
				246	} else {
				247	/* 2-byte code */
				248	*len = 2;
				249	val = (cur[0] & 0x1f) << 6;
				250	val \|= cur[1] & 0x3f;
				251	}
				252	if (!IS_CHAR(val)) {
				253	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				254	if ((ctxt->sax != NULL) &&
				255	(ctxt->sax->error != NULL))
				256	ctxt->sax->error(ctxt->userData,
				257	"Char 0x%X out of allowed range\n", val);
				258	ctxt->wellFormed = 0;
				259	ctxt->disableSAX = 1;
				260	}
				261	return(val);
				262	} else {
				263	/* 1-byte code */
				264	*len = 1;
				265	return((int) *ctxt->input->cur);
				266	}
				267	}
				268	/*
				269	* Assume it's a fixed lenght encoding (1) with
				270	* a compatibke encoding for the ASCII set, since
				271	* XML constructs only use < 128 chars
				272	*/
				273	*len = 1;
				274	if ((int) *ctxt->input->cur < 0x80)
				275	return((int) *ctxt->input->cur);
				276
				277	/*
				278	* Humm this is bad, do an automatic flow conversion
				279	*/
				280	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				281	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				282	return(xmlCurrentChar(ctxt, len));
				283
				284	encoding_error:
				285	/*
				286	* If we detect an UTF8 error that probably mean that the
				287	* input encoding didn't get properly advertized in the
				288	* declaration header. Report the error and switch the encoding
				289	* to ISO-Latin-1 (if you don't like this policy, just declare the
				290	* encoding !)
				291	*/
				292	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				293	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				294	ctxt->sax->error(ctxt->userData,
				295	"Input is not proper UTF-8, indicate encoding !\n");
				296	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				297	ctxt->input->cur[0], ctxt->input->cur[1],
				298	ctxt->input->cur[2], ctxt->input->cur[3]);
				299	}
				300
				301	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				302	*len = 1;
				303	return((int) *ctxt->input->cur);
				304	}
				305
				306	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	307	* htmlSkipBlankChars:
				308	* @ctxt: the HTML parser context
				309	*
				310	* skip all blanks character found at that point in the input streams.
				311	*
				312	* Returns the number of space chars skipped
				313	*/
				314
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	315	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	316	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				317	int res = 0;
				318
				319	while (IS_BLANK(*(ctxt->input->cur))) {
				320	if ((*ctxt->input->cur == 0) &&
				321	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				322	xmlPopInput(ctxt);
				323	} else {
				324	if (*(ctxt->input->cur) == '\n') {
				325	ctxt->input->line++; ctxt->input->col = 1;
				326	} else ctxt->input->col++;
				327	ctxt->input->cur++;
				328	ctxt->nbChars++;
				329	if (*ctxt->input->cur == 0)
				330	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				331	}
				332	res++;
				333	}
				334	return(res);
				335	}
				336
				337
				338
				339	/************************************************************************
				340	* *
				341	* The list of HTML elements and their properties *
				342	* *
				343	************************************************************************/
				344
				345	/*
				346	* Start Tag: 1 means the start tag can be omitted
				347	* End Tag: 1 means the end tag can be omitted
				348	* 2 means it's forbidden (empty elements)
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	349	* 3 means the tag is stylistic and should be closed easily
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	350	* Depr: this element is deprecated
				351	* DTD: 1 means that this element is valid only in the Loose DTD
				352	* 2 means that this element is valid only in the Frameset DTD
				353	*
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame^]	354	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	355	*/
				356	htmlElemDesc html40ElementTable[] = {
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame^]	357	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
				358	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
				359	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
				360	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
				361	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
				362	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
				363	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
				364	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
				365	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
				366	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
				367	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
				368	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
				369	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
				370	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
				371	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
				372	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
				373	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
				374	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
				375	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
				376	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
				377	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
				378	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
				379	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
				380	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
				381	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
				382	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
				383	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
				384	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
				385	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
				386	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
				387	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
				388	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
				389	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
				390	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
				391	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
				392	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
				393	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
				394	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
				395	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
				396	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
				397	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
				398	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
				399	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
				400	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
				401	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
				402	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
				403	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
				404	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
				405	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
				406	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
				407	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
				408	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
				409	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
				410	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
				411	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
				412	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
				413	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
				414	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
				415	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
				416	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
				417	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
				418	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
				419	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
				420	{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
				421	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
				422	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
				423	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
				424	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
				425	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
				426	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
				427	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
				428	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
				429	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
				430	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
				431	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
				432	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
				433	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
				434	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
				435	{ "table", 0, 0, 0, 0, 0, 0, 0, " " },
				436	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
				437	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
				438	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
				439	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
				440	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
				441	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
				442	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
				443	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
				444	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
				445	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
				446	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
				447	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	448	};
				449
				450	/*
				451	* start tags that imply the end of a current element
				452	* any tag of each line implies the end of the current element if the type of
				453	* that element is in the same line
				454	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	455	const char *htmlEquEnd[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	456	"dt", "dd", "li", "option", NULL,
				457	"h1", "h2", "h3", "h4", "h5", "h6", NULL,
				458	"ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
				459	NULL
				460	};
				461	/*
				462	* according to the HTML DTD, HR should be added to the 2nd line above, as it
				463	* is not allowed within a H1, H2, H3, etc. But we should tolerate that case
				464	* because many documents contain rules in headings...
				465	*/
				466
				467	/*
				468	* start tags that imply the end of current element
				469	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	470	const char *htmlStartClose[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	471	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				472	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				473	"listing", "xmp", "head", NULL,
				474	"head", "p", NULL,
				475	"title", "p", NULL,
				476	"body", "head", "style", "link", "title", "p", NULL,
				477	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				478	"pre", "listing", "xmp", "head", "li", NULL,
				479	"hr", "p", "head", NULL,
				480	"h1", "p", "head", NULL,
				481	"h2", "p", "head", NULL,
				482	"h3", "p", "head", NULL,
				483	"h4", "p", "head", NULL,
				484	"h5", "p", "head", NULL,
				485	"h6", "p", "head", NULL,
				486	"dir", "p", "head", NULL,
				487	"address", "p", "head", "ul", NULL,
				488	"pre", "p", "head", "ul", NULL,
				489	"listing", "p", "head", NULL,
				490	"xmp", "p", "head", NULL,
				491	"blockquote", "p", "head", NULL,
				492	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				493	"xmp", "head", NULL,
				494	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				495	"head", "dd", NULL,
				496	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				497	"head", "dt", NULL,
				498	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				499	"listing", "xmp", NULL,
				500	"ol", "p", "head", "ul", NULL,
				501	"menu", "p", "head", "ul", NULL,
				502	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
				503	"div", "p", "head", NULL,
				504	"noscript", "p", "head", NULL,
				505	"center", "font", "b", "i", "p", "head", NULL,
				506	"a", "a", NULL,
				507	"caption", "p", NULL,
				508	"colgroup", "caption", "colgroup", "col", "p", NULL,
				509	"col", "caption", "col", "p", NULL,
				510	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				511	"listing", "xmp", "a", NULL,
Daniel Veillard	43dadeb	2001-04-24 11:23:35 +0000	[diff] [blame]	512	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				513	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	514	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				515	"thead", "caption", "col", "colgroup", NULL,
				516	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				517	"tbody", "p", NULL,
				518	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				519	"tfoot", "tbody", "p", NULL,
				520	"optgroup", "option", NULL,
				521	"option", "option", NULL,
				522	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				523	"pre", "listing", "xmp", "a", NULL,
				524	NULL
				525	};
				526
				527	/*
				528	* The list of HTML elements which are supposed not to have
				529	* CDATA content and where a p element will be implied
				530	*
				531	* TODO: extend that list by reading the HTML SGML DtD on
				532	* implied paragraph
				533	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	534	static const char *htmlNoContentElements[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	535	"html",
				536	"head",
				537	"body",
				538	NULL
				539	};
				540
				541	/*
				542	* The list of HTML attributes which are of content %Script;
				543	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				544	* it assumes the name starts with 'on'
				545	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	546	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	547	"onclick",
				548	"ondblclick",
				549	"onmousedown",
				550	"onmouseup",
				551	"onmouseover",
				552	"onmousemove",
				553	"onmouseout",
				554	"onkeypress",
				555	"onkeydown",
				556	"onkeyup",
				557	"onload",
				558	"onunload",
				559	"onfocus",
				560	"onblur",
				561	"onsubmit",
				562	"onrest",
				563	"onchange",
				564	"onselect"
				565	};
				566
Daniel Veillard	a2bc368	2001-05-03 08:27:20 +0000	[diff] [blame]	567	/*
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	568	* This table is used by the htmlparser to know what to do with
				569	* broken html pages. By assigning different priorities to different
				570	* elements the parser can decide how to handle extra endtags.
				571	* Endtags are only allowed to close elements with lower or equal
				572	* priority.
				573	*/
Daniel Veillard	a2bc368	2001-05-03 08:27:20 +0000	[diff] [blame]	574
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	575	typedef struct {
				576	const char *name;
				577	int priority;
				578	} elementPriority;
				579
				580	const elementPriority htmlEndPriority[] = {
				581	{"div", 150},
				582	{"td", 160},
				583	{"th", 160},
				584	{"tr", 170},
				585	{"thead", 180},
				586	{"tbody", 180},
				587	{"tfoot", 180},
				588	{"table", 190},
				589	{"head", 200},
				590	{"body", 200},
				591	{"html", 220},
				592	{NULL, 100} /* Default priority */
				593	};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	594
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	595	static const char** htmlStartCloseIndex[100];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	596	static int htmlStartCloseIndexinitialized = 0;
				597
				598	/************************************************************************
				599	* *
				600	* functions to handle HTML specific data *
				601	* *
				602	************************************************************************/
				603
				604	/**
				605	* htmlInitAutoClose:
				606	*
				607	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				608	* This is not reentrant. Call xmlInitParser() once before processing in
				609	* case of use in multithreaded programs.
				610	*/
				611	void
				612	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	613	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	614
				615	if (htmlStartCloseIndexinitialized) return;
				616
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	617	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				618	indx = 0;
				619	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				620	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	621	while (htmlStartClose[i] != NULL) i++;
				622	i++;
				623	}
				624	htmlStartCloseIndexinitialized = 1;
				625	}
				626
				627	/**
				628	* htmlTagLookup:
				629	* @tag: The tag name in lowercase
				630	*
				631	* Lookup the HTML tag in the ElementTable
				632	*
				633	* Returns the related htmlElemDescPtr or NULL if not found.
				634	*/
				635	htmlElemDescPtr
				636	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	637	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	638
				639	for (i = 0; i < (sizeof(html40ElementTable) /
				640	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	1ed3f88	2001-04-18 09:45:35 +0000	[diff] [blame]	641	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	642	return(&html40ElementTable[i]);
				643	}
				644	return(NULL);
				645	}
				646
				647	/**
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	648	* htmlGetEndPriority:
				649	* @name: The name of the element to look up the priority for.
				650	*
				651	* Return value: The "endtag" priority.
				652	**/
				653	static int
				654	htmlGetEndPriority (const xmlChar *name) {
				655	int i = 0;
				656
				657	while ((htmlEndPriority[i].name != NULL) &&
				658	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				659	i++;
				660
				661	return(htmlEndPriority[i].priority);
				662	}
				663
				664	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	665	* htmlCheckAutoClose:
				666	* @newtag: The new tag name
				667	* @oldtag: The old tag name
				668	*
				669	* Checks wether the new tag is one of the registered valid tags for closing old.
				670	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				671	*
				672	* Returns 0 if no, 1 if yes.
				673	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	674	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	675	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	676	int i, indx;
				677	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	678
				679	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				680
				681	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	682	for (indx = 0; indx < 100;indx++) {
				683	closed = htmlStartCloseIndex[indx];
				684	if (closed == NULL) return(0);
				685	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	686	}
				687
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	688	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	689	i++;
				690	while (htmlStartClose[i] != NULL) {
				691	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				692	return(1);
				693	}
				694	i++;
				695	}
				696	return(0);
				697	}
				698
				699	/**
				700	* htmlAutoCloseOnClose:
				701	* @ctxt: an HTML parser context
				702	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	703	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	704	*
				705	* The HTmL DtD allows an ending tag to implicitely close other tags.
				706	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	707	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	708	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				709	htmlElemDescPtr info;
				710	xmlChar *oldname;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	711	int i, priority;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	712
				713	#ifdef DEBUG
				714	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				715	for (i = 0;i < ctxt->nameNr;i++)
				716	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				717	#endif
				718
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	719	priority = htmlGetEndPriority (newtag);
				720
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	721	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	722
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	723	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	724	/*
				725	* A missplaced endtagad can only close elements with lower
				726	* or equal priority, so if we find an element with higher
				727	* priority before we find an element with
				728	* matching name, we just ignore this endtag
				729	*/
				730	if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	731	}
				732	if (i < 0) return;
				733
				734	while (!xmlStrEqual(newtag, ctxt->name)) {
				735	info = htmlTagLookup(ctxt->name);
				736	if ((info == NULL) \|\| (info->endTag == 1)) {
				737	#ifdef DEBUG
				738	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				739	#endif
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	740	} else if (info->endTag == 3) {
				741	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	742	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	743	#endif
				744	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				745	ctxt->sax->error(ctxt->userData,
				746	"Opening and ending tag mismatch: %s and %s\n",
				747	newtag, ctxt->name);
				748	ctxt->wellFormed = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	749	}
				750	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				751	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				752	oldname = htmlnamePop(ctxt);
				753	if (oldname != NULL) {
				754	#ifdef DEBUG
				755	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				756	#endif
				757	xmlFree(oldname);
				758	}
				759	}
				760	}
				761
				762	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	763	* htmlAutoCloseOnEnd:
				764	* @ctxt: an HTML parser context
				765	*
				766	* Close all remaining tags at the end of the stream
				767	*/
				768	static void
				769	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				770	xmlChar *oldname;
				771	int i;
				772
				773	if (ctxt->nameNr == 0)
				774	return;
				775	#ifdef DEBUG
				776	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				777	#endif
				778
				779	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				780	#ifdef DEBUG
				781	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				782	#endif
				783	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				784	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				785	oldname = htmlnamePop(ctxt);
				786	if (oldname != NULL) {
				787	#ifdef DEBUG
				788	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				789	#endif
				790	xmlFree(oldname);
				791	}
				792	}
				793	}
				794
				795	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	796	* htmlAutoClose:
				797	* @ctxt: an HTML parser context
				798	* @newtag: The new tag name or NULL
				799	*
				800	* The HTmL DtD allows a tag to implicitely close other tags.
				801	* The list is kept in htmlStartClose array. This function is
				802	* called when a new tag has been detected and generates the
				803	* appropriates closes if possible/needed.
				804	* If newtag is NULL this mean we are at the end of the resource
				805	* and we should check
				806	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	807	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	808	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				809	xmlChar *oldname;
				810	while ((newtag != NULL) && (ctxt->name != NULL) &&
				811	(htmlCheckAutoClose(newtag, ctxt->name))) {
				812	#ifdef DEBUG
				813	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				814	#endif
				815	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				816	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				817	oldname = htmlnamePop(ctxt);
				818	if (oldname != NULL) {
				819	#ifdef DEBUG
				820	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				821	#endif
				822	xmlFree(oldname);
				823	}
				824	}
				825	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	826	htmlAutoCloseOnEnd(ctxt);
				827	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	828	}
				829	while ((newtag == NULL) && (ctxt->name != NULL) &&
				830	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				831	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				832	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				833	#ifdef DEBUG
				834	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				835	#endif
				836	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				837	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				838	oldname = htmlnamePop(ctxt);
				839	if (oldname != NULL) {
				840	#ifdef DEBUG
				841	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				842	#endif
				843	xmlFree(oldname);
				844	}
				845	}
				846
				847	}
				848
				849	/**
				850	* htmlAutoCloseTag:
				851	* @doc: the HTML document
				852	* @name: The tag name
				853	* @elem: the HTML element
				854	*
				855	* The HTmL DtD allows a tag to implicitely close other tags.
				856	* The list is kept in htmlStartClose array. This function checks
				857	* if the element or one of it's children would autoclose the
				858	* given tag.
				859	*
				860	* Returns 1 if autoclose, 0 otherwise
				861	*/
				862	int
				863	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				864	htmlNodePtr child;
				865
				866	if (elem == NULL) return(1);
				867	if (xmlStrEqual(name, elem->name)) return(0);
				868	if (htmlCheckAutoClose(elem->name, name)) return(1);
				869	child = elem->children;
				870	while (child != NULL) {
				871	if (htmlAutoCloseTag(doc, name, child)) return(1);
				872	child = child->next;
				873	}
				874	return(0);
				875	}
				876
				877	/**
				878	* htmlIsAutoClosed:
				879	* @doc: the HTML document
				880	* @elem: the HTML element
				881	*
				882	* The HTmL DtD allows a tag to implicitely close other tags.
				883	* The list is kept in htmlStartClose array. This function checks
				884	* if a tag is autoclosed by one of it's child
				885	*
				886	* Returns 1 if autoclosed, 0 otherwise
				887	*/
				888	int
				889	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				890	htmlNodePtr child;
				891
				892	if (elem == NULL) return(1);
				893	child = elem->children;
				894	while (child != NULL) {
				895	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				896	child = child->next;
				897	}
				898	return(0);
				899	}
				900
				901	/**
				902	* htmlCheckImplied:
				903	* @ctxt: an HTML parser context
				904	* @newtag: The new tag name
				905	*
				906	* The HTML DtD allows a tag to exists only implicitely
				907	* called when a new tag has been detected and generates the
				908	* appropriates implicit tags if missing
				909	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	910	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	911	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				912	if (!htmlOmittedDefaultValue)
				913	return;
				914	if (xmlStrEqual(newtag, BAD_CAST"html"))
				915	return;
				916	if (ctxt->nameNr <= 0) {
				917	#ifdef DEBUG
				918	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				919	#endif
				920	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				921	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				922	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				923	}
				924	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				925	return;
				926	if ((ctxt->nameNr <= 1) &&
				927	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				928	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				929	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				930	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				931	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				932	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				933	/*
				934	* dropped OBJECT ... i you put it first BODY will be
				935	* assumed !
				936	*/
				937	#ifdef DEBUG
				938	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				939	#endif
				940	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				941	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				942	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				943	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				944	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				945	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				946	int i;
				947	for (i = 0;i < ctxt->nameNr;i++) {
				948	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				949	return;
				950	}
				951	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				952	return;
				953	}
				954	}
				955
				956	#ifdef DEBUG
				957	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				958	#endif
				959	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				960	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				961	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				962	}
				963	}
				964
				965	/**
				966	* htmlCheckParagraph
				967	* @ctxt: an HTML parser context
				968	*
				969	* Check whether a p element need to be implied before inserting
				970	* characters in the current element.
				971	*
				972	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				973	* in case of error.
				974	*/
				975
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	976	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	977	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				978	const xmlChar *tag;
				979	int i;
				980
				981	if (ctxt == NULL)
				982	return(-1);
				983	tag = ctxt->name;
				984	if (tag == NULL) {
				985	htmlAutoClose(ctxt, BAD_CAST"p");
				986	htmlCheckImplied(ctxt, BAD_CAST"p");
				987	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				988	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				989	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				990	return(1);
				991	}
				992	if (!htmlOmittedDefaultValue)
				993	return(0);
				994	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				995	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				996	#ifdef DEBUG
				997	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				998	#endif
				999	htmlAutoClose(ctxt, BAD_CAST"p");
				1000	htmlCheckImplied(ctxt, BAD_CAST"p");
				1001	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				1002	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				1003	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				1004	return(1);
				1005	}
				1006	}
				1007	return(0);
				1008	}
				1009
				1010	/**
				1011	* htmlIsScriptAttribute:
				1012	* @name: an attribute name
				1013	*
				1014	* Check if an attribute is of content type Script
				1015	*
				1016	* Returns 1 is the attribute is a script 0 otherwise
				1017	*/
				1018	int
				1019	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1020	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1021
				1022	if (name == NULL)
				1023	return(0);
				1024	/*
				1025	* all script attributes start with 'on'
				1026	*/
				1027	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1028	return(0);
				1029	for (i = 0;
				1030	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1031	i++) {
				1032	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1033	return(1);
				1034	}
				1035	return(0);
				1036	}
				1037
				1038	/************************************************************************
				1039	* *
				1040	* The list of HTML predefined entities *
				1041	* *
				1042	************************************************************************/
				1043
				1044
				1045	htmlEntityDesc html40EntitiesTable[] = {
				1046	/*
				1047	* the 4 absolute ones, plus apostrophe.
				1048	*/
				1049	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1050	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1051	{ 39, "apos", "single quote" },
				1052	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1053	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1054
				1055	/*
				1056	* A bunch still in the 128-255 range
				1057	* Replacing them depend really on the charset used.
				1058	*/
				1059	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1060	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1061	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1062	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1063	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1064	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1065	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1066	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1067	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1068	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1069	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1070	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1071	{ 172, "not", "not sign, U+00AC ISOnum" },
				1072	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1073	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1074	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1075	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1076	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1077	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1078	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1079	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1080	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1081	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1082	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1083	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1084	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1085	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1086	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1087	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1088	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1089	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1090	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1091	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1092	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1093	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1094	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1095	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1096	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1097	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1098	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1099	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1100	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1101	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1102	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1103	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1104	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1105	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1106	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1107	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1108	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1109	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1110	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1111	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1112	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1113	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1114	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1115	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1116	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1117	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1118	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1119	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1120	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1121	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1122	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1123	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1124	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1125	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1126	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1127	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1128	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1129	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1130	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1131	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1132	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1133	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1134	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1135	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1136	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1137	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1138	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1139	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1140	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1141	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1142	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1143	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1144	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1145	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1146	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1147	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1148	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1149	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1150	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1151	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1152	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1153	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1154	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1155
				1156	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1157	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1158	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1159	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1160	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1161
				1162	/*
				1163	* Anything below should really be kept as entities references
				1164	*/
				1165	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1166
				1167	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1168	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1169
				1170	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1171	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1172	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1173	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1174	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1175	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1176	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1177	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1178	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1179	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1180	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1181	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1182	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1183	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1184	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1185	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1186	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1187	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1188	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1189	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1190	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1191	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1192	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1193	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1194
				1195	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1196	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1197	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1198	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1199	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1200	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1201	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1202	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1203	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1204	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1205	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1206	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1207	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1208	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1209	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1210	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1211	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1212	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1213	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1214	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1215	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1216	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1217	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1218	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1219	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1220	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1221	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1222	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1223
				1224	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1225	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1226	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1227	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1228	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1229	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1230	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1231	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1232	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1233	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1234	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1235	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1236	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1237	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1238	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1239	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1240	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1241
				1242	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1243	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1244
				1245	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1246
				1247	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1248	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1249
				1250	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1251	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1252
				1253	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1254	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1255
				1256	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1257
				1258	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1259	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1260	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1261	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1262	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1263	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1264	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1265	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1266	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1267	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1268	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1269	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1270	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1271	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1272	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1273	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1274
				1275	{ 8704, "forall","for all, U+2200 ISOtech" },
				1276	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1277	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1278	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1279	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1280	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1281	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1282	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1283	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1284	{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
				1285	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1286	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1287	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1288	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1289	{ 8734, "infin","infinity, U+221E ISOtech" },
				1290	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1291	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1292	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1293	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1294	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1295	{ 8747, "int", "integral, U+222B ISOtech" },
				1296	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1297	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1298	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1299	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1300	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1301	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1302	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1303	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1304	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1305	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1306	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1307	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1308	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1309	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1310	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1311	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1312	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1313	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1314	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1315	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1316	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1317	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1318	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1319	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1320
				1321	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1322	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1323	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1324	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1325
				1326	};
				1327
				1328	/************************************************************************
				1329	* *
				1330	* Commodity functions to handle entities *
				1331	* *
				1332	************************************************************************/
				1333
				1334	/*
				1335	* Macro used to grow the current buffer.
				1336	*/
				1337	#define growBuffer(buffer) { \
				1338	buffer##_size *= 2; \
				1339	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1340	if (buffer == NULL) { \
				1341	perror("realloc failed"); \
				1342	return(NULL); \
				1343	} \
				1344	}
				1345
				1346	/**
				1347	* htmlEntityLookup:
				1348	* @name: the entity name
				1349	*
				1350	* Lookup the given entity in EntitiesTable
				1351	*
				1352	* TODO: the linear scan is really ugly, an hash table is really needed.
				1353	*
				1354	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1355	*/
				1356	htmlEntityDescPtr
				1357	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1358	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1359
				1360	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1361	sizeof(html40EntitiesTable[0]));i++) {
				1362	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1363	#ifdef DEBUG
				1364	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1365	#endif
				1366	return(&html40EntitiesTable[i]);
				1367	}
				1368	}
				1369	return(NULL);
				1370	}
				1371
				1372	/**
				1373	* htmlEntityValueLookup:
				1374	* @value: the entity's unicode value
				1375	*
				1376	* Lookup the given entity in EntitiesTable
				1377	*
				1378	* TODO: the linear scan is really ugly, an hash table is really needed.
				1379	*
				1380	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1381	*/
				1382	htmlEntityDescPtr
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1383	htmlEntityValueLookup(unsigned int value) {
				1384	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1385	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	1386	unsigned int lv = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1387	#endif
				1388
				1389	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1390	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1391	if (html40EntitiesTable[i].value >= value) {
				1392	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1393	break;
				1394	#ifdef DEBUG
				1395	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1396	#endif
				1397	return(&html40EntitiesTable[i]);
				1398	}
				1399	#ifdef DEBUG
				1400	if (lv > html40EntitiesTable[i].value) {
				1401	xmlGenericError(xmlGenericErrorContext,
				1402	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1403	lv, html40EntitiesTable[i].value);
				1404	}
				1405	lv = html40EntitiesTable[i].value;
				1406	#endif
				1407	}
				1408	return(NULL);
				1409	}
				1410
				1411	/**
				1412	* UTF8ToHtml:
				1413	* @out: a pointer to an array of bytes to store the result
				1414	* @outlen: the length of @out
				1415	* @in: a pointer to an array of UTF-8 chars
				1416	* @inlen: the length of @in
				1417	*
				1418	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1419	* plus HTML entities block of chars out.
				1420	*
				1421	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1422	* The value of @inlen after return is the number of octets consumed
				1423	* as the return value is positive, else unpredictiable.
				1424	* The value of @outlen after return is the number of octets consumed.
				1425	*/
				1426	int
				1427	UTF8ToHtml(unsigned char* out, int *outlen,
				1428	const unsigned char* in, int *inlen) {
				1429	const unsigned char* processed = in;
				1430	const unsigned char* outend;
				1431	const unsigned char* outstart = out;
				1432	const unsigned char* instart = in;
				1433	const unsigned char* inend;
				1434	unsigned int c, d;
				1435	int trailing;
				1436
				1437	if (in == NULL) {
				1438	/*
				1439	* initialization nothing to do
				1440	*/
				1441	*outlen = 0;
				1442	*inlen = 0;
				1443	return(0);
				1444	}
				1445	inend = in + (*inlen);
				1446	outend = out + (*outlen);
				1447	while (in < inend) {
				1448	d = *in++;
				1449	if (d < 0x80) { c= d; trailing= 0; }
				1450	else if (d < 0xC0) {
				1451	/* trailing byte in leading position */
				1452	*outlen = out - outstart;
				1453	*inlen = processed - instart;
				1454	return(-2);
				1455	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1456	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1457	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1458	else {
				1459	/* no chance for this in Ascii */
				1460	*outlen = out - outstart;
				1461	*inlen = processed - instart;
				1462	return(-2);
				1463	}
				1464
				1465	if (inend - in < trailing) {
				1466	break;
				1467	}
				1468
				1469	for ( ; trailing; trailing--) {
				1470	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1471	break;
				1472	c <<= 6;
				1473	c \|= d & 0x3F;
				1474	}
				1475
				1476	/* assertion: c is a single UTF-4 value */
				1477	if (c < 0x80) {
				1478	if (out + 1 >= outend)
				1479	break;
				1480	*out++ = c;
				1481	} else {
				1482	int len;
				1483	htmlEntityDescPtr ent;
				1484
				1485	/*
				1486	* Try to lookup a predefined HTML entity for it
				1487	*/
				1488
				1489	ent = htmlEntityValueLookup(c);
				1490	if (ent == NULL) {
				1491	/* no chance for this in Ascii */
				1492	*outlen = out - outstart;
				1493	*inlen = processed - instart;
				1494	return(-2);
				1495	}
				1496	len = strlen(ent->name);
				1497	if (out + 2 + len >= outend)
				1498	break;
				1499	*out++ = '&';
				1500	memcpy(out, ent->name, len);
				1501	out += len;
				1502	*out++ = ';';
				1503	}
				1504	processed = in;
				1505	}
				1506	*outlen = out - outstart;
				1507	*inlen = processed - instart;
				1508	return(0);
				1509	}
				1510
				1511	/**
				1512	* htmlEncodeEntities:
				1513	* @out: a pointer to an array of bytes to store the result
				1514	* @outlen: the length of @out
				1515	* @in: a pointer to an array of UTF-8 chars
				1516	* @inlen: the length of @in
				1517	* @quoteChar: the quote character to escape (' or ") or zero.
				1518	*
				1519	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1520	* plus HTML entities block of chars out.
				1521	*
				1522	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1523	* The value of @inlen after return is the number of octets consumed
				1524	* as the return value is positive, else unpredictiable.
				1525	* The value of @outlen after return is the number of octets consumed.
				1526	*/
				1527	int
				1528	htmlEncodeEntities(unsigned char* out, int *outlen,
				1529	const unsigned char* in, int *inlen, int quoteChar) {
				1530	const unsigned char* processed = in;
				1531	const unsigned char* outend = out + (*outlen);
				1532	const unsigned char* outstart = out;
				1533	const unsigned char* instart = in;
				1534	const unsigned char* inend = in + (*inlen);
				1535	unsigned int c, d;
				1536	int trailing;
				1537
				1538	while (in < inend) {
				1539	d = *in++;
				1540	if (d < 0x80) { c= d; trailing= 0; }
				1541	else if (d < 0xC0) {
				1542	/* trailing byte in leading position */
				1543	*outlen = out - outstart;
				1544	*inlen = processed - instart;
				1545	return(-2);
				1546	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1547	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1548	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1549	else {
				1550	/* no chance for this in Ascii */
				1551	*outlen = out - outstart;
				1552	*inlen = processed - instart;
				1553	return(-2);
				1554	}
				1555
				1556	if (inend - in < trailing)
				1557	break;
				1558
				1559	while (trailing--) {
				1560	if (((d= *in++) & 0xC0) != 0x80) {
				1561	*outlen = out - outstart;
				1562	*inlen = processed - instart;
				1563	return(-2);
				1564	}
				1565	c <<= 6;
				1566	c \|= d & 0x3F;
				1567	}
				1568
				1569	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1570	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1571	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1572	if (out >= outend)
				1573	break;
				1574	*out++ = c;
				1575	} else {
				1576	htmlEntityDescPtr ent;
				1577	const char *cp;
				1578	char nbuf[16];
				1579	int len;
				1580
				1581	/*
				1582	* Try to lookup a predefined HTML entity for it
				1583	*/
				1584	ent = htmlEntityValueLookup(c);
				1585	if (ent == NULL) {
				1586	sprintf(nbuf, "#%u", c);
				1587	cp = nbuf;
				1588	}
				1589	else
				1590	cp = ent->name;
				1591	len = strlen(cp);
				1592	if (out + 2 + len > outend)
				1593	break;
				1594	*out++ = '&';
				1595	memcpy(out, cp, len);
				1596	out += len;
				1597	*out++ = ';';
				1598	}
				1599	processed = in;
				1600	}
				1601	*outlen = out - outstart;
				1602	*inlen = processed - instart;
				1603	return(0);
				1604	}
				1605
				1606	/**
				1607	* htmlDecodeEntities:
				1608	* @ctxt: the parser context
				1609	* @len: the len to decode (in bytes !), -1 for no size limit
				1610	* @end: an end marker xmlChar, 0 if none
				1611	* @end2: an end marker xmlChar, 0 if none
				1612	* @end3: an end marker xmlChar, 0 if none
				1613	*
				1614	* Subtitute the HTML entities by their value
				1615	*
				1616	* DEPRECATED !!!!
				1617	*
				1618	* Returns A newly allocated string with the substitution done. The caller
				1619	* must deallocate it !
				1620	*/
				1621	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1622	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1623	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1624	static int deprecated = 0;
				1625	if (!deprecated) {
				1626	xmlGenericError(xmlGenericErrorContext,
				1627	"htmlDecodeEntities() deprecated function reached\n");
				1628	deprecated = 1;
				1629	}
				1630	return(NULL);
				1631	#if 0
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1632	xmlChar *name = NULL;
				1633	xmlChar *buffer = NULL;
				1634	unsigned int buffer_size = 0;
				1635	unsigned int nbchars = 0;
				1636	htmlEntityDescPtr ent;
				1637	unsigned int max = (unsigned int) len;
				1638	int c,l;
				1639
				1640	if (ctxt->depth > 40) {
				1641	ctxt->errNo = XML_ERR_ENTITY_LOOP;
				1642	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1643	ctxt->sax->error(ctxt->userData,
				1644	"Detected entity reference loop\n");
				1645	ctxt->wellFormed = 0;
				1646	ctxt->disableSAX = 1;
				1647	return(NULL);
				1648	}
				1649
				1650	/*
				1651	* allocate a translation buffer.
				1652	*/
				1653	buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
				1654	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1655	if (buffer == NULL) {
				1656	perror("xmlDecodeEntities: malloc failed");
				1657	return(NULL);
				1658	}
				1659
				1660	/*
				1661	* Ok loop until we reach one of the ending char or a size limit.
				1662	*/
				1663	c = CUR_CHAR(l);
				1664	while ((nbchars < max) && (c != end) &&
				1665	(c != end2) && (c != end3)) {
				1666
				1667	if (c == 0) break;
				1668	if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
				1669	int val = htmlParseCharRef(ctxt);
				1670	COPY_BUF(0,buffer,nbchars,val);
				1671	NEXTL(l);
				1672	} else if ((c == '&') && (ctxt->token != '&')) {
				1673	ent = htmlParseEntityRef(ctxt, &name);
				1674	if (name != NULL) {
				1675	if (ent != NULL) {
				1676	int val = ent->value;
				1677	COPY_BUF(0,buffer,nbchars,val);
				1678	NEXTL(l);
				1679	} else {
				1680	const xmlChar *cur = name;
				1681
				1682	buffer[nbchars++] = '&';
				1683	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1684	growBuffer(buffer);
				1685	}
				1686	while (*cur != 0) {
				1687	buffer[nbchars++] = *cur++;
				1688	}
				1689	buffer[nbchars++] = ';';
				1690	}
				1691	}
				1692	} else {
				1693	COPY_BUF(l,buffer,nbchars,c);
				1694	NEXTL(l);
				1695	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1696	growBuffer(buffer);
				1697	}
				1698	}
				1699	c = CUR_CHAR(l);
				1700	}
				1701	buffer[nbchars++] = 0;
				1702	return(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1703	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1704	}
				1705
				1706	/************************************************************************
				1707	* *
				1708	* Commodity functions to handle streams *
				1709	* *
				1710	************************************************************************/
				1711
				1712	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1713	* htmlNewInputStream:
				1714	* @ctxt: an HTML parser context
				1715	*
				1716	* Create a new input stream structure
				1717	* Returns the new input stream or NULL
				1718	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1719	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1720	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1721	htmlParserInputPtr input;
				1722
				1723	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1724	if (input == NULL) {
				1725	ctxt->errNo = XML_ERR_NO_MEMORY;
				1726	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1727	ctxt->sax->error(ctxt->userData,
				1728	"malloc: couldn't allocate a new input stream\n");
				1729	return(NULL);
				1730	}
				1731	memset(input, 0, sizeof(htmlParserInput));
				1732	input->filename = NULL;
				1733	input->directory = NULL;
				1734	input->base = NULL;
				1735	input->cur = NULL;
				1736	input->buf = NULL;
				1737	input->line = 1;
				1738	input->col = 1;
				1739	input->buf = NULL;
				1740	input->free = NULL;
				1741	input->version = NULL;
				1742	input->consumed = 0;
				1743	input->length = 0;
				1744	return(input);
				1745	}
				1746
				1747
				1748	/************************************************************************
				1749	* *
				1750	* Commodity functions, cleanup needed ? *
				1751	* *
				1752	************************************************************************/
				1753
				1754	/**
				1755	* areBlanks:
				1756	* @ctxt: an HTML parser context
				1757	* @str: a xmlChar *
				1758	* @len: the size of @str
				1759	*
				1760	* Is this a sequence of blank chars that one can ignore ?
				1761	*
				1762	* Returns 1 if ignorable 0 otherwise.
				1763	*/
				1764
				1765	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				1766	int i;
				1767	xmlNodePtr lastChild;
				1768
				1769	for (i = 0;i < len;i++)
				1770	if (!(IS_BLANK(str[i]))) return(0);
				1771
				1772	if (CUR == 0) return(1);
				1773	if (CUR != '<') return(0);
				1774	if (ctxt->name == NULL)
				1775	return(1);
				1776	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1777	return(1);
				1778	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1779	return(1);
				1780	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1781	return(1);
				1782	if (ctxt->node == NULL) return(0);
				1783	lastChild = xmlGetLastChild(ctxt->node);
				1784	if (lastChild == NULL) {
				1785	if (ctxt->node->content != NULL) return(0);
				1786	} else if (xmlNodeIsText(lastChild)) {
				1787	return(0);
				1788	} else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
				1789	return(0);
				1790	} else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
				1791	return(0);
				1792	} else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
				1793	return(0);
				1794	}
				1795	return(1);
				1796	}
				1797
				1798	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1799	* htmlNewDocNoDtD:
				1800	* @URI: URI for the dtd, or NULL
				1801	* @ExternalID: the external ID of the DTD, or NULL
				1802	*
				1803	* Returns a new document, do not intialize the DTD if not provided
				1804	*/
				1805	htmlDocPtr
				1806	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1807	xmlDocPtr cur;
				1808
				1809	/*
				1810	* Allocate a new document and fill the fields.
				1811	*/
				1812	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1813	if (cur == NULL) {
				1814	xmlGenericError(xmlGenericErrorContext,
				1815	"xmlNewDoc : malloc failed\n");
				1816	return(NULL);
				1817	}
				1818	memset(cur, 0, sizeof(xmlDoc));
				1819
				1820	cur->type = XML_HTML_DOCUMENT_NODE;
				1821	cur->version = NULL;
				1822	cur->intSubset = NULL;
				1823	if ((ExternalID != NULL) \|\|
				1824	(URI != NULL))
				1825	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
				1826	cur->doc = cur;
				1827	cur->name = NULL;
				1828	cur->children = NULL;
				1829	cur->extSubset = NULL;
				1830	cur->oldNs = NULL;
				1831	cur->encoding = NULL;
				1832	cur->standalone = 1;
				1833	cur->compression = 0;
				1834	cur->ids = NULL;
				1835	cur->refs = NULL;
				1836	#ifndef XML_WITHOUT_CORBA
				1837	cur->_private = NULL;
				1838	#endif
				1839	return(cur);
				1840	}
				1841
				1842	/**
				1843	* htmlNewDoc:
				1844	* @URI: URI for the dtd, or NULL
				1845	* @ExternalID: the external ID of the DTD, or NULL
				1846	*
				1847	* Returns a new document
				1848	*/
				1849	htmlDocPtr
				1850	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1851	if ((URI == NULL) && (ExternalID == NULL))
				1852	return(htmlNewDocNoDtD(
Daniel Veillard	6426935	2001-05-04 17:52:34 +0000	[diff] [blame]	1853	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				1854	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1855
				1856	return(htmlNewDocNoDtD(URI, ExternalID));
				1857	}
				1858
				1859
				1860	/************************************************************************
				1861	* *
				1862	* The parser itself *
				1863	* Relates to http://www.w3.org/TR/html40 *
				1864	* *
				1865	************************************************************************/
				1866
				1867	/************************************************************************
				1868	* *
				1869	* The parser itself *
				1870	* *
				1871	************************************************************************/
				1872
				1873	/**
				1874	* htmlParseHTMLName:
				1875	* @ctxt: an HTML parser context
				1876	*
				1877	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1878	* since HTML names are not case-sensitive.
				1879	*
				1880	* Returns the Tag Name parsed or NULL
				1881	*/
				1882
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1883	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1884	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1885	xmlChar *ret = NULL;
				1886	int i = 0;
				1887	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1888
				1889	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1890	(CUR != ':')) return(NULL);
				1891
				1892	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1893	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1894	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1895	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1896	else loc[i] = CUR;
				1897	i++;
				1898
				1899	NEXT;
				1900	}
				1901
				1902	ret = xmlStrndup(loc, i);
				1903
				1904	return(ret);
				1905	}
				1906
				1907	/**
				1908	* htmlParseName:
				1909	* @ctxt: an HTML parser context
				1910	*
				1911	* parse an HTML name, this routine is case sensistive.
				1912	*
				1913	* Returns the Name parsed or NULL
				1914	*/
				1915
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1916	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1917	htmlParseName(htmlParserCtxtPtr ctxt) {
				1918	xmlChar buf[HTML_MAX_NAMELEN];
				1919	int len = 0;
				1920
				1921	GROW;
				1922	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1923	return(NULL);
				1924	}
				1925
				1926	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1927	(CUR == '.') \|\| (CUR == '-') \|\|
				1928	(CUR == '_') \|\| (CUR == ':') \|\|
				1929	(IS_COMBINING(CUR)) \|\|
				1930	(IS_EXTENDER(CUR))) {
				1931	buf[len++] = CUR;
				1932	NEXT;
				1933	if (len >= HTML_MAX_NAMELEN) {
				1934	xmlGenericError(xmlGenericErrorContext,
				1935	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1936	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1937	(CUR == '.') \|\| (CUR == '-') \|\|
				1938	(CUR == '_') \|\| (CUR == ':') \|\|
				1939	(IS_COMBINING(CUR)) \|\|
				1940	(IS_EXTENDER(CUR)))
				1941	NEXT;
				1942	break;
				1943	}
				1944	}
				1945	return(xmlStrndup(buf, len));
				1946	}
				1947
				1948	/**
				1949	* htmlParseHTMLAttribute:
				1950	* @ctxt: an HTML parser context
				1951	* @stop: a char stop value
				1952	*
				1953	* parse an HTML attribute value till the stop (quote), if
				1954	* stop is 0 then it stops at the first space
				1955	*
				1956	* Returns the attribute parsed or NULL
				1957	*/
				1958
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1959	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1960	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1961	xmlChar *buffer = NULL;
				1962	int buffer_size = 0;
				1963	xmlChar *out = NULL;
				1964	xmlChar *name = NULL;
				1965
				1966	xmlChar *cur = NULL;
				1967	htmlEntityDescPtr ent;
				1968
				1969	/*
				1970	* allocate a translation buffer.
				1971	*/
				1972	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1973	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1974	if (buffer == NULL) {
				1975	perror("htmlParseHTMLAttribute: malloc failed");
				1976	return(NULL);
				1977	}
				1978	out = buffer;
				1979
				1980	/*
				1981	* Ok loop until we reach one of the ending chars
				1982	*/
				1983	while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
				1984	if ((stop == 0) && (IS_BLANK(CUR))) break;
				1985	if (CUR == '&') {
				1986	if (NXT(1) == '#') {
				1987	unsigned int c;
				1988	int bits;
				1989
				1990	c = htmlParseCharRef(ctxt);
				1991	if (c < 0x80)
				1992	{ *out++ = c; bits= -6; }
				1993	else if (c < 0x800)
				1994	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1995	else if (c < 0x10000)
				1996	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1997	else
				1998	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1999
				2000	for ( ; bits >= 0; bits-= 6) {
				2001	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2002	}
				2003	} else {
				2004	ent = htmlParseEntityRef(ctxt, &name);
				2005	if (name == NULL) {
				2006	*out++ = '&';
				2007	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2008	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2009
				2010	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2011	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2012	}
				2013	} else if (ent == NULL) {
				2014	*out++ = '&';
				2015	cur = name;
				2016	while (*cur != 0) {
				2017	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2018	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2019
				2020	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2021	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2022	}
				2023	out++ = cur++;
				2024	}
				2025	xmlFree(name);
				2026	} else {
				2027	unsigned int c;
				2028	int bits;
				2029
				2030	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2031	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2032
				2033	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2034	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2035	}
				2036	c = (xmlChar)ent->value;
				2037	if (c < 0x80)
				2038	{ *out++ = c; bits= -6; }
				2039	else if (c < 0x800)
				2040	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2041	else if (c < 0x10000)
				2042	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2043	else
				2044	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2045
				2046	for ( ; bits >= 0; bits-= 6) {
				2047	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2048	}
				2049	xmlFree(name);
				2050	}
				2051	}
				2052	} else {
				2053	unsigned int c;
				2054	int bits, l;
				2055
				2056	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2057	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2058
				2059	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2060	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2061	}
				2062	c = CUR_CHAR(l);
				2063	if (c < 0x80)
				2064	{ *out++ = c; bits= -6; }
				2065	else if (c < 0x800)
				2066	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2067	else if (c < 0x10000)
				2068	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2069	else
				2070	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2071
				2072	for ( ; bits >= 0; bits-= 6) {
				2073	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2074	}
				2075	NEXT;
				2076	}
				2077	}
				2078	*out++ = 0;
				2079	return(buffer);
				2080	}
				2081
				2082	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2083	* htmlParseEntityRef:
				2084	* @ctxt: an HTML parser context
				2085	* @str: location to store the entity name
				2086	*
				2087	* parse an HTML ENTITY references
				2088	*
				2089	* [68] EntityRef ::= '&' Name ';'
				2090	*
				2091	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2092	* if non-NULL *str will have to be freed by the caller.
				2093	*/
				2094	htmlEntityDescPtr
				2095	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2096	xmlChar *name;
				2097	htmlEntityDescPtr ent = NULL;
				2098	*str = NULL;
				2099
				2100	if (CUR == '&') {
				2101	NEXT;
				2102	name = htmlParseName(ctxt);
				2103	if (name == NULL) {
				2104	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2105	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2106	ctxt->wellFormed = 0;
				2107	} else {
				2108	GROW;
				2109	if (CUR == ';') {
				2110	*str = name;
				2111
				2112	/*
				2113	* Lookup the entity in the table.
				2114	*/
				2115	ent = htmlEntityLookup(name);
				2116	if (ent != NULL) /* OK that's ugly !!! */
				2117	NEXT;
				2118	} else {
				2119	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2120	ctxt->sax->error(ctxt->userData,
				2121	"htmlParseEntityRef: expecting ';'\n");
				2122	*str = name;
				2123	}
				2124	}
				2125	}
				2126	return(ent);
				2127	}
				2128
				2129	/**
				2130	* htmlParseAttValue:
				2131	* @ctxt: an HTML parser context
				2132	*
				2133	* parse a value for an attribute
				2134	* Note: the parser won't do substitution of entities here, this
				2135	* will be handled later in xmlStringGetNodeList, unless it was
				2136	* asked for ctxt->replaceEntities != 0
				2137	*
				2138	* Returns the AttValue parsed or NULL.
				2139	*/
				2140
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2141	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2142	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2143	xmlChar *ret = NULL;
				2144
				2145	if (CUR == '"') {
				2146	NEXT;
				2147	ret = htmlParseHTMLAttribute(ctxt, '"');
				2148	if (CUR != '"') {
				2149	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2150	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2151	ctxt->wellFormed = 0;
				2152	} else
				2153	NEXT;
				2154	} else if (CUR == '\'') {
				2155	NEXT;
				2156	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2157	if (CUR != '\'') {
				2158	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2159	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2160	ctxt->wellFormed = 0;
				2161	} else
				2162	NEXT;
				2163	} else {
				2164	/*
				2165	* That's an HTMLism, the attribute value may not be quoted
				2166	*/
				2167	ret = htmlParseHTMLAttribute(ctxt, 0);
				2168	if (ret == NULL) {
				2169	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2170	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2171	ctxt->wellFormed = 0;
				2172	}
				2173	}
				2174	return(ret);
				2175	}
				2176
				2177	/**
				2178	* htmlParseSystemLiteral:
				2179	* @ctxt: an HTML parser context
				2180	*
				2181	* parse an HTML Literal
				2182	*
				2183	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2184	*
				2185	* Returns the SystemLiteral parsed or NULL
				2186	*/
				2187
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2188	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2189	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2190	const xmlChar *q;
				2191	xmlChar *ret = NULL;
				2192
				2193	if (CUR == '"') {
				2194	NEXT;
				2195	q = CUR_PTR;
				2196	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2197	NEXT;
				2198	if (!IS_CHAR(CUR)) {
				2199	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2200	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2201	ctxt->wellFormed = 0;
				2202	} else {
				2203	ret = xmlStrndup(q, CUR_PTR - q);
				2204	NEXT;
				2205	}
				2206	} else if (CUR == '\'') {
				2207	NEXT;
				2208	q = CUR_PTR;
				2209	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2210	NEXT;
				2211	if (!IS_CHAR(CUR)) {
				2212	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2213	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2214	ctxt->wellFormed = 0;
				2215	} else {
				2216	ret = xmlStrndup(q, CUR_PTR - q);
				2217	NEXT;
				2218	}
				2219	} else {
				2220	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2221	ctxt->sax->error(ctxt->userData,
				2222	"SystemLiteral \" or ' expected\n");
				2223	ctxt->wellFormed = 0;
				2224	}
				2225
				2226	return(ret);
				2227	}
				2228
				2229	/**
				2230	* htmlParsePubidLiteral:
				2231	* @ctxt: an HTML parser context
				2232	*
				2233	* parse an HTML public literal
				2234	*
				2235	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2236	*
				2237	* Returns the PubidLiteral parsed or NULL.
				2238	*/
				2239
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2240	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2241	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2242	const xmlChar *q;
				2243	xmlChar *ret = NULL;
				2244	/*
				2245	* Name ::= (Letter \| '_') (NameChar)*
				2246	*/
				2247	if (CUR == '"') {
				2248	NEXT;
				2249	q = CUR_PTR;
				2250	while (IS_PUBIDCHAR(CUR)) NEXT;
				2251	if (CUR != '"') {
				2252	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2253	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2254	ctxt->wellFormed = 0;
				2255	} else {
				2256	ret = xmlStrndup(q, CUR_PTR - q);
				2257	NEXT;
				2258	}
				2259	} else if (CUR == '\'') {
				2260	NEXT;
				2261	q = CUR_PTR;
				2262	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2263	NEXT;
				2264	if (!IS_LETTER(CUR)) {
				2265	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2266	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2267	ctxt->wellFormed = 0;
				2268	} else {
				2269	ret = xmlStrndup(q, CUR_PTR - q);
				2270	NEXT;
				2271	}
				2272	} else {
				2273	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2274	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2275	ctxt->wellFormed = 0;
				2276	}
				2277
				2278	return(ret);
				2279	}
				2280
				2281	/**
				2282	* htmlParseScript:
				2283	* @ctxt: an HTML parser context
				2284	*
				2285	* parse the content of an HTML SCRIPT or STYLE element
				2286	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2287	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2288	* http://www.w3.org/TR/html4/types.html#type-script
				2289	* http://www.w3.org/TR/html4/types.html#h-6.15
				2290	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2291	*
				2292	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2293	* element and the value of intrinsic event attributes. User agents must
				2294	* not evaluate script data as HTML markup but instead must pass it on as
				2295	* data to a script engine.
				2296	* NOTES:
				2297	* - The content is passed like CDATA
				2298	* - the attributes for style and scripting "onXXX" are also described
				2299	* as CDATA but SGML allows entities references in attributes so their
				2300	* processing is identical as other attributes
				2301	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2302	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2303	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2304	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2305	int nbchar = 0;
				2306	xmlChar cur;
				2307
				2308	SHRINK;
				2309	cur = CUR;
				2310	while (IS_CHAR(cur)) {
				2311	if ((cur == '<') && (NXT(1) == '/')) {
				2312	/*
				2313	* One should break here, the specification is clear:
				2314	* Authors should therefore escape "</" within the content.
				2315	* Escape mechanisms are specific to each scripting or
				2316	* style sheet language.
				2317	*/
				2318	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2319	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2320	break; /* while */
				2321	}
				2322	buf[nbchar++] = cur;
				2323	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2324	if (ctxt->sax->cdataBlock!= NULL) {
				2325	/*
				2326	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2327	*/
				2328	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2329	}
				2330	nbchar = 0;
				2331	}
				2332	NEXT;
				2333	cur = CUR;
				2334	}
				2335	if (!(IS_CHAR(cur))) {
				2336	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2337	ctxt->sax->error(ctxt->userData,
				2338	"Invalid char in CDATA 0x%X\n", cur);
				2339	ctxt->wellFormed = 0;
				2340	NEXT;
				2341	}
				2342
				2343	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2344	if (ctxt->sax->cdataBlock!= NULL) {
				2345	/*
				2346	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2347	*/
				2348	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2349	}
				2350	}
				2351	}
				2352
				2353
				2354	/**
				2355	* htmlParseCharData:
				2356	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2357	*
				2358	* parse a CharData section.
				2359	* if we are within a CDATA section ']]>' marks an end of section.
				2360	*
				2361	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2362	*/
				2363
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2364	static void
				2365	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2366	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2367	int nbchar = 0;
				2368	int cur, l;
				2369
				2370	SHRINK;
				2371	cur = CUR_CHAR(l);
				2372	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2373	((cur != '&') \|\| (ctxt->token == '&')) &&
				2374	(IS_CHAR(cur))) {
				2375	COPY_BUF(l,buf,nbchar,cur);
				2376	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2377	/*
				2378	* Ok the segment is to be consumed as chars.
				2379	*/
				2380	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2381	if (areBlanks(ctxt, buf, nbchar)) {
				2382	if (ctxt->sax->ignorableWhitespace != NULL)
				2383	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2384	buf, nbchar);
				2385	} else {
				2386	htmlCheckParagraph(ctxt);
				2387	if (ctxt->sax->characters != NULL)
				2388	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2389	}
				2390	}
				2391	nbchar = 0;
				2392	}
				2393	NEXTL(l);
				2394	cur = CUR_CHAR(l);
				2395	}
				2396	if (nbchar != 0) {
				2397	/*
				2398	* Ok the segment is to be consumed as chars.
				2399	*/
				2400	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2401	if (areBlanks(ctxt, buf, nbchar)) {
				2402	if (ctxt->sax->ignorableWhitespace != NULL)
				2403	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2404	} else {
				2405	htmlCheckParagraph(ctxt);
				2406	if (ctxt->sax->characters != NULL)
				2407	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2408	}
				2409	}
				2410	}
				2411	}
				2412
				2413	/**
				2414	* htmlParseExternalID:
				2415	* @ctxt: an HTML parser context
				2416	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2417	*
				2418	* Parse an External ID or a Public ID
				2419	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2420	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2421	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2422	*
				2423	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2424	*
				2425	* Returns the SystemLiteral; in the second case publicID also
				2426	* receives the PubidLiteral. If no SystemLiteral follows the
				2427	* PubidLiteral it is possible to return NULL and have publicID set.
				2428	*/
				2429
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2430	static xmlChar *
				2431	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2432	xmlChar *URI = NULL;
				2433
				2434	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2435	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2436	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2437	SKIP(6);
				2438	if (!IS_BLANK(CUR)) {
				2439	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2440	ctxt->sax->error(ctxt->userData,
				2441	"Space required after 'SYSTEM'\n");
				2442	ctxt->wellFormed = 0;
				2443	}
				2444	SKIP_BLANKS;
				2445	URI = htmlParseSystemLiteral(ctxt);
				2446	if (URI == NULL) {
				2447	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2448	ctxt->sax->error(ctxt->userData,
				2449	"htmlParseExternalID: SYSTEM, no URI\n");
				2450	ctxt->wellFormed = 0;
				2451	}
				2452	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2453	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2454	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2455	SKIP(6);
				2456	if (!IS_BLANK(CUR)) {
				2457	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2458	ctxt->sax->error(ctxt->userData,
				2459	"Space required after 'PUBLIC'\n");
				2460	ctxt->wellFormed = 0;
				2461	}
				2462	SKIP_BLANKS;
				2463	*publicID = htmlParsePubidLiteral(ctxt);
				2464	if (*publicID == NULL) {
				2465	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2466	ctxt->sax->error(ctxt->userData,
				2467	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2468	ctxt->wellFormed = 0;
				2469	}
				2470	SKIP_BLANKS;
				2471	if ((CUR == '"') \|\| (CUR == '\'')) {
				2472	URI = htmlParseSystemLiteral(ctxt);
				2473	}
				2474	}
				2475	return(URI);
				2476	}
				2477
				2478	/**
				2479	* htmlParseComment:
				2480	* @ctxt: an HTML parser context
				2481	*
				2482	* Parse an XML (SGML) comment <!-- .... -->
				2483	*
				2484	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2485	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2486	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2487	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2488	xmlChar *buf = NULL;
				2489	int len;
				2490	int size = HTML_PARSER_BUFFER_SIZE;
				2491	int q, ql;
				2492	int r, rl;
				2493	int cur, l;
				2494	xmlParserInputState state;
				2495
				2496	/*
				2497	* Check that there is a comment right here.
				2498	*/
				2499	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2500	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2501
				2502	state = ctxt->instate;
				2503	ctxt->instate = XML_PARSER_COMMENT;
				2504	SHRINK;
				2505	SKIP(4);
				2506	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2507	if (buf == NULL) {
				2508	xmlGenericError(xmlGenericErrorContext,
				2509	"malloc of %d byte failed\n", size);
				2510	ctxt->instate = state;
				2511	return;
				2512	}
				2513	q = CUR_CHAR(ql);
				2514	NEXTL(ql);
				2515	r = CUR_CHAR(rl);
				2516	NEXTL(rl);
				2517	cur = CUR_CHAR(l);
				2518	len = 0;
				2519	while (IS_CHAR(cur) &&
				2520	((cur != '>') \|\|
				2521	(r != '-') \|\| (q != '-'))) {
				2522	if (len + 5 >= size) {
				2523	size *= 2;
				2524	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2525	if (buf == NULL) {
				2526	xmlGenericError(xmlGenericErrorContext,
				2527	"realloc of %d byte failed\n", size);
				2528	ctxt->instate = state;
				2529	return;
				2530	}
				2531	}
				2532	COPY_BUF(ql,buf,len,q);
				2533	q = r;
				2534	ql = rl;
				2535	r = cur;
				2536	rl = l;
				2537	NEXTL(l);
				2538	cur = CUR_CHAR(l);
				2539	if (cur == 0) {
				2540	SHRINK;
				2541	GROW;
				2542	cur = CUR_CHAR(l);
				2543	}
				2544	}
				2545	buf[len] = 0;
				2546	if (!IS_CHAR(cur)) {
				2547	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2548	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2549	ctxt->sax->error(ctxt->userData,
				2550	"Comment not terminated \n<!--%.50s\n", buf);
				2551	ctxt->wellFormed = 0;
				2552	xmlFree(buf);
				2553	} else {
				2554	NEXT;
				2555	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2556	(!ctxt->disableSAX))
				2557	ctxt->sax->comment(ctxt->userData, buf);
				2558	xmlFree(buf);
				2559	}
				2560	ctxt->instate = state;
				2561	}
				2562
				2563	/**
				2564	* htmlParseCharRef:
				2565	* @ctxt: an HTML parser context
				2566	*
				2567	* parse Reference declarations
				2568	*
				2569	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2570	* '&#x' [0-9a-fA-F]+ ';'
				2571	*
				2572	* Returns the value parsed (as an int)
				2573	*/
				2574	int
				2575	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2576	int val = 0;
				2577
				2578	if ((CUR == '&') && (NXT(1) == '#') &&
				2579	(NXT(2) == 'x')) {
				2580	SKIP(3);
				2581	while (CUR != ';') {
				2582	if ((CUR >= '0') && (CUR <= '9'))
				2583	val = val * 16 + (CUR - '0');
				2584	else if ((CUR >= 'a') && (CUR <= 'f'))
				2585	val = val * 16 + (CUR - 'a') + 10;
				2586	else if ((CUR >= 'A') && (CUR <= 'F'))
				2587	val = val * 16 + (CUR - 'A') + 10;
				2588	else {
				2589	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2590	ctxt->sax->error(ctxt->userData,
				2591	"htmlParseCharRef: invalid hexadecimal value\n");
				2592	ctxt->wellFormed = 0;
				2593	return(0);
				2594	}
				2595	NEXT;
				2596	}
				2597	if (CUR == ';')
				2598	NEXT;
				2599	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2600	SKIP(2);
				2601	while (CUR != ';') {
				2602	if ((CUR >= '0') && (CUR <= '9'))
				2603	val = val * 10 + (CUR - '0');
				2604	else {
				2605	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2606	ctxt->sax->error(ctxt->userData,
				2607	"htmlParseCharRef: invalid decimal value\n");
				2608	ctxt->wellFormed = 0;
				2609	return(0);
				2610	}
				2611	NEXT;
				2612	}
				2613	if (CUR == ';')
				2614	NEXT;
				2615	} else {
				2616	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2617	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2618	ctxt->wellFormed = 0;
				2619	}
				2620	/*
				2621	* Check the value IS_CHAR ...
				2622	*/
				2623	if (IS_CHAR(val)) {
				2624	return(val);
				2625	} else {
				2626	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2627	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2628	val);
				2629	ctxt->wellFormed = 0;
				2630	}
				2631	return(0);
				2632	}
				2633
				2634
				2635	/**
				2636	* htmlParseDocTypeDecl :
				2637	* @ctxt: an HTML parser context
				2638	*
				2639	* parse a DOCTYPE declaration
				2640	*
				2641	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2642	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2643	*/
				2644
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2645	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2646	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2647	xmlChar *name;
				2648	xmlChar *ExternalID = NULL;
				2649	xmlChar *URI = NULL;
				2650
				2651	/*
				2652	* We know that '<!DOCTYPE' has been detected.
				2653	*/
				2654	SKIP(9);
				2655
				2656	SKIP_BLANKS;
				2657
				2658	/*
				2659	* Parse the DOCTYPE name.
				2660	*/
				2661	name = htmlParseName(ctxt);
				2662	if (name == NULL) {
				2663	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2664	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2665	ctxt->wellFormed = 0;
				2666	}
				2667	/*
				2668	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2669	*/
				2670
				2671	SKIP_BLANKS;
				2672
				2673	/*
				2674	* Check for SystemID and ExternalID
				2675	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2676	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2677	SKIP_BLANKS;
				2678
				2679	/*
				2680	* We should be at the end of the DOCTYPE declaration.
				2681	*/
				2682	if (CUR != '>') {
				2683	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2684	ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
				2685	ctxt->wellFormed = 0;
				2686	/* We shouldn't try to resynchronize ... */
				2687	}
				2688	NEXT;
				2689
				2690	/*
				2691	* Create or update the document accordingly to the DOCTYPE
				2692	*/
				2693	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2694	(!ctxt->disableSAX))
				2695	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2696
				2697	/*
				2698	* Cleanup, since we don't use all those identifiers
				2699	*/
				2700	if (URI != NULL) xmlFree(URI);
				2701	if (ExternalID != NULL) xmlFree(ExternalID);
				2702	if (name != NULL) xmlFree(name);
				2703	}
				2704
				2705	/**
				2706	* htmlParseAttribute:
				2707	* @ctxt: an HTML parser context
				2708	* @value: a xmlChar ** used to store the value of the attribute
				2709	*
				2710	* parse an attribute
				2711	*
				2712	* [41] Attribute ::= Name Eq AttValue
				2713	*
				2714	* [25] Eq ::= S? '=' S?
				2715	*
				2716	* With namespace:
				2717	*
				2718	* [NS 11] Attribute ::= QName Eq AttValue
				2719	*
				2720	* Also the case QName == xmlns:??? is handled independently as a namespace
				2721	* definition.
				2722	*
				2723	* Returns the attribute name, and the value in *value.
				2724	*/
				2725
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2726	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2727	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2728	xmlChar name, val = NULL;
				2729
				2730	*value = NULL;
				2731	name = htmlParseHTMLName(ctxt);
				2732	if (name == NULL) {
				2733	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2734	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2735	ctxt->wellFormed = 0;
				2736	return(NULL);
				2737	}
				2738
				2739	/*
				2740	* read the value
				2741	*/
				2742	SKIP_BLANKS;
				2743	if (CUR == '=') {
				2744	NEXT;
				2745	SKIP_BLANKS;
				2746	val = htmlParseAttValue(ctxt);
				2747	/******
				2748	} else {
				2749	* TODO : some attribute must have values, some may not
				2750	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2751	ctxt->sax->warning(ctxt->userData,
				2752	"No value for attribute %s\n", name); */
				2753	}
				2754
				2755	*value = val;
				2756	return(name);
				2757	}
				2758
				2759	/**
				2760	* htmlCheckEncoding:
				2761	* @ctxt: an HTML parser context
				2762	* @attvalue: the attribute value
				2763	*
				2764	* Checks an http-equiv attribute from a Meta tag to detect
				2765	* the encoding
				2766	* If a new encoding is detected the parser is switched to decode
				2767	* it and pass UTF8
				2768	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2769	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2770	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2771	const xmlChar *encoding;
				2772
				2773	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2774	return;
				2775
				2776	/* do not change encoding */
				2777	if (ctxt->input->encoding != NULL)
				2778	return;
				2779
				2780	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2781	if (encoding != NULL) {
				2782	encoding += 8;
				2783	} else {
				2784	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2785	if (encoding != NULL)
				2786	encoding += 9;
				2787	}
				2788	if (encoding != NULL) {
				2789	xmlCharEncoding enc;
				2790	xmlCharEncodingHandlerPtr handler;
				2791
				2792	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2793
				2794	if (ctxt->input->encoding != NULL)
				2795	xmlFree((xmlChar *) ctxt->input->encoding);
				2796	ctxt->input->encoding = xmlStrdup(encoding);
				2797
				2798	enc = xmlParseCharEncoding((const char *) encoding);
				2799	/*
				2800	* registered set of known encodings
				2801	*/
				2802	if (enc != XML_CHAR_ENCODING_ERROR) {
				2803	xmlSwitchEncoding(ctxt, enc);
				2804	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2805	} else {
				2806	/*
				2807	* fallback for unknown encodings
				2808	*/
				2809	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2810	if (handler != NULL) {
				2811	xmlSwitchToEncoding(ctxt, handler);
				2812	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2813	} else {
				2814	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2815	}
				2816	}
				2817
				2818	if ((ctxt->input->buf != NULL) &&
				2819	(ctxt->input->buf->encoder != NULL) &&
				2820	(ctxt->input->buf->raw != NULL) &&
				2821	(ctxt->input->buf->buffer != NULL)) {
				2822	int nbchars;
				2823	int processed;
				2824
				2825	/*
				2826	* convert as much as possible to the parser reading buffer.
				2827	*/
				2828	processed = ctxt->input->cur - ctxt->input->base;
				2829	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2830	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2831	ctxt->input->buf->buffer,
				2832	ctxt->input->buf->raw);
				2833	if (nbchars < 0) {
				2834	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2835	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2836	ctxt->sax->error(ctxt->userData,
				2837	"htmlCheckEncoding: encoder error\n");
				2838	}
				2839	ctxt->input->base =
				2840	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2841	}
				2842	}
				2843	}
				2844
				2845	/**
				2846	* htmlCheckMeta:
				2847	* @ctxt: an HTML parser context
				2848	* @atts: the attributes values
				2849	*
				2850	* Checks the attributes of a Meta tag
				2851	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2852	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2853	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2854	int i;
				2855	const xmlChar att, value;
				2856	int http = 0;
				2857	const xmlChar *content = NULL;
				2858
				2859	if ((ctxt == NULL) \|\| (atts == NULL))
				2860	return;
				2861
				2862	i = 0;
				2863	att = atts[i++];
				2864	while (att != NULL) {
				2865	value = atts[i++];
				2866	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2867	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2868	http = 1;
				2869	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2870	content = value;
				2871	att = atts[i++];
				2872	}
				2873	if ((http) && (content != NULL))
				2874	htmlCheckEncoding(ctxt, content);
				2875
				2876	}
				2877
				2878	/**
				2879	* htmlParseStartTag:
				2880	* @ctxt: an HTML parser context
				2881	*
				2882	* parse a start of tag either for rule element or
				2883	* EmptyElement. In both case we don't parse the tag closing chars.
				2884	*
				2885	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2886	*
				2887	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2888	*
				2889	* With namespace:
				2890	*
				2891	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2892	*
				2893	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2894	*
				2895	*/
				2896
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2897	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2898	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2899	xmlChar *name;
				2900	xmlChar *attname;
				2901	xmlChar *attvalue;
				2902	const xmlChar **atts = NULL;
				2903	int nbatts = 0;
				2904	int maxatts = 0;
				2905	int meta = 0;
				2906	int i;
				2907
				2908	if (CUR != '<') return;
				2909	NEXT;
				2910
				2911	GROW;
				2912	name = htmlParseHTMLName(ctxt);
				2913	if (name == NULL) {
				2914	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2915	ctxt->sax->error(ctxt->userData,
				2916	"htmlParseStartTag: invalid element name\n");
				2917	ctxt->wellFormed = 0;
				2918	/* Dump the bogus tag like browsers do */
				2919	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2920	NEXT;
				2921	return;
				2922	}
				2923	if (xmlStrEqual(name, BAD_CAST"meta"))
				2924	meta = 1;
				2925
				2926	/*
				2927	* Check for auto-closure of HTML elements.
				2928	*/
				2929	htmlAutoClose(ctxt, name);
				2930
				2931	/*
				2932	* Check for implied HTML elements.
				2933	*/
				2934	htmlCheckImplied(ctxt, name);
				2935
				2936	/*
				2937	* Avoid html at any level > 0, head at any level != 1
				2938	* or any attempt to recurse body
				2939	*/
				2940	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2941	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2942	ctxt->sax->error(ctxt->userData,
				2943	"htmlParseStartTag: misplaced <html> tag\n");
				2944	ctxt->wellFormed = 0;
				2945	xmlFree(name);
				2946	return;
				2947	}
				2948	if ((ctxt->nameNr != 1) &&
				2949	(xmlStrEqual(name, BAD_CAST"head"))) {
				2950	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2951	ctxt->sax->error(ctxt->userData,
				2952	"htmlParseStartTag: misplaced <head> tag\n");
				2953	ctxt->wellFormed = 0;
				2954	xmlFree(name);
				2955	return;
				2956	}
				2957	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2958	int indx;
				2959	for (indx = 0;indx < ctxt->nameNr;indx++) {
				2960	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2961	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2962	ctxt->sax->error(ctxt->userData,
				2963	"htmlParseStartTag: misplaced <body> tag\n");
				2964	ctxt->wellFormed = 0;
				2965	xmlFree(name);
				2966	return;
				2967	}
				2968	}
				2969	}
				2970
				2971	/*
				2972	* Now parse the attributes, it ends up with the ending
				2973	*
				2974	* (S Attribute)* S?
				2975	*/
				2976	SKIP_BLANKS;
				2977	while ((IS_CHAR(CUR)) &&
				2978	(CUR != '>') &&
				2979	((CUR != '/') \|\| (NXT(1) != '>'))) {
				2980	long cons = ctxt->nbChars;
				2981
				2982	GROW;
				2983	attname = htmlParseAttribute(ctxt, &attvalue);
				2984	if (attname != NULL) {
				2985
				2986	/*
				2987	* Well formedness requires at most one declaration of an attribute
				2988	*/
				2989	for (i = 0; i < nbatts;i += 2) {
				2990	if (xmlStrEqual(atts[i], attname)) {
				2991	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2992	ctxt->sax->error(ctxt->userData,
				2993	"Attribute %s redefined\n",
				2994	attname);
				2995	ctxt->wellFormed = 0;
				2996	xmlFree(attname);
				2997	if (attvalue != NULL)
				2998	xmlFree(attvalue);
				2999	goto failed;
				3000	}
				3001	}
				3002
				3003	/*
				3004	* Add the pair to atts
				3005	*/
				3006	if (atts == NULL) {
				3007	maxatts = 10;
				3008	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				3009	if (atts == NULL) {
				3010	xmlGenericError(xmlGenericErrorContext,
				3011	"malloc of %ld byte failed\n",
				3012	maxatts * (long)sizeof(xmlChar *));
				3013	if (name != NULL) xmlFree(name);
				3014	return;
				3015	}
				3016	} else if (nbatts + 4 > maxatts) {
				3017	maxatts *= 2;
				3018	atts = (const xmlChar *) xmlRealloc((void ) atts,
				3019	maxatts * sizeof(xmlChar *));
				3020	if (atts == NULL) {
				3021	xmlGenericError(xmlGenericErrorContext,
				3022	"realloc of %ld byte failed\n",
				3023	maxatts * (long)sizeof(xmlChar *));
				3024	if (name != NULL) xmlFree(name);
				3025	return;
				3026	}
				3027	}
				3028	atts[nbatts++] = attname;
				3029	atts[nbatts++] = attvalue;
				3030	atts[nbatts] = NULL;
				3031	atts[nbatts + 1] = NULL;
				3032	}
				3033	else {
				3034	/* Dump the bogus attribute string up to the next blank or
				3035	* the end of the tag. */
				3036	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				3037	&& ((CUR != '/') \|\| (NXT(1) != '>')))
				3038	NEXT;
				3039	}
				3040
				3041	failed:
				3042	SKIP_BLANKS;
				3043	if (cons == ctxt->nbChars) {
				3044	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3045	ctxt->sax->error(ctxt->userData,
				3046	"htmlParseStartTag: problem parsing attributes\n");
				3047	ctxt->wellFormed = 0;
				3048	break;
				3049	}
				3050	}
				3051
				3052	/*
				3053	* Handle specific association to the META tag
				3054	*/
				3055	if (meta)
				3056	htmlCheckMeta(ctxt, atts);
				3057
				3058	/*
				3059	* SAX: Start of Element !
				3060	*/
				3061	htmlnamePush(ctxt, xmlStrdup(name));
				3062	#ifdef DEBUG
				3063	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3064	#endif
				3065	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3066	ctxt->sax->startElement(ctxt->userData, name, atts);
				3067
				3068	if (atts != NULL) {
				3069	for (i = 0;i < nbatts;i++) {
				3070	if (atts[i] != NULL)
				3071	xmlFree((xmlChar *) atts[i]);
				3072	}
				3073	xmlFree((void *) atts);
				3074	}
				3075	if (name != NULL) xmlFree(name);
				3076	}
				3077
				3078	/**
				3079	* htmlParseEndTag:
				3080	* @ctxt: an HTML parser context
				3081	*
				3082	* parse an end of tag
				3083	*
				3084	* [42] ETag ::= '</' Name S? '>'
				3085	*
				3086	* With namespace
				3087	*
				3088	* [NS 9] ETag ::= '</' QName S? '>'
				3089	*/
				3090
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3091	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3092	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3093	xmlChar *name;
				3094	xmlChar *oldname;
				3095	int i;
				3096
				3097	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3098	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3099	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3100	ctxt->wellFormed = 0;
				3101	return;
				3102	}
				3103	SKIP(2);
				3104
				3105	name = htmlParseHTMLName(ctxt);
				3106	if (name == NULL) return;
				3107
				3108	/*
				3109	* We should definitely be at the ending "S? '>'" part
				3110	*/
				3111	SKIP_BLANKS;
				3112	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3113	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3114	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3115	ctxt->wellFormed = 0;
				3116	} else
				3117	NEXT;
				3118
				3119	/*
				3120	* If the name read is not one of the element in the parsing stack
				3121	* then return, it's just an error.
				3122	*/
				3123	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3124	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3125	}
				3126	if (i < 0) {
				3127	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3128	ctxt->sax->error(ctxt->userData,
				3129	"Unexpected end tag : %s\n", name);
				3130	xmlFree(name);
				3131	ctxt->wellFormed = 0;
				3132	return;
				3133	}
				3134
				3135
				3136	/*
				3137	* Check for auto-closure of HTML elements.
				3138	*/
				3139
				3140	htmlAutoCloseOnClose(ctxt, name);
				3141
				3142	/*
				3143	* Well formedness constraints, opening and closing must match.
				3144	* With the exception that the autoclose may have popped stuff out
				3145	* of the stack.
				3146	*/
				3147	if (!xmlStrEqual(name, ctxt->name)) {
				3148	#ifdef DEBUG
				3149	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3150	#endif
				3151	if ((ctxt->name != NULL) &&
				3152	(!xmlStrEqual(ctxt->name, name))) {
				3153	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3154	ctxt->sax->error(ctxt->userData,
				3155	"Opening and ending tag mismatch: %s and %s\n",
				3156	name, ctxt->name);
				3157	ctxt->wellFormed = 0;
				3158	}
				3159	}
				3160
				3161	/*
				3162	* SAX: End of Tag
				3163	*/
				3164	oldname = ctxt->name;
				3165	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3166	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3167	ctxt->sax->endElement(ctxt->userData, name);
				3168	oldname = htmlnamePop(ctxt);
				3169	if (oldname != NULL) {
				3170	#ifdef DEBUG
				3171	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3172	#endif
				3173	xmlFree(oldname);
				3174	#ifdef DEBUG
				3175	} else {
				3176	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3177	#endif
				3178	}
				3179	}
				3180
				3181	if (name != NULL)
				3182	xmlFree(name);
				3183
				3184	return;
				3185	}
				3186
				3187
				3188	/**
				3189	* htmlParseReference:
				3190	* @ctxt: an HTML parser context
				3191	*
				3192	* parse and handle entity references in content,
				3193	* this will end-up in a call to character() since this is either a
				3194	* CharRef, or a predefined entity.
				3195	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3196	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3197	htmlParseReference(htmlParserCtxtPtr ctxt) {
				3198	htmlEntityDescPtr ent;
				3199	xmlChar out[6];
				3200	xmlChar *name;
				3201	if (CUR != '&') return;
				3202
				3203	if (NXT(1) == '#') {
				3204	unsigned int c;
				3205	int bits, i = 0;
				3206
				3207	c = htmlParseCharRef(ctxt);
				3208	if (c == 0)
				3209	return;
				3210
				3211	if (c < 0x80) { out[i++]= c; bits= -6; }
				3212	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3213	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3214	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3215
				3216	for ( ; bits >= 0; bits-= 6) {
				3217	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3218	}
				3219	out[i] = 0;
				3220
				3221	htmlCheckParagraph(ctxt);
				3222	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3223	ctxt->sax->characters(ctxt->userData, out, i);
				3224	} else {
				3225	ent = htmlParseEntityRef(ctxt, &name);
				3226	if (name == NULL) {
				3227	htmlCheckParagraph(ctxt);
				3228	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3229	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3230	return;
				3231	}
				3232	if ((ent == NULL) \|\| (ent->value <= 0)) {
				3233	htmlCheckParagraph(ctxt);
				3234	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3235	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3236	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3237	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3238	}
				3239	} else {
				3240	unsigned int c;
				3241	int bits, i = 0;
				3242
				3243	c = ent->value;
				3244	if (c < 0x80)
				3245	{ out[i++]= c; bits= -6; }
				3246	else if (c < 0x800)
				3247	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3248	else if (c < 0x10000)
				3249	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3250	else
				3251	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3252
				3253	for ( ; bits >= 0; bits-= 6) {
				3254	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3255	}
				3256	out[i] = 0;
				3257
				3258	htmlCheckParagraph(ctxt);
				3259	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3260	ctxt->sax->characters(ctxt->userData, out, i);
				3261	}
				3262	xmlFree(name);
				3263	}
				3264	}
				3265
				3266	/**
				3267	* htmlParseContent:
				3268	* @ctxt: an HTML parser context
				3269	* @name: the node name
				3270	*
				3271	* Parse a content: comment, sub-element, reference or text.
				3272	*
				3273	*/
				3274
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3275	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3276	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3277	xmlChar *currentNode;
				3278	int depth;
				3279
				3280	currentNode = xmlStrdup(ctxt->name);
				3281	depth = ctxt->nameNr;
				3282	while (1) {
				3283	long cons = ctxt->nbChars;
				3284
				3285	GROW;
				3286	/*
				3287	* Our tag or one of it's parent or children is ending.
				3288	*/
				3289	if ((CUR == '<') && (NXT(1) == '/')) {
				3290	htmlParseEndTag(ctxt);
				3291	if (currentNode != NULL) xmlFree(currentNode);
				3292	return;
				3293	}
				3294
				3295	/*
				3296	* Has this node been popped out during parsing of
				3297	* the next element
				3298	*/
				3299	if ((!xmlStrEqual(currentNode, ctxt->name)) &&
				3300	(depth >= ctxt->nameNr)) {
				3301	if (currentNode != NULL) xmlFree(currentNode);
				3302	return;
				3303	}
				3304
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3305	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3306	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3307	/*
				3308	* Handle SCRIPT/STYLE separately
				3309	*/
				3310	htmlParseScript(ctxt);
				3311	} else {
				3312	/*
				3313	* Sometimes DOCTYPE arrives in the middle of the document
				3314	*/
				3315	if ((CUR == '<') && (NXT(1) == '!') &&
				3316	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3317	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3318	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3319	(UPP(8) == 'E')) {
				3320	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3321	ctxt->sax->error(ctxt->userData,
				3322	"Misplaced DOCTYPE declaration\n");
				3323	ctxt->wellFormed = 0;
				3324	htmlParseDocTypeDecl(ctxt);
				3325	}
				3326
				3327	/*
				3328	* First case : a comment
				3329	*/
				3330	if ((CUR == '<') && (NXT(1) == '!') &&
				3331	(NXT(2) == '-') && (NXT(3) == '-')) {
				3332	htmlParseComment(ctxt);
				3333	}
				3334
				3335	/*
				3336	* Second case : a sub-element.
				3337	*/
				3338	else if (CUR == '<') {
				3339	htmlParseElement(ctxt);
				3340	}
				3341
				3342	/*
				3343	* Third case : a reference. If if has not been resolved,
				3344	* parsing returns it's Name, create the node
				3345	*/
				3346	else if (CUR == '&') {
				3347	htmlParseReference(ctxt);
				3348	}
				3349
				3350	/*
				3351	* Fourth : end of the resource
				3352	*/
				3353	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3354	htmlAutoCloseOnEnd(ctxt);
				3355	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3356	}
				3357
				3358	/*
				3359	* Last case, text. Note that References are handled directly.
				3360	*/
				3361	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3362	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3363	}
				3364
				3365	if (cons == ctxt->nbChars) {
				3366	if (ctxt->node != NULL) {
				3367	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3368	ctxt->sax->error(ctxt->userData,
				3369	"detected an error in element content\n");
				3370	ctxt->wellFormed = 0;
				3371	}
				3372	break;
				3373	}
				3374	}
				3375	GROW;
				3376	}
				3377	if (currentNode != NULL) xmlFree(currentNode);
				3378	}
				3379
				3380	/**
				3381	* htmlParseElement:
				3382	* @ctxt: an HTML parser context
				3383	*
				3384	* parse an HTML element, this is highly recursive
				3385	*
				3386	* [39] element ::= EmptyElemTag \| STag content ETag
				3387	*
				3388	* [41] Attribute ::= Name Eq AttValue
				3389	*/
				3390
				3391	void
				3392	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3393	xmlChar *name;
				3394	xmlChar *currentNode = NULL;
				3395	htmlElemDescPtr info;
				3396	htmlParserNodeInfo node_info;
				3397	xmlChar *oldname;
				3398	int depth = ctxt->nameNr;
				3399
				3400	/* Capture start position */
				3401	if (ctxt->record_info) {
				3402	node_info.begin_pos = ctxt->input->consumed +
				3403	(CUR_PTR - ctxt->input->base);
				3404	node_info.begin_line = ctxt->input->line;
				3405	}
				3406
				3407	oldname = xmlStrdup(ctxt->name);
				3408	htmlParseStartTag(ctxt);
				3409	name = ctxt->name;
				3410	#ifdef DEBUG
				3411	if (oldname == NULL)
				3412	xmlGenericError(xmlGenericErrorContext,
				3413	"Start of element %s\n", name);
				3414	else if (name == NULL)
				3415	xmlGenericError(xmlGenericErrorContext,
				3416	"Start of element failed, was %s\n", oldname);
				3417	else
				3418	xmlGenericError(xmlGenericErrorContext,
				3419	"Start of element %s, was %s\n", name, oldname);
				3420	#endif
				3421	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3422	(name == NULL)) {
				3423	if (CUR == '>')
				3424	NEXT;
				3425	if (oldname != NULL)
				3426	xmlFree(oldname);
				3427	return;
				3428	}
				3429	if (oldname != NULL)
				3430	xmlFree(oldname);
				3431
				3432	/*
				3433	* Lookup the info for that element.
				3434	*/
				3435	info = htmlTagLookup(name);
				3436	if (info == NULL) {
				3437	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3438	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3439	name);
				3440	ctxt->wellFormed = 0;
				3441	} else if (info->depr) {
				3442	/***************************
				3443	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3444	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3445	name);
				3446	***************************/
				3447	}
				3448
				3449	/*
				3450	* Check for an Empty Element labelled the XML/SGML way
				3451	*/
				3452	if ((CUR == '/') && (NXT(1) == '>')) {
				3453	SKIP(2);
				3454	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3455	ctxt->sax->endElement(ctxt->userData, name);
				3456	oldname = htmlnamePop(ctxt);
				3457	#ifdef DEBUG
				3458	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3459	#endif
				3460	if (oldname != NULL)
				3461	xmlFree(oldname);
				3462	return;
				3463	}
				3464
				3465	if (CUR == '>') {
				3466	NEXT;
				3467	} else {
				3468	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3469	ctxt->sax->error(ctxt->userData,
				3470	"Couldn't find end of Start Tag %s\n",
				3471	name);
				3472	ctxt->wellFormed = 0;
				3473
				3474	/*
				3475	* end of parsing of this node.
				3476	*/
				3477	if (xmlStrEqual(name, ctxt->name)) {
				3478	nodePop(ctxt);
				3479	oldname = htmlnamePop(ctxt);
				3480	#ifdef DEBUG
				3481	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3482	#endif
				3483	if (oldname != NULL)
				3484	xmlFree(oldname);
				3485	}
				3486
				3487	/*
				3488	* Capture end position and add node
				3489	*/
				3490	if ( currentNode != NULL && ctxt->record_info ) {
				3491	node_info.end_pos = ctxt->input->consumed +
				3492	(CUR_PTR - ctxt->input->base);
				3493	node_info.end_line = ctxt->input->line;
				3494	node_info.node = ctxt->node;
				3495	xmlParserAddNodeInfo(ctxt, &node_info);
				3496	}
				3497	return;
				3498	}
				3499
				3500	/*
				3501	* Check for an Empty Element from DTD definition
				3502	*/
				3503	if ((info != NULL) && (info->empty)) {
				3504	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3505	ctxt->sax->endElement(ctxt->userData, name);
				3506	oldname = htmlnamePop(ctxt);
				3507	#ifdef DEBUG
				3508	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3509	#endif
				3510	if (oldname != NULL)
				3511	xmlFree(oldname);
				3512	return;
				3513	}
				3514
				3515	/*
				3516	* Parse the content of the element:
				3517	*/
				3518	currentNode = xmlStrdup(ctxt->name);
				3519	depth = ctxt->nameNr;
				3520	while (IS_CHAR(CUR)) {
				3521	htmlParseContent(ctxt);
				3522	if (ctxt->nameNr < depth) break;
				3523	}
				3524
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3525	/*
				3526	* Capture end position and add node
				3527	*/
				3528	if ( currentNode != NULL && ctxt->record_info ) {
				3529	node_info.end_pos = ctxt->input->consumed +
				3530	(CUR_PTR - ctxt->input->base);
				3531	node_info.end_line = ctxt->input->line;
				3532	node_info.node = ctxt->node;
				3533	xmlParserAddNodeInfo(ctxt, &node_info);
				3534	}
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3535	if (!IS_CHAR(CUR)) {
				3536	htmlAutoCloseOnEnd(ctxt);
				3537	}
				3538
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3539	if (currentNode != NULL)
				3540	xmlFree(currentNode);
				3541	}
				3542
				3543	/**
				3544	* htmlParseDocument :
				3545	* @ctxt: an HTML parser context
				3546	*
				3547	* parse an HTML document (and build a tree if using the standard SAX
				3548	* interface).
				3549	*
				3550	* Returns 0, -1 in case of error. the parser context is augmented
				3551	* as a result of the parsing.
				3552	*/
				3553
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3554	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3555	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3556	xmlDtdPtr dtd;
				3557
				3558	htmlDefaultSAXHandlerInit();
				3559	ctxt->html = 1;
				3560
				3561	GROW;
				3562	/*
				3563	* SAX: beginning of the document processing.
				3564	*/
				3565	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3566	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3567
				3568	/*
				3569	* Wipe out everything which is before the first '<'
				3570	*/
				3571	SKIP_BLANKS;
				3572	if (CUR == 0) {
				3573	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3574	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3575	ctxt->wellFormed = 0;
				3576	}
				3577
				3578	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3579	ctxt->sax->startDocument(ctxt->userData);
				3580
				3581
				3582	/*
				3583	* Parse possible comments before any content
				3584	*/
				3585	while ((CUR == '<') && (NXT(1) == '!') &&
				3586	(NXT(2) == '-') && (NXT(3) == '-')) {
				3587	htmlParseComment(ctxt);
				3588	SKIP_BLANKS;
				3589	}
				3590
				3591
				3592	/*
				3593	* Then possibly doc type declaration(s) and more Misc
				3594	* (doctypedecl Misc*)?
				3595	*/
				3596	if ((CUR == '<') && (NXT(1) == '!') &&
				3597	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3598	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3599	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3600	(UPP(8) == 'E')) {
				3601	htmlParseDocTypeDecl(ctxt);
				3602	}
				3603	SKIP_BLANKS;
				3604
				3605	/*
				3606	* Parse possible comments before any content
				3607	*/
				3608	while ((CUR == '<') && (NXT(1) == '!') &&
				3609	(NXT(2) == '-') && (NXT(3) == '-')) {
				3610	htmlParseComment(ctxt);
				3611	SKIP_BLANKS;
				3612	}
				3613
				3614	/*
				3615	* Time to start parsing the tree itself
				3616	*/
				3617	htmlParseContent(ctxt);
				3618
				3619	/*
				3620	* autoclose
				3621	*/
				3622	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3623	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3624
				3625
				3626	/*
				3627	* SAX: end of the document processing.
				3628	*/
				3629	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3630	ctxt->sax->endDocument(ctxt->userData);
				3631
				3632	if (ctxt->myDoc != NULL) {
				3633	dtd = xmlGetIntSubset(ctxt->myDoc);
				3634	if (dtd == NULL)
				3635	ctxt->myDoc->intSubset =
				3636	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3637	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3638	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3639	}
				3640	if (! ctxt->wellFormed) return(-1);
				3641	return(0);
				3642	}
				3643
				3644
				3645	/************************************************************************
				3646	* *
				3647	* Parser contexts handling *
				3648	* *
				3649	************************************************************************/
				3650
				3651	/**
				3652	* xmlInitParserCtxt:
				3653	* @ctxt: an HTML parser context
				3654	*
				3655	* Initialize a parser context
				3656	*/
				3657
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3658	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3659	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3660	{
				3661	htmlSAXHandler *sax;
				3662
				3663	if (ctxt == NULL) return;
				3664	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3665
				3666	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3667	if (sax == NULL) {
				3668	xmlGenericError(xmlGenericErrorContext,
				3669	"htmlInitParserCtxt: out of memory\n");
				3670	}
				3671	else
				3672	memset(sax, 0, sizeof(htmlSAXHandler));
				3673
				3674	/* Allocate the Input stack */
				3675	ctxt->inputTab = (htmlParserInputPtr *)
				3676	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3677	if (ctxt->inputTab == NULL) {
				3678	xmlGenericError(xmlGenericErrorContext,
				3679	"htmlInitParserCtxt: out of memory\n");
				3680	ctxt->inputNr = 0;
				3681	ctxt->inputMax = 0;
				3682	ctxt->input = NULL;
				3683	return;
				3684	}
				3685	ctxt->inputNr = 0;
				3686	ctxt->inputMax = 5;
				3687	ctxt->input = NULL;
				3688	ctxt->version = NULL;
				3689	ctxt->encoding = NULL;
				3690	ctxt->standalone = -1;
				3691	ctxt->instate = XML_PARSER_START;
				3692
				3693	/* Allocate the Node stack */
				3694	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3695	if (ctxt->nodeTab == NULL) {
				3696	xmlGenericError(xmlGenericErrorContext,
				3697	"htmlInitParserCtxt: out of memory\n");
				3698	ctxt->nodeNr = 0;
				3699	ctxt->nodeMax = 0;
				3700	ctxt->node = NULL;
				3701	ctxt->inputNr = 0;
				3702	ctxt->inputMax = 0;
				3703	ctxt->input = NULL;
				3704	return;
				3705	}
				3706	ctxt->nodeNr = 0;
				3707	ctxt->nodeMax = 10;
				3708	ctxt->node = NULL;
				3709
				3710	/* Allocate the Name stack */
				3711	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3712	if (ctxt->nameTab == NULL) {
				3713	xmlGenericError(xmlGenericErrorContext,
				3714	"htmlInitParserCtxt: out of memory\n");
				3715	ctxt->nameNr = 0;
				3716	ctxt->nameMax = 10;
				3717	ctxt->name = NULL;
				3718	ctxt->nodeNr = 0;
				3719	ctxt->nodeMax = 0;
				3720	ctxt->node = NULL;
				3721	ctxt->inputNr = 0;
				3722	ctxt->inputMax = 0;
				3723	ctxt->input = NULL;
				3724	return;
				3725	}
				3726	ctxt->nameNr = 0;
				3727	ctxt->nameMax = 10;
				3728	ctxt->name = NULL;
				3729
				3730	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3731	else {
				3732	ctxt->sax = sax;
				3733	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3734	}
				3735	ctxt->userData = ctxt;
				3736	ctxt->myDoc = NULL;
				3737	ctxt->wellFormed = 1;
				3738	ctxt->replaceEntities = 0;
				3739	ctxt->html = 1;
				3740	ctxt->record_info = 0;
				3741	ctxt->validate = 0;
				3742	ctxt->nbChars = 0;
				3743	ctxt->checkIndex = 0;
				3744	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3745	}
				3746
				3747	/**
				3748	* htmlFreeParserCtxt:
				3749	* @ctxt: an HTML parser context
				3750	*
				3751	* Free all the memory used by a parser context. However the parsed
				3752	* document in ctxt->myDoc is not freed.
				3753	*/
				3754
				3755	void
				3756	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3757	{
				3758	xmlFreeParserCtxt(ctxt);
				3759	}
				3760
				3761	/**
				3762	* htmlCreateDocParserCtxt :
				3763	* @cur: a pointer to an array of xmlChar
				3764	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3765	*
				3766	* Create a parser context for an HTML document.
				3767	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3768	* TODO: check the need to add encoding handling there
				3769	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3770	* Returns the new parser context or NULL
				3771	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3772	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3773	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3774	htmlParserCtxtPtr ctxt;
				3775	htmlParserInputPtr input;
				3776	/* htmlCharEncoding enc; */
				3777
				3778	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				3779	if (ctxt == NULL) {
				3780	perror("malloc");
				3781	return(NULL);
				3782	}
				3783	htmlInitParserCtxt(ctxt);
				3784	input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				3785	if (input == NULL) {
				3786	perror("malloc");
				3787	xmlFree(ctxt);
				3788	return(NULL);
				3789	}
				3790	memset(input, 0, sizeof(htmlParserInput));
				3791
				3792	input->line = 1;
				3793	input->col = 1;
				3794	input->base = cur;
				3795	input->cur = cur;
				3796
				3797	inputPush(ctxt, input);
				3798	return(ctxt);
				3799	}
				3800
				3801	/************************************************************************
				3802	* *
				3803	* Progressive parsing interfaces *
				3804	* *
				3805	************************************************************************/
				3806
				3807	/**
				3808	* htmlParseLookupSequence:
				3809	* @ctxt: an HTML parser context
				3810	* @first: the first char to lookup
				3811	* @next: the next char to lookup or zero
				3812	* @third: the next char to lookup or zero
				3813	*
				3814	* Try to find if a sequence (first, next, third) or just (first next) or
				3815	* (first) is available in the input stream.
				3816	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3817	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3818	* parser, do not use liberally.
				3819	* This is basically similar to xmlParseLookupSequence()
				3820	*
				3821	* Returns the index to the current parsing point if the full sequence
				3822	* is available, -1 otherwise.
				3823	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3824	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3825	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3826	xmlChar next, xmlChar third) {
				3827	int base, len;
				3828	htmlParserInputPtr in;
				3829	const xmlChar *buf;
				3830
				3831	in = ctxt->input;
				3832	if (in == NULL) return(-1);
				3833	base = in->cur - in->base;
				3834	if (base < 0) return(-1);
				3835	if (ctxt->checkIndex > base)
				3836	base = ctxt->checkIndex;
				3837	if (in->buf == NULL) {
				3838	buf = in->base;
				3839	len = in->length;
				3840	} else {
				3841	buf = in->buf->buffer->content;
				3842	len = in->buf->buffer->use;
				3843	}
				3844	/* take into account the sequence length */
				3845	if (third) len -= 2;
				3846	else if (next) len --;
				3847	for (;base < len;base++) {
				3848	if (buf[base] == first) {
				3849	if (third != 0) {
				3850	if ((buf[base + 1] != next) \|\|
				3851	(buf[base + 2] != third)) continue;
				3852	} else if (next != 0) {
				3853	if (buf[base + 1] != next) continue;
				3854	}
				3855	ctxt->checkIndex = 0;
				3856	#ifdef DEBUG_PUSH
				3857	if (next == 0)
				3858	xmlGenericError(xmlGenericErrorContext,
				3859	"HPP: lookup '%c' found at %d\n",
				3860	first, base);
				3861	else if (third == 0)
				3862	xmlGenericError(xmlGenericErrorContext,
				3863	"HPP: lookup '%c%c' found at %d\n",
				3864	first, next, base);
				3865	else
				3866	xmlGenericError(xmlGenericErrorContext,
				3867	"HPP: lookup '%c%c%c' found at %d\n",
				3868	first, next, third, base);
				3869	#endif
				3870	return(base - (in->cur - in->base));
				3871	}
				3872	}
				3873	ctxt->checkIndex = base;
				3874	#ifdef DEBUG_PUSH
				3875	if (next == 0)
				3876	xmlGenericError(xmlGenericErrorContext,
				3877	"HPP: lookup '%c' failed\n", first);
				3878	else if (third == 0)
				3879	xmlGenericError(xmlGenericErrorContext,
				3880	"HPP: lookup '%c%c' failed\n", first, next);
				3881	else
				3882	xmlGenericError(xmlGenericErrorContext,
				3883	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				3884	#endif
				3885	return(-1);
				3886	}
				3887
				3888	/**
				3889	* htmlParseTryOrFinish:
				3890	* @ctxt: an HTML parser context
				3891	* @terminate: last chunk indicator
				3892	*
				3893	* Try to progress on parsing
				3894	*
				3895	* Returns zero if no parsing was possible
				3896	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3897	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3898	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				3899	int ret = 0;
				3900	htmlParserInputPtr in;
				3901	int avail = 0;
				3902	xmlChar cur, next;
				3903
				3904	#ifdef DEBUG_PUSH
				3905	switch (ctxt->instate) {
				3906	case XML_PARSER_EOF:
				3907	xmlGenericError(xmlGenericErrorContext,
				3908	"HPP: try EOF\n"); break;
				3909	case XML_PARSER_START:
				3910	xmlGenericError(xmlGenericErrorContext,
				3911	"HPP: try START\n"); break;
				3912	case XML_PARSER_MISC:
				3913	xmlGenericError(xmlGenericErrorContext,
				3914	"HPP: try MISC\n");break;
				3915	case XML_PARSER_COMMENT:
				3916	xmlGenericError(xmlGenericErrorContext,
				3917	"HPP: try COMMENT\n");break;
				3918	case XML_PARSER_PROLOG:
				3919	xmlGenericError(xmlGenericErrorContext,
				3920	"HPP: try PROLOG\n");break;
				3921	case XML_PARSER_START_TAG:
				3922	xmlGenericError(xmlGenericErrorContext,
				3923	"HPP: try START_TAG\n");break;
				3924	case XML_PARSER_CONTENT:
				3925	xmlGenericError(xmlGenericErrorContext,
				3926	"HPP: try CONTENT\n");break;
				3927	case XML_PARSER_CDATA_SECTION:
				3928	xmlGenericError(xmlGenericErrorContext,
				3929	"HPP: try CDATA_SECTION\n");break;
				3930	case XML_PARSER_END_TAG:
				3931	xmlGenericError(xmlGenericErrorContext,
				3932	"HPP: try END_TAG\n");break;
				3933	case XML_PARSER_ENTITY_DECL:
				3934	xmlGenericError(xmlGenericErrorContext,
				3935	"HPP: try ENTITY_DECL\n");break;
				3936	case XML_PARSER_ENTITY_VALUE:
				3937	xmlGenericError(xmlGenericErrorContext,
				3938	"HPP: try ENTITY_VALUE\n");break;
				3939	case XML_PARSER_ATTRIBUTE_VALUE:
				3940	xmlGenericError(xmlGenericErrorContext,
				3941	"HPP: try ATTRIBUTE_VALUE\n");break;
				3942	case XML_PARSER_DTD:
				3943	xmlGenericError(xmlGenericErrorContext,
				3944	"HPP: try DTD\n");break;
				3945	case XML_PARSER_EPILOG:
				3946	xmlGenericError(xmlGenericErrorContext,
				3947	"HPP: try EPILOG\n");break;
				3948	case XML_PARSER_PI:
				3949	xmlGenericError(xmlGenericErrorContext,
				3950	"HPP: try PI\n");break;
				3951	case XML_PARSER_SYSTEM_LITERAL:
				3952	xmlGenericError(xmlGenericErrorContext,
				3953	"HPP: try SYSTEM_LITERAL\n");break;
				3954	}
				3955	#endif
				3956
				3957	while (1) {
				3958
				3959	in = ctxt->input;
				3960	if (in == NULL) break;
				3961	if (in->buf == NULL)
				3962	avail = in->length - (in->cur - in->base);
				3963	else
				3964	avail = in->buf->buffer->use - (in->cur - in->base);
				3965	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3966	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3967	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				3968	/*
				3969	* SAX: end of the document processing.
				3970	*/
				3971	ctxt->instate = XML_PARSER_EOF;
				3972	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3973	ctxt->sax->endDocument(ctxt->userData);
				3974	}
				3975	}
				3976	if (avail < 1)
				3977	goto done;
				3978	switch (ctxt->instate) {
				3979	case XML_PARSER_EOF:
				3980	/*
				3981	* Document parsing is done !
				3982	*/
				3983	goto done;
				3984	case XML_PARSER_START:
				3985	/*
				3986	* Very first chars read from the document flow.
				3987	*/
				3988	cur = in->cur[0];
				3989	if (IS_BLANK(cur)) {
				3990	SKIP_BLANKS;
				3991	if (in->buf == NULL)
				3992	avail = in->length - (in->cur - in->base);
				3993	else
				3994	avail = in->buf->buffer->use - (in->cur - in->base);
				3995	}
				3996	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3997	ctxt->sax->setDocumentLocator(ctxt->userData,
				3998	&xmlDefaultSAXLocator);
				3999	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				4000	(!ctxt->disableSAX))
				4001	ctxt->sax->startDocument(ctxt->userData);
				4002
				4003	cur = in->cur[0];
				4004	next = in->cur[1];
				4005	if ((cur == '<') && (next == '!') &&
				4006	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4007	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4008	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4009	(UPP(8) == 'E')) {
				4010	if ((!terminate) &&
				4011	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4012	goto done;
				4013	#ifdef DEBUG_PUSH
				4014	xmlGenericError(xmlGenericErrorContext,
				4015	"HPP: Parsing internal subset\n");
				4016	#endif
				4017	htmlParseDocTypeDecl(ctxt);
				4018	ctxt->instate = XML_PARSER_PROLOG;
				4019	#ifdef DEBUG_PUSH
				4020	xmlGenericError(xmlGenericErrorContext,
				4021	"HPP: entering PROLOG\n");
				4022	#endif
				4023	} else {
				4024	ctxt->instate = XML_PARSER_MISC;
				4025	}
				4026	#ifdef DEBUG_PUSH
				4027	xmlGenericError(xmlGenericErrorContext,
				4028	"HPP: entering MISC\n");
				4029	#endif
				4030	break;
				4031	case XML_PARSER_MISC:
				4032	SKIP_BLANKS;
				4033	if (in->buf == NULL)
				4034	avail = in->length - (in->cur - in->base);
				4035	else
				4036	avail = in->buf->buffer->use - (in->cur - in->base);
				4037	if (avail < 2)
				4038	goto done;
				4039	cur = in->cur[0];
				4040	next = in->cur[1];
				4041	if ((cur == '<') && (next == '!') &&
				4042	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4043	if ((!terminate) &&
				4044	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4045	goto done;
				4046	#ifdef DEBUG_PUSH
				4047	xmlGenericError(xmlGenericErrorContext,
				4048	"HPP: Parsing Comment\n");
				4049	#endif
				4050	htmlParseComment(ctxt);
				4051	ctxt->instate = XML_PARSER_MISC;
				4052	} else if ((cur == '<') && (next == '!') &&
				4053	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4054	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4055	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4056	(UPP(8) == 'E')) {
				4057	if ((!terminate) &&
				4058	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4059	goto done;
				4060	#ifdef DEBUG_PUSH
				4061	xmlGenericError(xmlGenericErrorContext,
				4062	"HPP: Parsing internal subset\n");
				4063	#endif
				4064	htmlParseDocTypeDecl(ctxt);
				4065	ctxt->instate = XML_PARSER_PROLOG;
				4066	#ifdef DEBUG_PUSH
				4067	xmlGenericError(xmlGenericErrorContext,
				4068	"HPP: entering PROLOG\n");
				4069	#endif
				4070	} else if ((cur == '<') && (next == '!') &&
				4071	(avail < 9)) {
				4072	goto done;
				4073	} else {
				4074	ctxt->instate = XML_PARSER_START_TAG;
				4075	#ifdef DEBUG_PUSH
				4076	xmlGenericError(xmlGenericErrorContext,
				4077	"HPP: entering START_TAG\n");
				4078	#endif
				4079	}
				4080	break;
				4081	case XML_PARSER_PROLOG:
				4082	SKIP_BLANKS;
				4083	if (in->buf == NULL)
				4084	avail = in->length - (in->cur - in->base);
				4085	else
				4086	avail = in->buf->buffer->use - (in->cur - in->base);
				4087	if (avail < 2)
				4088	goto done;
				4089	cur = in->cur[0];
				4090	next = in->cur[1];
				4091	if ((cur == '<') && (next == '!') &&
				4092	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4093	if ((!terminate) &&
				4094	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4095	goto done;
				4096	#ifdef DEBUG_PUSH
				4097	xmlGenericError(xmlGenericErrorContext,
				4098	"HPP: Parsing Comment\n");
				4099	#endif
				4100	htmlParseComment(ctxt);
				4101	ctxt->instate = XML_PARSER_PROLOG;
				4102	} else if ((cur == '<') && (next == '!') &&
				4103	(avail < 4)) {
				4104	goto done;
				4105	} else {
				4106	ctxt->instate = XML_PARSER_START_TAG;
				4107	#ifdef DEBUG_PUSH
				4108	xmlGenericError(xmlGenericErrorContext,
				4109	"HPP: entering START_TAG\n");
				4110	#endif
				4111	}
				4112	break;
				4113	case XML_PARSER_EPILOG:
				4114	if (in->buf == NULL)
				4115	avail = in->length - (in->cur - in->base);
				4116	else
				4117	avail = in->buf->buffer->use - (in->cur - in->base);
				4118	if (avail < 1)
				4119	goto done;
				4120	cur = in->cur[0];
				4121	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4122	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4123	goto done;
				4124	}
				4125	if (avail < 2)
				4126	goto done;
				4127	next = in->cur[1];
				4128	if ((cur == '<') && (next == '!') &&
				4129	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4130	if ((!terminate) &&
				4131	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4132	goto done;
				4133	#ifdef DEBUG_PUSH
				4134	xmlGenericError(xmlGenericErrorContext,
				4135	"HPP: Parsing Comment\n");
				4136	#endif
				4137	htmlParseComment(ctxt);
				4138	ctxt->instate = XML_PARSER_EPILOG;
				4139	} else if ((cur == '<') && (next == '!') &&
				4140	(avail < 4)) {
				4141	goto done;
				4142	} else {
				4143	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4144	ctxt->wellFormed = 0;
				4145	ctxt->instate = XML_PARSER_EOF;
				4146	#ifdef DEBUG_PUSH
				4147	xmlGenericError(xmlGenericErrorContext,
				4148	"HPP: entering EOF\n");
				4149	#endif
				4150	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4151	ctxt->sax->endDocument(ctxt->userData);
				4152	goto done;
				4153	}
				4154	break;
				4155	case XML_PARSER_START_TAG: {
				4156	xmlChar name, oldname;
				4157	int depth = ctxt->nameNr;
				4158	htmlElemDescPtr info;
				4159
				4160	if (avail < 2)
				4161	goto done;
				4162	cur = in->cur[0];
				4163	if (cur != '<') {
				4164	ctxt->instate = XML_PARSER_CONTENT;
				4165	#ifdef DEBUG_PUSH
				4166	xmlGenericError(xmlGenericErrorContext,
				4167	"HPP: entering CONTENT\n");
				4168	#endif
				4169	break;
				4170	}
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	4171	if (in->cur[1] == '/') {
				4172	ctxt->instate = XML_PARSER_END_TAG;
				4173	ctxt->checkIndex = 0;
				4174	#ifdef DEBUG_PUSH
				4175	xmlGenericError(xmlGenericErrorContext,
				4176	"HPP: entering END_TAG\n");
				4177	#endif
				4178	break;
				4179	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4180	if ((!terminate) &&
				4181	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4182	goto done;
				4183
				4184	oldname = xmlStrdup(ctxt->name);
				4185	htmlParseStartTag(ctxt);
				4186	name = ctxt->name;
				4187	#ifdef DEBUG
				4188	if (oldname == NULL)
				4189	xmlGenericError(xmlGenericErrorContext,
				4190	"Start of element %s\n", name);
				4191	else if (name == NULL)
				4192	xmlGenericError(xmlGenericErrorContext,
				4193	"Start of element failed, was %s\n",
				4194	oldname);
				4195	else
				4196	xmlGenericError(xmlGenericErrorContext,
				4197	"Start of element %s, was %s\n",
				4198	name, oldname);
				4199	#endif
				4200	if (((depth == ctxt->nameNr) &&
				4201	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4202	(name == NULL)) {
				4203	if (CUR == '>')
				4204	NEXT;
				4205	if (oldname != NULL)
				4206	xmlFree(oldname);
				4207	break;
				4208	}
				4209	if (oldname != NULL)
				4210	xmlFree(oldname);
				4211
				4212	/*
				4213	* Lookup the info for that element.
				4214	*/
				4215	info = htmlTagLookup(name);
				4216	if (info == NULL) {
				4217	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4218	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4219	name);
				4220	ctxt->wellFormed = 0;
				4221	} else if (info->depr) {
				4222	/***************************
				4223	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4224	ctxt->sax->warning(ctxt->userData,
				4225	"Tag %s is deprecated\n",
				4226	name);
				4227	***************************/
				4228	}
				4229
				4230	/*
				4231	* Check for an Empty Element labelled the XML/SGML way
				4232	*/
				4233	if ((CUR == '/') && (NXT(1) == '>')) {
				4234	SKIP(2);
				4235	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4236	ctxt->sax->endElement(ctxt->userData, name);
				4237	oldname = htmlnamePop(ctxt);
				4238	#ifdef DEBUG
				4239	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4240	oldname);
				4241	#endif
				4242	if (oldname != NULL)
				4243	xmlFree(oldname);
				4244	ctxt->instate = XML_PARSER_CONTENT;
				4245	#ifdef DEBUG_PUSH
				4246	xmlGenericError(xmlGenericErrorContext,
				4247	"HPP: entering CONTENT\n");
				4248	#endif
				4249	break;
				4250	}
				4251
				4252	if (CUR == '>') {
				4253	NEXT;
				4254	} else {
				4255	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4256	ctxt->sax->error(ctxt->userData,
				4257	"Couldn't find end of Start Tag %s\n",
				4258	name);
				4259	ctxt->wellFormed = 0;
				4260
				4261	/*
				4262	* end of parsing of this node.
				4263	*/
				4264	if (xmlStrEqual(name, ctxt->name)) {
				4265	nodePop(ctxt);
				4266	oldname = htmlnamePop(ctxt);
				4267	#ifdef DEBUG
				4268	xmlGenericError(xmlGenericErrorContext,
				4269	"End of start tag problem: popping out %s\n", oldname);
				4270	#endif
				4271	if (oldname != NULL)
				4272	xmlFree(oldname);
				4273	}
				4274
				4275	ctxt->instate = XML_PARSER_CONTENT;
				4276	#ifdef DEBUG_PUSH
				4277	xmlGenericError(xmlGenericErrorContext,
				4278	"HPP: entering CONTENT\n");
				4279	#endif
				4280	break;
				4281	}
				4282
				4283	/*
				4284	* Check for an Empty Element from DTD definition
				4285	*/
				4286	if ((info != NULL) && (info->empty)) {
				4287	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4288	ctxt->sax->endElement(ctxt->userData, name);
				4289	oldname = htmlnamePop(ctxt);
				4290	#ifdef DEBUG
				4291	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4292	#endif
				4293	if (oldname != NULL)
				4294	xmlFree(oldname);
				4295	}
				4296	ctxt->instate = XML_PARSER_CONTENT;
				4297	#ifdef DEBUG_PUSH
				4298	xmlGenericError(xmlGenericErrorContext,
				4299	"HPP: entering CONTENT\n");
				4300	#endif
				4301	break;
				4302	}
				4303	case XML_PARSER_CONTENT: {
				4304	long cons;
				4305	/*
				4306	* Handle preparsed entities and charRef
				4307	*/
				4308	if (ctxt->token != 0) {
				4309	xmlChar chr[2] = { 0 , 0 } ;
				4310
				4311	chr[0] = (xmlChar) ctxt->token;
				4312	htmlCheckParagraph(ctxt);
				4313	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4314	ctxt->sax->characters(ctxt->userData, chr, 1);
				4315	ctxt->token = 0;
				4316	ctxt->checkIndex = 0;
				4317	}
				4318	if ((avail == 1) && (terminate)) {
				4319	cur = in->cur[0];
				4320	if ((cur != '<') && (cur != '&')) {
				4321	if (ctxt->sax != NULL) {
				4322	if (IS_BLANK(cur)) {
				4323	if (ctxt->sax->ignorableWhitespace != NULL)
				4324	ctxt->sax->ignorableWhitespace(
				4325	ctxt->userData, &cur, 1);
				4326	} else {
				4327	htmlCheckParagraph(ctxt);
				4328	if (ctxt->sax->characters != NULL)
				4329	ctxt->sax->characters(
				4330	ctxt->userData, &cur, 1);
				4331	}
				4332	}
				4333	ctxt->token = 0;
				4334	ctxt->checkIndex = 0;
				4335	NEXT;
				4336	}
				4337	break;
				4338	}
				4339	if (avail < 2)
				4340	goto done;
				4341	cur = in->cur[0];
				4342	next = in->cur[1];
				4343	cons = ctxt->nbChars;
				4344	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4345	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4346	/*
				4347	* Handle SCRIPT/STYLE separately
				4348	*/
				4349	if ((!terminate) &&
				4350	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4351	goto done;
				4352	htmlParseScript(ctxt);
				4353	if ((cur == '<') && (next == '/')) {
				4354	ctxt->instate = XML_PARSER_END_TAG;
				4355	ctxt->checkIndex = 0;
				4356	#ifdef DEBUG_PUSH
				4357	xmlGenericError(xmlGenericErrorContext,
				4358	"HPP: entering END_TAG\n");
				4359	#endif
				4360	break;
				4361	}
				4362	} else {
				4363	/*
				4364	* Sometimes DOCTYPE arrives in the middle of the document
				4365	*/
				4366	if ((cur == '<') && (next == '!') &&
				4367	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4368	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4369	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4370	(UPP(8) == 'E')) {
				4371	if ((!terminate) &&
				4372	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4373	goto done;
				4374	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4375	ctxt->sax->error(ctxt->userData,
				4376	"Misplaced DOCTYPE declaration\n");
				4377	ctxt->wellFormed = 0;
				4378	htmlParseDocTypeDecl(ctxt);
				4379	} else if ((cur == '<') && (next == '!') &&
				4380	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4381	if ((!terminate) &&
				4382	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4383	goto done;
				4384	#ifdef DEBUG_PUSH
				4385	xmlGenericError(xmlGenericErrorContext,
				4386	"HPP: Parsing Comment\n");
				4387	#endif
				4388	htmlParseComment(ctxt);
				4389	ctxt->instate = XML_PARSER_CONTENT;
				4390	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4391	goto done;
				4392	} else if ((cur == '<') && (next == '/')) {
				4393	ctxt->instate = XML_PARSER_END_TAG;
				4394	ctxt->checkIndex = 0;
				4395	#ifdef DEBUG_PUSH
				4396	xmlGenericError(xmlGenericErrorContext,
				4397	"HPP: entering END_TAG\n");
				4398	#endif
				4399	break;
				4400	} else if (cur == '<') {
				4401	ctxt->instate = XML_PARSER_START_TAG;
				4402	ctxt->checkIndex = 0;
				4403	#ifdef DEBUG_PUSH
				4404	xmlGenericError(xmlGenericErrorContext,
				4405	"HPP: entering START_TAG\n");
				4406	#endif
				4407	break;
				4408	} else if (cur == '&') {
				4409	if ((!terminate) &&
				4410	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4411	goto done;
				4412	#ifdef DEBUG_PUSH
				4413	xmlGenericError(xmlGenericErrorContext,
				4414	"HPP: Parsing Reference\n");
				4415	#endif
				4416	/* TODO: check generation of subtrees if noent !!! */
				4417	htmlParseReference(ctxt);
				4418	} else {
				4419	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4420	/*
				4421	* Goal of the following test is :
				4422	* - minimize calls to the SAX 'character' callback
				4423	* when they are mergeable
				4424	*/
				4425	if ((ctxt->inputNr == 1) &&
				4426	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4427	if ((!terminate) &&
				4428	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4429	goto done;
				4430	}
				4431	ctxt->checkIndex = 0;
				4432	#ifdef DEBUG_PUSH
				4433	xmlGenericError(xmlGenericErrorContext,
				4434	"HPP: Parsing char data\n");
				4435	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4436	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4437	}
				4438	}
				4439	if (cons == ctxt->nbChars) {
				4440	if (ctxt->node != NULL) {
				4441	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4442	ctxt->sax->error(ctxt->userData,
				4443	"detected an error in element content\n");
				4444	ctxt->wellFormed = 0;
				4445	}
				4446	NEXT;
				4447	break;
				4448	}
				4449
				4450	break;
				4451	}
				4452	case XML_PARSER_END_TAG:
				4453	if (avail < 2)
				4454	goto done;
				4455	if ((!terminate) &&
				4456	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4457	goto done;
				4458	htmlParseEndTag(ctxt);
				4459	if (ctxt->nameNr == 0) {
				4460	ctxt->instate = XML_PARSER_EPILOG;
				4461	} else {
				4462	ctxt->instate = XML_PARSER_CONTENT;
				4463	}
				4464	ctxt->checkIndex = 0;
				4465	#ifdef DEBUG_PUSH
				4466	xmlGenericError(xmlGenericErrorContext,
				4467	"HPP: entering CONTENT\n");
				4468	#endif
				4469	break;
				4470	case XML_PARSER_CDATA_SECTION:
				4471	xmlGenericError(xmlGenericErrorContext,
				4472	"HPP: internal error, state == CDATA\n");
				4473	ctxt->instate = XML_PARSER_CONTENT;
				4474	ctxt->checkIndex = 0;
				4475	#ifdef DEBUG_PUSH
				4476	xmlGenericError(xmlGenericErrorContext,
				4477	"HPP: entering CONTENT\n");
				4478	#endif
				4479	break;
				4480	case XML_PARSER_DTD:
				4481	xmlGenericError(xmlGenericErrorContext,
				4482	"HPP: internal error, state == DTD\n");
				4483	ctxt->instate = XML_PARSER_CONTENT;
				4484	ctxt->checkIndex = 0;
				4485	#ifdef DEBUG_PUSH
				4486	xmlGenericError(xmlGenericErrorContext,
				4487	"HPP: entering CONTENT\n");
				4488	#endif
				4489	break;
				4490	case XML_PARSER_COMMENT:
				4491	xmlGenericError(xmlGenericErrorContext,
				4492	"HPP: internal error, state == COMMENT\n");
				4493	ctxt->instate = XML_PARSER_CONTENT;
				4494	ctxt->checkIndex = 0;
				4495	#ifdef DEBUG_PUSH
				4496	xmlGenericError(xmlGenericErrorContext,
				4497	"HPP: entering CONTENT\n");
				4498	#endif
				4499	break;
				4500	case XML_PARSER_PI:
				4501	xmlGenericError(xmlGenericErrorContext,
				4502	"HPP: internal error, state == PI\n");
				4503	ctxt->instate = XML_PARSER_CONTENT;
				4504	ctxt->checkIndex = 0;
				4505	#ifdef DEBUG_PUSH
				4506	xmlGenericError(xmlGenericErrorContext,
				4507	"HPP: entering CONTENT\n");
				4508	#endif
				4509	break;
				4510	case XML_PARSER_ENTITY_DECL:
				4511	xmlGenericError(xmlGenericErrorContext,
				4512	"HPP: internal error, state == ENTITY_DECL\n");
				4513	ctxt->instate = XML_PARSER_CONTENT;
				4514	ctxt->checkIndex = 0;
				4515	#ifdef DEBUG_PUSH
				4516	xmlGenericError(xmlGenericErrorContext,
				4517	"HPP: entering CONTENT\n");
				4518	#endif
				4519	break;
				4520	case XML_PARSER_ENTITY_VALUE:
				4521	xmlGenericError(xmlGenericErrorContext,
				4522	"HPP: internal error, state == ENTITY_VALUE\n");
				4523	ctxt->instate = XML_PARSER_CONTENT;
				4524	ctxt->checkIndex = 0;
				4525	#ifdef DEBUG_PUSH
				4526	xmlGenericError(xmlGenericErrorContext,
				4527	"HPP: entering DTD\n");
				4528	#endif
				4529	break;
				4530	case XML_PARSER_ATTRIBUTE_VALUE:
				4531	xmlGenericError(xmlGenericErrorContext,
				4532	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4533	ctxt->instate = XML_PARSER_START_TAG;
				4534	ctxt->checkIndex = 0;
				4535	#ifdef DEBUG_PUSH
				4536	xmlGenericError(xmlGenericErrorContext,
				4537	"HPP: entering START_TAG\n");
				4538	#endif
				4539	break;
				4540	case XML_PARSER_SYSTEM_LITERAL:
				4541	xmlGenericError(xmlGenericErrorContext,
				4542	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4543	ctxt->instate = XML_PARSER_CONTENT;
				4544	ctxt->checkIndex = 0;
				4545	#ifdef DEBUG_PUSH
				4546	xmlGenericError(xmlGenericErrorContext,
				4547	"HPP: entering CONTENT\n");
				4548	#endif
				4549	break;
				4550	case XML_PARSER_IGNORE:
				4551	xmlGenericError(xmlGenericErrorContext,
				4552	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4553	ctxt->instate = XML_PARSER_CONTENT;
				4554	ctxt->checkIndex = 0;
				4555	#ifdef DEBUG_PUSH
				4556	xmlGenericError(xmlGenericErrorContext,
				4557	"HPP: entering CONTENT\n");
				4558	#endif
				4559	break;
				4560	}
				4561	}
				4562	done:
				4563	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4564	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4565	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4566	/*
				4567	* SAX: end of the document processing.
				4568	*/
				4569	ctxt->instate = XML_PARSER_EOF;
				4570	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4571	ctxt->sax->endDocument(ctxt->userData);
				4572	}
				4573	}
				4574	if ((ctxt->myDoc != NULL) &&
				4575	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4576	(ctxt->instate == XML_PARSER_EPILOG))) {
				4577	xmlDtdPtr dtd;
				4578	dtd = xmlGetIntSubset(ctxt->myDoc);
				4579	if (dtd == NULL)
				4580	ctxt->myDoc->intSubset =
				4581	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4582	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4583	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4584	}
				4585	#ifdef DEBUG_PUSH
				4586	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4587	#endif
				4588	return(ret);
				4589	}
				4590
				4591	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4592	* htmlParseChunk:
				4593	* @ctxt: an XML parser context
				4594	* @chunk: an char array
				4595	* @size: the size in byte of the chunk
				4596	* @terminate: last chunk indicator
				4597	*
				4598	* Parse a Chunk of memory
				4599	*
				4600	* Returns zero if no error, the xmlParserErrors otherwise.
				4601	*/
				4602	int
				4603	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4604	int terminate) {
				4605	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4606	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4607	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4608	int cur = ctxt->input->cur - ctxt->input->base;
				4609
				4610	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4611	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4612	ctxt->input->cur = ctxt->input->base + cur;
				4613	#ifdef DEBUG_PUSH
				4614	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4615	#endif
				4616
				4617	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4618	htmlParseTryOrFinish(ctxt, terminate);
				4619	} else if (ctxt->instate != XML_PARSER_EOF) {
				4620	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4621	htmlParseTryOrFinish(ctxt, terminate);
				4622	}
				4623	if (terminate) {
				4624	if ((ctxt->instate != XML_PARSER_EOF) &&
				4625	(ctxt->instate != XML_PARSER_EPILOG) &&
				4626	(ctxt->instate != XML_PARSER_MISC)) {
				4627	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4628	ctxt->wellFormed = 0;
				4629	}
				4630	if (ctxt->instate != XML_PARSER_EOF) {
				4631	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4632	ctxt->sax->endDocument(ctxt->userData);
				4633	}
				4634	ctxt->instate = XML_PARSER_EOF;
				4635	}
				4636	return((xmlParserErrors) ctxt->errNo);
				4637	}
				4638
				4639	/************************************************************************
				4640	* *
				4641	* User entry points *
				4642	* *
				4643	************************************************************************/
				4644
				4645	/**
				4646	* htmlCreatePushParserCtxt :
				4647	* @sax: a SAX handler
				4648	* @user_data: The user data returned on SAX callbacks
				4649	* @chunk: a pointer to an array of chars
				4650	* @size: number of chars in the array
				4651	* @filename: an optional file name or URI
				4652	* @enc: an optional encoding
				4653	*
				4654	* Create a parser context for using the HTML parser in push mode
				4655	* To allow content encoding detection, @size should be >= 4
				4656	* The value of @filename is used for fetching external entities
				4657	* and error/warning reports.
				4658	*
				4659	* Returns the new parser context or NULL
				4660	*/
				4661	htmlParserCtxtPtr
				4662	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4663	const char chunk, int size, const char filename,
				4664	xmlCharEncoding enc) {
				4665	htmlParserCtxtPtr ctxt;
				4666	htmlParserInputPtr inputStream;
				4667	xmlParserInputBufferPtr buf;
				4668
				4669	buf = xmlAllocParserInputBuffer(enc);
				4670	if (buf == NULL) return(NULL);
				4671
				4672	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4673	if (ctxt == NULL) {
				4674	xmlFree(buf);
				4675	return(NULL);
				4676	}
				4677	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4678	htmlInitParserCtxt(ctxt);
				4679	if (sax != NULL) {
				4680	if (ctxt->sax != &htmlDefaultSAXHandler)
				4681	xmlFree(ctxt->sax);
				4682	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4683	if (ctxt->sax == NULL) {
				4684	xmlFree(buf);
				4685	xmlFree(ctxt);
				4686	return(NULL);
				4687	}
				4688	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4689	if (user_data != NULL)
				4690	ctxt->userData = user_data;
				4691	}
				4692	if (filename == NULL) {
				4693	ctxt->directory = NULL;
				4694	} else {
				4695	ctxt->directory = xmlParserGetDirectory(filename);
				4696	}
				4697
				4698	inputStream = htmlNewInputStream(ctxt);
				4699	if (inputStream == NULL) {
				4700	xmlFreeParserCtxt(ctxt);
				4701	return(NULL);
				4702	}
				4703
				4704	if (filename == NULL)
				4705	inputStream->filename = NULL;
				4706	else
				4707	inputStream->filename = xmlMemStrdup(filename);
				4708	inputStream->buf = buf;
				4709	inputStream->base = inputStream->buf->buffer->content;
				4710	inputStream->cur = inputStream->buf->buffer->content;
				4711
				4712	inputPush(ctxt, inputStream);
				4713
				4714	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4715	(ctxt->input->buf != NULL)) {
				4716	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4717	#ifdef DEBUG_PUSH
				4718	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4719	#endif
				4720	}
				4721
				4722	return(ctxt);
				4723	}
				4724
				4725	/**
				4726	* htmlSAXParseDoc :
				4727	* @cur: a pointer to an array of xmlChar
				4728	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4729	* @sax: the SAX handler block
				4730	* @userData: if using SAX, this pointer will be provided on callbacks.
				4731	*
				4732	* parse an HTML in-memory document and build a tree.
				4733	* It use the given SAX function block to handle the parsing callback.
				4734	* If sax is NULL, fallback to the default DOM tree building routines.
				4735	*
				4736	* Returns the resulting document tree
				4737	*/
				4738
				4739	htmlDocPtr
				4740	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4741	htmlDocPtr ret;
				4742	htmlParserCtxtPtr ctxt;
				4743
				4744	if (cur == NULL) return(NULL);
				4745
				4746
				4747	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4748	if (ctxt == NULL) return(NULL);
				4749	if (sax != NULL) {
				4750	ctxt->sax = sax;
				4751	ctxt->userData = userData;
				4752	}
				4753
				4754	htmlParseDocument(ctxt);
				4755	ret = ctxt->myDoc;
				4756	if (sax != NULL) {
				4757	ctxt->sax = NULL;
				4758	ctxt->userData = NULL;
				4759	}
				4760	htmlFreeParserCtxt(ctxt);
				4761
				4762	return(ret);
				4763	}
				4764
				4765	/**
				4766	* htmlParseDoc :
				4767	* @cur: a pointer to an array of xmlChar
				4768	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4769	*
				4770	* parse an HTML in-memory document and build a tree.
				4771	*
				4772	* Returns the resulting document tree
				4773	*/
				4774
				4775	htmlDocPtr
				4776	htmlParseDoc(xmlChar cur, const char encoding) {
				4777	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4778	}
				4779
				4780
				4781	/**
				4782	* htmlCreateFileParserCtxt :
				4783	* @filename: the filename
				4784	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4785	*
				4786	* Create a parser context for a file content.
				4787	* Automatic support for ZLIB/Compress compressed document is provided
				4788	* by default if found at compile-time.
				4789	*
				4790	* Returns the new parser context or NULL
				4791	*/
				4792	htmlParserCtxtPtr
				4793	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4794	{
				4795	htmlParserCtxtPtr ctxt;
				4796	htmlParserInputPtr inputStream;
				4797	xmlParserInputBufferPtr buf;
				4798	/* htmlCharEncoding enc; */
				4799	xmlChar content, content_line = (xmlChar *) "charset=";
				4800
				4801	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4802	if (buf == NULL) return(NULL);
				4803
				4804	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4805	if (ctxt == NULL) {
				4806	perror("malloc");
				4807	return(NULL);
				4808	}
				4809	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4810	htmlInitParserCtxt(ctxt);
				4811	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4812	if (inputStream == NULL) {
				4813	perror("malloc");
				4814	xmlFree(ctxt);
				4815	return(NULL);
				4816	}
				4817	memset(inputStream, 0, sizeof(htmlParserInput));
				4818
				4819	inputStream->filename = xmlMemStrdup(filename);
				4820	inputStream->line = 1;
				4821	inputStream->col = 1;
				4822	inputStream->buf = buf;
				4823	inputStream->directory = NULL;
				4824
				4825	inputStream->base = inputStream->buf->buffer->content;
				4826	inputStream->cur = inputStream->buf->buffer->content;
				4827	inputStream->free = NULL;
				4828
				4829	inputPush(ctxt, inputStream);
				4830
				4831	/* set encoding */
				4832	if (encoding) {
				4833	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4834	if (content) {
				4835	strcpy ((char )content, (char )content_line);
				4836	strcat ((char )content, (char )encoding);
				4837	htmlCheckEncoding (ctxt, content);
				4838	xmlFree (content);
				4839	}
				4840	}
				4841
				4842	return(ctxt);
				4843	}
				4844
				4845	/**
				4846	* htmlSAXParseFile :
				4847	* @filename: the filename
				4848	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4849	* @sax: the SAX handler block
				4850	* @userData: if using SAX, this pointer will be provided on callbacks.
				4851	*
				4852	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4853	* compressed document is provided by default if found at compile-time.
				4854	* It use the given SAX function block to handle the parsing callback.
				4855	* If sax is NULL, fallback to the default DOM tree building routines.
				4856	*
				4857	* Returns the resulting document tree
				4858	*/
				4859
				4860	htmlDocPtr
				4861	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				4862	void *userData) {
				4863	htmlDocPtr ret;
				4864	htmlParserCtxtPtr ctxt;
				4865	htmlSAXHandlerPtr oldsax = NULL;
				4866
				4867	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				4868	if (ctxt == NULL) return(NULL);
				4869	if (sax != NULL) {
				4870	oldsax = ctxt->sax;
				4871	ctxt->sax = sax;
				4872	ctxt->userData = userData;
				4873	}
				4874
				4875	htmlParseDocument(ctxt);
				4876
				4877	ret = ctxt->myDoc;
				4878	if (sax != NULL) {
				4879	ctxt->sax = oldsax;
				4880	ctxt->userData = NULL;
				4881	}
				4882	htmlFreeParserCtxt(ctxt);
				4883
				4884	return(ret);
				4885	}
				4886
				4887	/**
				4888	* htmlParseFile :
				4889	* @filename: the filename
				4890	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4891	*
				4892	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4893	* compressed document is provided by default if found at compile-time.
				4894	*
				4895	* Returns the resulting document tree
				4896	*/
				4897
				4898	htmlDocPtr
				4899	htmlParseFile(const char filename, const char encoding) {
				4900	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				4901	}
				4902
				4903	/**
				4904	* htmlHandleOmittedElem:
				4905	* @val: int 0 or 1
				4906	*
				4907	* Set and return the previous value for handling HTML omitted tags.
				4908	*
				4909	* Returns the last value for 0 for no handling, 1 for auto insertion.
				4910	*/
				4911
				4912	int
				4913	htmlHandleOmittedElem(int val) {
				4914	int old = htmlOmittedDefaultValue;
				4915
				4916	htmlOmittedDefaultValue = val;
				4917	return(old);
				4918	}
				4919
				4920	#endif /* LIBXML_HTML_ENABLED */