Blame - HTMLparser.c - fp2-dev/platform/external/libxml2

blob: 8812c3d08594efe5840d881e10ddb0f6fadd3bb7 [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
Daniel Veillard	c5d6434	2001-06-24 12:13:24 +0000	[diff] [blame]	6	* daniel@veillard.com
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	7	*/
				8
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	9	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	10	#ifdef LIBXML_HTML_ENABLED
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	11
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	12	#include <string.h>
				13	#ifdef HAVE_CTYPE_H
				14	#include <ctype.h>
				15	#endif
				16	#ifdef HAVE_STDLIB_H
				17	#include <stdlib.h>
				18	#endif
				19	#ifdef HAVE_SYS_STAT_H
				20	#include <sys/stat.h>
				21	#endif
				22	#ifdef HAVE_FCNTL_H
				23	#include <fcntl.h>
				24	#endif
				25	#ifdef HAVE_UNISTD_H
				26	#include <unistd.h>
				27	#endif
				28	#ifdef HAVE_ZLIB_H
				29	#include <zlib.h>
				30	#endif
				31
				32	#include <libxml/xmlmemory.h>
				33	#include <libxml/tree.h>
				34	#include <libxml/parser.h>
				35	#include <libxml/parserInternals.h>
				36	#include <libxml/xmlerror.h>
				37	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	38	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	39	#include <libxml/entities.h>
				40	#include <libxml/encoding.h>
				41	#include <libxml/valid.h>
				42	#include <libxml/xmlIO.h>
				43
				44	#define HTML_MAX_NAMELEN 1000
				45	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				46	#define HTML_PARSER_BUFFER_SIZE 100
				47
				48	/* #define DEBUG */
				49	/* #define DEBUG_PUSH */
				50
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	51	static int htmlOmittedDefaultValue = 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	52
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	53	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				54	xmlChar end, xmlChar end2, xmlChar end3);
				55
				56	/************************************************************************
				57	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	58	* Parser stacks related functions and macros *
				59	* *
				60	************************************************************************/
				61
				62	/*
				63	* Generic function for accessing stacks in the Parser Context
				64	*/
				65
				66	#define PUSH_AND_POP(scope, type, name) \
				67	scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				68	if (ctxt->name##Nr >= ctxt->name##Max) { \
				69	ctxt->name##Max *= 2; \
				70	ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
				71	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				72	if (ctxt->name##Tab == NULL) { \
				73	xmlGenericError(xmlGenericErrorContext, \
				74	"realloc failed !\n"); \
				75	return(0); \
				76	} \
				77	} \
				78	ctxt->name##Tab[ctxt->name##Nr] = value; \
				79	ctxt->name = value; \
				80	return(ctxt->name##Nr++); \
				81	} \
				82	scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				83	type ret; \
				84	if (ctxt->name##Nr < 0) return(0); \
				85	ctxt->name##Nr--; \
				86	if (ctxt->name##Nr < 0) return(0); \
				87	if (ctxt->name##Nr > 0) \
				88	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				89	else \
				90	ctxt->name = NULL; \
				91	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				92	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				93	return(ret); \
				94	} \
				95
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	96	/* PUSH_AND_POP(static, xmlNodePtr, node) */
				97	PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	98
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */
				126
				127	#define UPPER (toupper(*ctxt->input->cur))
				128
				129	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				130
				131	#define NXT(val) ctxt->input->cur[(val)]
				132
				133	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				134
				135	#define CUR_PTR ctxt->input->cur
				136
				137	#define SHRINK xmlParserInputShrink(ctxt->input)
				138
				139	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				140
				141	#define CURRENT ((int) (*ctxt->input->cur))
				142
				143	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				144
				145	/* Inported from XML */
				146
				147	/* #define CUR (ctxt->token ? ctxt->token : (int) (ctxt->input->cur)) /
				148	#define CUR ((int) (*ctxt->input->cur))
				149	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				150
				151	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
				152	#define NXT(val) ctxt->input->cur[(val)]
				153	#define CUR_PTR ctxt->input->cur
				154
				155
				156	#define NEXTL(l) do { \
				157	if (*(ctxt->input->cur) == '\n') { \
				158	ctxt->input->line++; ctxt->input->col = 1; \
				159	} else ctxt->input->col++; \
				160	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				161	} while (0)
				162
				163	/************
				164	\
				165	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				166	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				167	************/
				168
				169	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				170	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				171
				172	#define COPY_BUF(l,b,i,v) \
				173	if (l == 1) b[i++] = (xmlChar) v; \
				174	else i += xmlCopyChar(l,&b[i],v)
				175
				176	/**
				177	* htmlCurrentChar:
				178	* @ctxt: the HTML parser context
				179	* @len: pointer to the length of the char read
				180	*
				181	* The current char value, if using UTF-8 this may actaully span multiple
				182	* bytes in the input buffer. Implement the end of line normalization:
				183	* 2.11 End-of-Line Handling
				184	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				185	* char, then the encoding converter is plugged in automatically.
				186	*
				187	* Returns the current char value and its lenght
				188	*/
				189
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	190	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	191	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				192	if (ctxt->instate == XML_PARSER_EOF)
				193	return(0);
				194
				195	if (ctxt->token != 0) {
				196	*len = 0;
				197	return(ctxt->token);
				198	}
				199	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				200	/*
				201	* We are supposed to handle UTF8, check it's valid
				202	* From rfc2044: encoding of the Unicode values on UTF-8:
				203	*
				204	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				205	* 0000 0000-0000 007F 0xxxxxxx
				206	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				207	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				208	*
				209	* Check for the 0x110000 limit too
				210	*/
				211	const unsigned char *cur = ctxt->input->cur;
				212	unsigned char c;
				213	unsigned int val;
				214
				215	c = *cur;
				216	if (c & 0x80) {
				217	if (cur[1] == 0)
				218	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				219	if ((cur[1] & 0xc0) != 0x80)
				220	goto encoding_error;
				221	if ((c & 0xe0) == 0xe0) {
				222
				223	if (cur[2] == 0)
				224	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				225	if ((cur[2] & 0xc0) != 0x80)
				226	goto encoding_error;
				227	if ((c & 0xf0) == 0xf0) {
				228	if (cur[3] == 0)
				229	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				230	if (((c & 0xf8) != 0xf0) \|\|
				231	((cur[3] & 0xc0) != 0x80))
				232	goto encoding_error;
				233	/* 4-byte code */
				234	*len = 4;
				235	val = (cur[0] & 0x7) << 18;
				236	val \|= (cur[1] & 0x3f) << 12;
				237	val \|= (cur[2] & 0x3f) << 6;
				238	val \|= cur[3] & 0x3f;
				239	} else {
				240	/* 3-byte code */
				241	*len = 3;
				242	val = (cur[0] & 0xf) << 12;
				243	val \|= (cur[1] & 0x3f) << 6;
				244	val \|= cur[2] & 0x3f;
				245	}
				246	} else {
				247	/* 2-byte code */
				248	*len = 2;
				249	val = (cur[0] & 0x1f) << 6;
				250	val \|= cur[1] & 0x3f;
				251	}
				252	if (!IS_CHAR(val)) {
				253	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				254	if ((ctxt->sax != NULL) &&
				255	(ctxt->sax->error != NULL))
				256	ctxt->sax->error(ctxt->userData,
				257	"Char 0x%X out of allowed range\n", val);
				258	ctxt->wellFormed = 0;
				259	ctxt->disableSAX = 1;
				260	}
				261	return(val);
				262	} else {
				263	/* 1-byte code */
				264	*len = 1;
				265	return((int) *ctxt->input->cur);
				266	}
				267	}
				268	/*
				269	* Assume it's a fixed lenght encoding (1) with
				270	* a compatibke encoding for the ASCII set, since
				271	* XML constructs only use < 128 chars
				272	*/
				273	*len = 1;
				274	if ((int) *ctxt->input->cur < 0x80)
				275	return((int) *ctxt->input->cur);
				276
				277	/*
				278	* Humm this is bad, do an automatic flow conversion
				279	*/
				280	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				281	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				282	return(xmlCurrentChar(ctxt, len));
				283
				284	encoding_error:
				285	/*
				286	* If we detect an UTF8 error that probably mean that the
				287	* input encoding didn't get properly advertized in the
				288	* declaration header. Report the error and switch the encoding
				289	* to ISO-Latin-1 (if you don't like this policy, just declare the
				290	* encoding !)
				291	*/
				292	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				293	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				294	ctxt->sax->error(ctxt->userData,
				295	"Input is not proper UTF-8, indicate encoding !\n");
				296	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				297	ctxt->input->cur[0], ctxt->input->cur[1],
				298	ctxt->input->cur[2], ctxt->input->cur[3]);
				299	}
				300
				301	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				302	*len = 1;
				303	return((int) *ctxt->input->cur);
				304	}
				305
				306	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	307	* htmlSkipBlankChars:
				308	* @ctxt: the HTML parser context
				309	*
				310	* skip all blanks character found at that point in the input streams.
				311	*
				312	* Returns the number of space chars skipped
				313	*/
				314
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	315	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	316	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				317	int res = 0;
				318
				319	while (IS_BLANK(*(ctxt->input->cur))) {
				320	if ((*ctxt->input->cur == 0) &&
				321	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				322	xmlPopInput(ctxt);
				323	} else {
				324	if (*(ctxt->input->cur) == '\n') {
				325	ctxt->input->line++; ctxt->input->col = 1;
				326	} else ctxt->input->col++;
				327	ctxt->input->cur++;
				328	ctxt->nbChars++;
				329	if (*ctxt->input->cur == 0)
				330	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				331	}
				332	res++;
				333	}
				334	return(res);
				335	}
				336
				337
				338
				339	/************************************************************************
				340	* *
				341	* The list of HTML elements and their properties *
				342	* *
				343	************************************************************************/
				344
				345	/*
				346	* Start Tag: 1 means the start tag can be ommited
				347	* End Tag: 1 means the end tag can be ommited
				348	* 2 means it's forbidden (empty elements)
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	349	* 3 means the tag is stylistic and should be closed easilly
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	350	* Depr: this element is deprecated
				351	* DTD: 1 means that this element is valid only in the Loose DTD
				352	* 2 means that this element is valid only in the Frameset DTD
				353	*
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	354	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	355	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	356	static const htmlElemDesc
				357	html40ElementTable[] = {
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	358	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
				359	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
				360	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
				361	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
				362	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
				363	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
				364	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
				365	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
				366	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
				367	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
				368	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
				369	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
				370	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
				371	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
				372	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
				373	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
				374	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
				375	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
				376	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
				377	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
				378	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
				379	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
				380	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
				381	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
				382	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
				383	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
				384	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
				385	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
				386	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
				387	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
				388	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
				389	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
				390	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
				391	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
				392	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
				393	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
				394	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
				395	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
				396	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
				397	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
				398	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
				399	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
				400	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
				401	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
				402	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
				403	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
				404	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
				405	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
				406	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
				407	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
				408	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
				409	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
				410	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
				411	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
				412	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
				413	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
				414	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
				415	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
				416	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
				417	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
				418	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
				419	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
				420	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
				421	{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
				422	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
				423	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
				424	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
				425	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
				426	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
				427	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
				428	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
				429	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
				430	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
				431	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
				432	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
				433	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
				434	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
				435	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
				436	{ "table", 0, 0, 0, 0, 0, 0, 0, " " },
				437	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
				438	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
				439	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
				440	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
				441	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
				442	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
				443	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
				444	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
				445	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
				446	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
				447	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
				448	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	449	};
				450
				451	/*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	452	* start tags that imply the end of current element
				453	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	454	static const char *htmlStartClose[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	455	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				456	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				457	"listing", "xmp", "head", NULL,
				458	"head", "p", NULL,
				459	"title", "p", NULL,
				460	"body", "head", "style", "link", "title", "p", NULL,
				461	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				462	"pre", "listing", "xmp", "head", "li", NULL,
				463	"hr", "p", "head", NULL,
				464	"h1", "p", "head", NULL,
				465	"h2", "p", "head", NULL,
				466	"h3", "p", "head", NULL,
				467	"h4", "p", "head", NULL,
				468	"h5", "p", "head", NULL,
				469	"h6", "p", "head", NULL,
				470	"dir", "p", "head", NULL,
				471	"address", "p", "head", "ul", NULL,
				472	"pre", "p", "head", "ul", NULL,
				473	"listing", "p", "head", NULL,
				474	"xmp", "p", "head", NULL,
				475	"blockquote", "p", "head", NULL,
				476	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				477	"xmp", "head", NULL,
				478	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				479	"head", "dd", NULL,
				480	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				481	"head", "dt", NULL,
				482	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				483	"listing", "xmp", NULL,
				484	"ol", "p", "head", "ul", NULL,
				485	"menu", "p", "head", "ul", NULL,
				486	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
				487	"div", "p", "head", NULL,
				488	"noscript", "p", "head", NULL,
				489	"center", "font", "b", "i", "p", "head", NULL,
				490	"a", "a", NULL,
				491	"caption", "p", NULL,
				492	"colgroup", "caption", "colgroup", "col", "p", NULL,
				493	"col", "caption", "col", "p", NULL,
				494	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				495	"listing", "xmp", "a", NULL,
Daniel Veillard	43dadeb	2001-04-24 11:23:35 +0000	[diff] [blame]	496	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				497	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	498	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				499	"thead", "caption", "col", "colgroup", NULL,
				500	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				501	"tbody", "p", NULL,
				502	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				503	"tfoot", "tbody", "p", NULL,
				504	"optgroup", "option", NULL,
				505	"option", "option", NULL,
				506	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				507	"pre", "listing", "xmp", "a", NULL,
				508	NULL
				509	};
				510
				511	/*
				512	* The list of HTML elements which are supposed not to have
				513	* CDATA content and where a p element will be implied
				514	*
				515	* TODO: extend that list by reading the HTML SGML DtD on
				516	* implied paragraph
				517	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	518	static const char *htmlNoContentElements[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	519	"html",
				520	"head",
				521	"body",
				522	NULL
				523	};
				524
				525	/*
				526	* The list of HTML attributes which are of content %Script;
				527	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				528	* it assumes the name starts with 'on'
				529	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	530	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	531	"onclick",
				532	"ondblclick",
				533	"onmousedown",
				534	"onmouseup",
				535	"onmouseover",
				536	"onmousemove",
				537	"onmouseout",
				538	"onkeypress",
				539	"onkeydown",
				540	"onkeyup",
				541	"onload",
				542	"onunload",
				543	"onfocus",
				544	"onblur",
				545	"onsubmit",
				546	"onrest",
				547	"onchange",
				548	"onselect"
				549	};
				550
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end-tag priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The entry whose name is NULL terminates the table; its priority is the
 * one used for every element that is not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* Default priority */
};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	578
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	579	static const char** htmlStartCloseIndex[100];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	580	static int htmlStartCloseIndexinitialized = 0;
				581
				582	/************************************************************************
				583	* *
				584	* functions to handle HTML specific data *
				585	* *
				586	************************************************************************/
				587
				588	/**
				589	* htmlInitAutoClose:
				590	*
				591	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				592	* This is not reentrant. Call xmlInitParser() once before processing in
				593	* case of use in multithreaded programs.
				594	*/
				595	void
				596	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	597	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	598
				599	if (htmlStartCloseIndexinitialized) return;
				600
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	601	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				602	indx = 0;
				603	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				604	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	605	while (htmlStartClose[i] != NULL) i++;
				606	i++;
				607	}
				608	htmlStartCloseIndexinitialized = 1;
				609	}
				610
				611	/**
				612	* htmlTagLookup:
				613	* @tag: The tag name in lowercase
				614	*
				615	* Lookup the HTML tag in the ElementTable
				616	*
				617	* Returns the related htmlElemDescPtr or NULL if not found.
				618	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	619	const htmlElemDescPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	620	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	621	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	622
				623	for (i = 0; i < (sizeof(html40ElementTable) /
				624	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	1ed3f88	2001-04-18 09:45:35 +0000	[diff] [blame]	625	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	626	return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	627	}
				628	return(NULL);
				629	}
				630
				631	/**
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	632	* htmlGetEndPriority:
				633	* @name: The name of the element to look up the priority for.
				634	*
				635	* Return value: The "endtag" priority.
				636	**/
				637	static int
				638	htmlGetEndPriority (const xmlChar *name) {
				639	int i = 0;
				640
				641	while ((htmlEndPriority[i].name != NULL) &&
				642	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				643	i++;
				644
				645	return(htmlEndPriority[i].priority);
				646	}
				647
				648	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	649	* htmlCheckAutoClose:
				650	* @newtag: The new tag name
				651	* @oldtag: The old tag name
				652	*
				653	* Checks wether the new tag is one of the registered valid tags for closing old.
				654	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				655	*
				656	* Returns 0 if no, 1 if yes.
				657	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	658	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	659	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	660	int i, indx;
				661	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	662
				663	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				664
				665	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	666	for (indx = 0; indx < 100;indx++) {
				667	closed = htmlStartCloseIndex[indx];
				668	if (closed == NULL) return(0);
				669	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	670	}
				671
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	672	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	673	i++;
				674	while (htmlStartClose[i] != NULL) {
				675	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				676	return(1);
				677	}
				678	i++;
				679	}
				680	return(0);
				681	}
				682
				683	/**
				684	* htmlAutoCloseOnClose:
				685	* @ctxt: an HTML parser context
				686	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	687	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	688	*
				689	* The HTmL DtD allows an ending tag to implicitely close other tags.
				690	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	691	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	692	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				693	htmlElemDescPtr info;
				694	xmlChar *oldname;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	695	int i, priority;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	696
				697	#ifdef DEBUG
				698	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				699	for (i = 0;i < ctxt->nameNr;i++)
				700	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				701	#endif
				702
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	703	priority = htmlGetEndPriority (newtag);
				704
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	705	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	706
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	707	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	708	/*
				709	* A missplaced endtagad can only close elements with lower
				710	* or equal priority, so if we find an element with higher
				711	* priority before we find an element with
				712	* matching name, we just ignore this endtag
				713	*/
				714	if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	715	}
				716	if (i < 0) return;
				717
				718	while (!xmlStrEqual(newtag, ctxt->name)) {
				719	info = htmlTagLookup(ctxt->name);
				720	if ((info == NULL) \|\| (info->endTag == 1)) {
				721	#ifdef DEBUG
				722	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				723	#endif
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	724	} else if (info->endTag == 3) {
				725	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	726	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	727	#endif
				728	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				729	ctxt->sax->error(ctxt->userData,
				730	"Opening and ending tag mismatch: %s and %s\n",
				731	newtag, ctxt->name);
				732	ctxt->wellFormed = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	733	}
				734	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				735	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				736	oldname = htmlnamePop(ctxt);
				737	if (oldname != NULL) {
				738	#ifdef DEBUG
				739	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				740	#endif
				741	xmlFree(oldname);
				742	}
				743	}
				744	}
				745
				746	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	747	* htmlAutoCloseOnEnd:
				748	* @ctxt: an HTML parser context
				749	*
				750	* Close all remaining tags at the end of the stream
				751	*/
				752	static void
				753	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				754	xmlChar *oldname;
				755	int i;
				756
				757	if (ctxt->nameNr == 0)
				758	return;
				759	#ifdef DEBUG
				760	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				761	#endif
				762
				763	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				764	#ifdef DEBUG
				765	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				766	#endif
				767	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				768	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				769	oldname = htmlnamePop(ctxt);
				770	if (oldname != NULL) {
				771	#ifdef DEBUG
				772	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				773	#endif
				774	xmlFree(oldname);
				775	}
				776	}
				777	}
				778
				779	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	780	* htmlAutoClose:
				781	* @ctxt: an HTML parser context
				782	* @newtag: The new tag name or NULL
				783	*
				784	* The HTmL DtD allows a tag to implicitely close other tags.
				785	* The list is kept in htmlStartClose array. This function is
				786	* called when a new tag has been detected and generates the
				787	* appropriates closes if possible/needed.
				788	* If newtag is NULL this mean we are at the end of the resource
				789	* and we should check
				790	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	791	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	792	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				793	xmlChar *oldname;
				794	while ((newtag != NULL) && (ctxt->name != NULL) &&
				795	(htmlCheckAutoClose(newtag, ctxt->name))) {
				796	#ifdef DEBUG
				797	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				798	#endif
				799	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				800	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				801	oldname = htmlnamePop(ctxt);
				802	if (oldname != NULL) {
				803	#ifdef DEBUG
				804	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				805	#endif
				806	xmlFree(oldname);
				807	}
				808	}
				809	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	810	htmlAutoCloseOnEnd(ctxt);
				811	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	812	}
				813	while ((newtag == NULL) && (ctxt->name != NULL) &&
				814	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				815	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				816	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				817	#ifdef DEBUG
				818	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				819	#endif
				820	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				821	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				822	oldname = htmlnamePop(ctxt);
				823	if (oldname != NULL) {
				824	#ifdef DEBUG
				825	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				826	#endif
				827	xmlFree(oldname);
				828	}
				829	}
				830
				831	}
				832
				833	/**
				834	* htmlAutoCloseTag:
				835	* @doc: the HTML document
				836	* @name: The tag name
				837	* @elem: the HTML element
				838	*
				839	* The HTmL DtD allows a tag to implicitely close other tags.
				840	* The list is kept in htmlStartClose array. This function checks
				841	* if the element or one of it's children would autoclose the
				842	* given tag.
				843	*
				844	* Returns 1 if autoclose, 0 otherwise
				845	*/
				846	int
				847	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				848	htmlNodePtr child;
				849
				850	if (elem == NULL) return(1);
				851	if (xmlStrEqual(name, elem->name)) return(0);
				852	if (htmlCheckAutoClose(elem->name, name)) return(1);
				853	child = elem->children;
				854	while (child != NULL) {
				855	if (htmlAutoCloseTag(doc, name, child)) return(1);
				856	child = child->next;
				857	}
				858	return(0);
				859	}
				860
				861	/**
				862	* htmlIsAutoClosed:
				863	* @doc: the HTML document
				864	* @elem: the HTML element
				865	*
				866	* The HTmL DtD allows a tag to implicitely close other tags.
				867	* The list is kept in htmlStartClose array. This function checks
				868	* if a tag is autoclosed by one of it's child
				869	*
				870	* Returns 1 if autoclosed, 0 otherwise
				871	*/
				872	int
				873	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				874	htmlNodePtr child;
				875
				876	if (elem == NULL) return(1);
				877	child = elem->children;
				878	while (child != NULL) {
				879	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				880	child = child->next;
				881	}
				882	return(0);
				883	}
				884
				885	/**
				886	* htmlCheckImplied:
				887	* @ctxt: an HTML parser context
				888	* @newtag: The new tag name
				889	*
				890	* The HTML DtD allows a tag to exists only implicitely
				891	* called when a new tag has been detected and generates the
				892	* appropriates implicit tags if missing
				893	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	894	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	895	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				896	if (!htmlOmittedDefaultValue)
				897	return;
				898	if (xmlStrEqual(newtag, BAD_CAST"html"))
				899	return;
				900	if (ctxt->nameNr <= 0) {
				901	#ifdef DEBUG
				902	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				903	#endif
				904	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				905	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				906	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				907	}
				908	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				909	return;
				910	if ((ctxt->nameNr <= 1) &&
				911	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				912	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				913	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				914	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				915	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				916	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				917	/*
				918	* dropped OBJECT ... i you put it first BODY will be
				919	* assumed !
				920	*/
				921	#ifdef DEBUG
				922	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				923	#endif
				924	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				925	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				926	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				927	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				928	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				929	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				930	int i;
				931	for (i = 0;i < ctxt->nameNr;i++) {
				932	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				933	return;
				934	}
				935	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				936	return;
				937	}
				938	}
				939
				940	#ifdef DEBUG
				941	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				942	#endif
				943	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				944	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				945	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				946	}
				947	}
				948
				949	/**
				950	* htmlCheckParagraph
				951	* @ctxt: an HTML parser context
				952	*
				953	* Check whether a p element need to be implied before inserting
				954	* characters in the current element.
				955	*
				956	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				957	* in case of error.
				958	*/
				959
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	960	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	961	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				962	const xmlChar *tag;
				963	int i;
				964
				965	if (ctxt == NULL)
				966	return(-1);
				967	tag = ctxt->name;
				968	if (tag == NULL) {
				969	htmlAutoClose(ctxt, BAD_CAST"p");
				970	htmlCheckImplied(ctxt, BAD_CAST"p");
				971	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				972	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				973	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				974	return(1);
				975	}
				976	if (!htmlOmittedDefaultValue)
				977	return(0);
				978	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				979	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				980	#ifdef DEBUG
				981	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				982	#endif
				983	htmlAutoClose(ctxt, BAD_CAST"p");
				984	htmlCheckImplied(ctxt, BAD_CAST"p");
				985	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				986	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				987	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				988	return(1);
				989	}
				990	}
				991	return(0);
				992	}
				993
				994	/**
				995	* htmlIsScriptAttribute:
				996	* @name: an attribute name
				997	*
				998	* Check if an attribute is of content type Script
				999	*
				1000	* Returns 1 is the attribute is a script 0 otherwise
				1001	*/
				1002	int
				1003	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1004	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1005
				1006	if (name == NULL)
				1007	return(0);
				1008	/*
				1009	* all script attributes start with 'on'
				1010	*/
				1011	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1012	return(0);
				1013	for (i = 0;
				1014	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1015	i++) {
				1016	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1017	return(1);
				1018	}
				1019	return(0);
				1020	}
				1021
				1022	/************************************************************************
				1023	* *
				1024	* The list of HTML predefined entities *
				1025	* *
				1026	************************************************************************/
				1027
				1028
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1029	static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1030	/*
				1031	* the 4 absolute ones, plus apostrophe.
				1032	*/
				1033	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1034	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1035	{ 39, "apos", "single quote" },
				1036	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1037	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1038
				1039	/*
				1040	* A bunch still in the 128-255 range
				1041	* Replacing them depend really on the charset used.
				1042	*/
				1043	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1044	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1045	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1046	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1047	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1048	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1049	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1050	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1051	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1052	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1053	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1054	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1055	{ 172, "not", "not sign, U+00AC ISOnum" },
				1056	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1057	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1058	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1059	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1060	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1061	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1062	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1063	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1064	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1065	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1066	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1067	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1068	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1069	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1070	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1071	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1072	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1073	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1074	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1075	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1076	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1077	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1078	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1079	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1080	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1081	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1082	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1083	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1084	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1085	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1086	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1087	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1088	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1089	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1090	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1091	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1092	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1093	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1094	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1095	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1096	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1097	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1098	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1099	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1100	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1101	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1102	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1103	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1104	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1105	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1106	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1107	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1108	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1109	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1110	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1111	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1112	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1113	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1114	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1115	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1116	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1117	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1118	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1119	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1120	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1121	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1122	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1123	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1124	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1125	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1126	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1127	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1128	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1129	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1130	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1131	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1132	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1133	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1134	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1135	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1136	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1137	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1138	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1139
				1140	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1141	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1142	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1143	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1144	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1145
				1146	/*
				1147	* Anything below should really be kept as entities references
				1148	*/
				1149	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1150
				1151	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1152	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1153
				1154	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1155	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1156	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1157	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1158	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1159	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1160	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1161	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1162	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1163	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1164	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1165	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1166	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1167	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1168	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1169	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1170	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1171	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1172	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1173	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1174	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1175	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1176	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1177	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1178
				1179	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1180	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1181	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1182	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1183	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1184	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1185	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1186	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1187	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1188	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1189	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1190	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1191	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1192	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1193	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1194	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1195	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1196	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1197	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1198	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1199	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1200	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1201	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1202	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1203	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1204	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1205	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1206	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1207
				1208	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1209	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1210	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1211	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1212	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1213	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1214	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1215	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1216	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1217	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1218	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1219	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1220	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1221	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1222	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1223	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1224	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1225
				1226	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1227	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1228
				1229	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1230
				1231	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1232	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1233
				1234	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1235	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1236
				1237	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1238	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1239
				1240	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1241
				1242	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1243	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1244	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1245	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1246	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1247	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1248	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1249	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1250	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1251	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1252	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1253	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1254	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1255	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1256	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1257	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1258
				1259	{ 8704, "forall","for all, U+2200 ISOtech" },
				1260	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1261	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1262	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1263	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1264	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1265	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1266	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1267	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1268	{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
				1269	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1270	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1271	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1272	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1273	{ 8734, "infin","infinity, U+221E ISOtech" },
				1274	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1275	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1276	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1277	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1278	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1279	{ 8747, "int", "integral, U+222B ISOtech" },
				1280	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1281	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1282	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1283	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1284	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1285	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1286	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1287	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1288	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1289	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1290	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1291	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1292	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1293	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1294	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1295	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1296	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1297	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1298	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1299	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1300	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1301	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1302	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1303	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1304
				1305	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1306	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1307	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1308	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1309
				1310	};
				1311
				1312	/************************************************************************
				1313	* *
				1314	* Commodity functions to handle entities *
				1315	* *
				1316	************************************************************************/
				1317
				1318	/*
				1319	* Macro used to grow the current buffer.
				1320	*/
				1321	#define growBuffer(buffer) { \
				1322	buffer##_size *= 2; \
				1323	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1324	if (buffer == NULL) { \
				1325	perror("realloc failed"); \
				1326	return(NULL); \
				1327	} \
				1328	}
				1329
				1330	/**
				1331	* htmlEntityLookup:
				1332	* @name: the entity name
				1333	*
				1334	* Lookup the given entity in EntitiesTable
				1335	*
				1336	* TODO: the linear scan is really ugly, an hash table is really needed.
				1337	*
				1338	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1339	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1340	const htmlEntityDescPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1341	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1342	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1343
				1344	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1345	sizeof(html40EntitiesTable[0]));i++) {
				1346	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1347	#ifdef DEBUG
				1348	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1349	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1350	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1351	}
				1352	}
				1353	return(NULL);
				1354	}
				1355
				1356	/**
				1357	* htmlEntityValueLookup:
				1358	* @value: the entity's unicode value
				1359	*
				1360	* Lookup the given entity in EntitiesTable
				1361	*
				1362	* TODO: the linear scan is really ugly, an hash table is really needed.
				1363	*
				1364	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1365	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1366	const htmlEntityDescPtr
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1367	htmlEntityValueLookup(unsigned int value) {
				1368	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1369	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	1370	unsigned int lv = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1371	#endif
				1372
				1373	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1374	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1375	if (html40EntitiesTable[i].value >= value) {
				1376	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1377	break;
				1378	#ifdef DEBUG
				1379	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1380	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1381	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1382	}
				1383	#ifdef DEBUG
				1384	if (lv > html40EntitiesTable[i].value) {
				1385	xmlGenericError(xmlGenericErrorContext,
				1386	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1387	lv, html40EntitiesTable[i].value);
				1388	}
				1389	lv = html40EntitiesTable[i].value;
				1390	#endif
				1391	}
				1392	return(NULL);
				1393	}
				1394
				1395	/**
				1396	* UTF8ToHtml:
				1397	* @out: a pointer to an array of bytes to store the result
				1398	* @outlen: the length of @out
				1399	* @in: a pointer to an array of UTF-8 chars
				1400	* @inlen: the length of @in
				1401	*
				1402	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1403	* plus HTML entities block of chars out.
				1404	*
				1405	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1406	* The value of @inlen after return is the number of octets consumed
				1407	* as the return value is positive, else unpredictiable.
				1408	* The value of @outlen after return is the number of octets consumed.
				1409	*/
				1410	int
				1411	UTF8ToHtml(unsigned char* out, int *outlen,
				1412	const unsigned char* in, int *inlen) {
				1413	const unsigned char* processed = in;
				1414	const unsigned char* outend;
				1415	const unsigned char* outstart = out;
				1416	const unsigned char* instart = in;
				1417	const unsigned char* inend;
				1418	unsigned int c, d;
				1419	int trailing;
				1420
				1421	if (in == NULL) {
				1422	/*
				1423	* initialization nothing to do
				1424	*/
				1425	*outlen = 0;
				1426	*inlen = 0;
				1427	return(0);
				1428	}
				1429	inend = in + (*inlen);
				1430	outend = out + (*outlen);
				1431	while (in < inend) {
				1432	d = *in++;
				1433	if (d < 0x80) { c= d; trailing= 0; }
				1434	else if (d < 0xC0) {
				1435	/* trailing byte in leading position */
				1436	*outlen = out - outstart;
				1437	*inlen = processed - instart;
				1438	return(-2);
				1439	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1440	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1441	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1442	else {
				1443	/* no chance for this in Ascii */
				1444	*outlen = out - outstart;
				1445	*inlen = processed - instart;
				1446	return(-2);
				1447	}
				1448
				1449	if (inend - in < trailing) {
				1450	break;
				1451	}
				1452
				1453	for ( ; trailing; trailing--) {
				1454	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1455	break;
				1456	c <<= 6;
				1457	c \|= d & 0x3F;
				1458	}
				1459
				1460	/* assertion: c is a single UTF-4 value */
				1461	if (c < 0x80) {
				1462	if (out + 1 >= outend)
				1463	break;
				1464	*out++ = c;
				1465	} else {
				1466	int len;
				1467	htmlEntityDescPtr ent;
				1468
				1469	/*
				1470	* Try to lookup a predefined HTML entity for it
				1471	*/
				1472
				1473	ent = htmlEntityValueLookup(c);
				1474	if (ent == NULL) {
				1475	/* no chance for this in Ascii */
				1476	*outlen = out - outstart;
				1477	*inlen = processed - instart;
				1478	return(-2);
				1479	}
				1480	len = strlen(ent->name);
				1481	if (out + 2 + len >= outend)
				1482	break;
				1483	*out++ = '&';
				1484	memcpy(out, ent->name, len);
				1485	out += len;
				1486	*out++ = ';';
				1487	}
				1488	processed = in;
				1489	}
				1490	*outlen = out - outstart;
				1491	*inlen = processed - instart;
				1492	return(0);
				1493	}
				1494
				1495	/**
				1496	* htmlEncodeEntities:
				1497	* @out: a pointer to an array of bytes to store the result
				1498	* @outlen: the length of @out
				1499	* @in: a pointer to an array of UTF-8 chars
				1500	* @inlen: the length of @in
				1501	* @quoteChar: the quote character to escape (' or ") or zero.
				1502	*
				1503	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1504	* plus HTML entities block of chars out.
				1505	*
				1506	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1507	* The value of @inlen after return is the number of octets consumed
				1508	* as the return value is positive, else unpredictiable.
				1509	* The value of @outlen after return is the number of octets consumed.
				1510	*/
				1511	int
				1512	htmlEncodeEntities(unsigned char* out, int *outlen,
				1513	const unsigned char* in, int *inlen, int quoteChar) {
				1514	const unsigned char* processed = in;
				1515	const unsigned char* outend = out + (*outlen);
				1516	const unsigned char* outstart = out;
				1517	const unsigned char* instart = in;
				1518	const unsigned char* inend = in + (*inlen);
				1519	unsigned int c, d;
				1520	int trailing;
				1521
				1522	while (in < inend) {
				1523	d = *in++;
				1524	if (d < 0x80) { c= d; trailing= 0; }
				1525	else if (d < 0xC0) {
				1526	/* trailing byte in leading position */
				1527	*outlen = out - outstart;
				1528	*inlen = processed - instart;
				1529	return(-2);
				1530	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1531	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1532	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1533	else {
				1534	/* no chance for this in Ascii */
				1535	*outlen = out - outstart;
				1536	*inlen = processed - instart;
				1537	return(-2);
				1538	}
				1539
				1540	if (inend - in < trailing)
				1541	break;
				1542
				1543	while (trailing--) {
				1544	if (((d= *in++) & 0xC0) != 0x80) {
				1545	*outlen = out - outstart;
				1546	*inlen = processed - instart;
				1547	return(-2);
				1548	}
				1549	c <<= 6;
				1550	c \|= d & 0x3F;
				1551	}
				1552
				1553	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1554	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1555	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1556	if (out >= outend)
				1557	break;
				1558	*out++ = c;
				1559	} else {
				1560	htmlEntityDescPtr ent;
				1561	const char *cp;
				1562	char nbuf[16];
				1563	int len;
				1564
				1565	/*
				1566	* Try to lookup a predefined HTML entity for it
				1567	*/
				1568	ent = htmlEntityValueLookup(c);
				1569	if (ent == NULL) {
				1570	sprintf(nbuf, "#%u", c);
				1571	cp = nbuf;
				1572	}
				1573	else
				1574	cp = ent->name;
				1575	len = strlen(cp);
				1576	if (out + 2 + len > outend)
				1577	break;
				1578	*out++ = '&';
				1579	memcpy(out, cp, len);
				1580	out += len;
				1581	*out++ = ';';
				1582	}
				1583	processed = in;
				1584	}
				1585	*outlen = out - outstart;
				1586	*inlen = processed - instart;
				1587	return(0);
				1588	}
				1589
				1590	/**
				1591	* htmlDecodeEntities:
				1592	* @ctxt: the parser context
				1593	* @len: the len to decode (in bytes !), -1 for no size limit
				1594	* @end: an end marker xmlChar, 0 if none
				1595	* @end2: an end marker xmlChar, 0 if none
				1596	* @end3: an end marker xmlChar, 0 if none
				1597	*
				1598	* Subtitute the HTML entities by their value
				1599	*
				1600	* DEPRECATED !!!!
				1601	*
				1602	* Returns A newly allocated string with the substitution done. The caller
				1603	* must deallocate it !
				1604	*/
				1605	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1606	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1607	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1608	static int deprecated = 0;
				1609	if (!deprecated) {
				1610	xmlGenericError(xmlGenericErrorContext,
				1611	"htmlDecodeEntities() deprecated function reached\n");
				1612	deprecated = 1;
				1613	}
				1614	return(NULL);
				1615	#if 0
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1616	xmlChar *name = NULL;
				1617	xmlChar *buffer = NULL;
				1618	unsigned int buffer_size = 0;
				1619	unsigned int nbchars = 0;
				1620	htmlEntityDescPtr ent;
				1621	unsigned int max = (unsigned int) len;
				1622	int c,l;
				1623
				1624	if (ctxt->depth > 40) {
				1625	ctxt->errNo = XML_ERR_ENTITY_LOOP;
				1626	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1627	ctxt->sax->error(ctxt->userData,
				1628	"Detected entity reference loop\n");
				1629	ctxt->wellFormed = 0;
				1630	ctxt->disableSAX = 1;
				1631	return(NULL);
				1632	}
				1633
				1634	/*
				1635	* allocate a translation buffer.
				1636	*/
				1637	buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
				1638	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1639	if (buffer == NULL) {
				1640	perror("xmlDecodeEntities: malloc failed");
				1641	return(NULL);
				1642	}
				1643
				1644	/*
				1645	* Ok loop until we reach one of the ending char or a size limit.
				1646	*/
				1647	c = CUR_CHAR(l);
				1648	while ((nbchars < max) && (c != end) &&
				1649	(c != end2) && (c != end3)) {
				1650
				1651	if (c == 0) break;
				1652	if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
				1653	int val = htmlParseCharRef(ctxt);
				1654	COPY_BUF(0,buffer,nbchars,val);
				1655	NEXTL(l);
				1656	} else if ((c == '&') && (ctxt->token != '&')) {
				1657	ent = htmlParseEntityRef(ctxt, &name);
				1658	if (name != NULL) {
				1659	if (ent != NULL) {
				1660	int val = ent->value;
				1661	COPY_BUF(0,buffer,nbchars,val);
				1662	NEXTL(l);
				1663	} else {
				1664	const xmlChar *cur = name;
				1665
				1666	buffer[nbchars++] = '&';
				1667	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1668	growBuffer(buffer);
				1669	}
				1670	while (*cur != 0) {
				1671	buffer[nbchars++] = *cur++;
				1672	}
				1673	buffer[nbchars++] = ';';
				1674	}
				1675	}
				1676	} else {
				1677	COPY_BUF(l,buffer,nbchars,c);
				1678	NEXTL(l);
				1679	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1680	growBuffer(buffer);
				1681	}
				1682	}
				1683	c = CUR_CHAR(l);
				1684	}
				1685	buffer[nbchars++] = 0;
				1686	return(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1687	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1688	}
				1689
				1690	/************************************************************************
				1691	* *
				1692	* Commodity functions to handle streams *
				1693	* *
				1694	************************************************************************/
				1695
				1696	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1697	* htmlNewInputStream:
				1698	* @ctxt: an HTML parser context
				1699	*
				1700	* Create a new input stream structure
				1701	* Returns the new input stream or NULL
				1702	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1703	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1704	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1705	htmlParserInputPtr input;
				1706
				1707	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1708	if (input == NULL) {
				1709	ctxt->errNo = XML_ERR_NO_MEMORY;
				1710	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1711	ctxt->sax->error(ctxt->userData,
				1712	"malloc: couldn't allocate a new input stream\n");
				1713	return(NULL);
				1714	}
				1715	memset(input, 0, sizeof(htmlParserInput));
				1716	input->filename = NULL;
				1717	input->directory = NULL;
				1718	input->base = NULL;
				1719	input->cur = NULL;
				1720	input->buf = NULL;
				1721	input->line = 1;
				1722	input->col = 1;
				1723	input->buf = NULL;
				1724	input->free = NULL;
				1725	input->version = NULL;
				1726	input->consumed = 0;
				1727	input->length = 0;
				1728	return(input);
				1729	}
				1730
				1731
				1732	/************************************************************************
				1733	* *
				1734	* Commodity functions, cleanup needed ? *
				1735	* *
				1736	************************************************************************/
				1737
				1738	/**
				1739	* areBlanks:
				1740	* @ctxt: an HTML parser context
				1741	* @str: a xmlChar *
				1742	* @len: the size of @str
				1743	*
				1744	* Is this a sequence of blank chars that one can ignore ?
				1745	*
				1746	* Returns 1 if ignorable 0 otherwise.
				1747	*/
				1748
				1749	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				1750	int i;
				1751	xmlNodePtr lastChild;
				1752
				1753	for (i = 0;i < len;i++)
				1754	if (!(IS_BLANK(str[i]))) return(0);
				1755
				1756	if (CUR == 0) return(1);
				1757	if (CUR != '<') return(0);
				1758	if (ctxt->name == NULL)
				1759	return(1);
				1760	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1761	return(1);
				1762	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1763	return(1);
				1764	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1765	return(1);
				1766	if (ctxt->node == NULL) return(0);
				1767	lastChild = xmlGetLastChild(ctxt->node);
				1768	if (lastChild == NULL) {
Daniel Veillard	7db3773	2001-07-12 01:20:08 +0000	[diff] [blame]	1769	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
				1770	(ctxt->node->content != NULL)) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1771	} else if (xmlNodeIsText(lastChild)) {
				1772	return(0);
				1773	} else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
				1774	return(0);
				1775	} else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
				1776	return(0);
				1777	} else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
				1778	return(0);
				1779	}
				1780	return(1);
				1781	}
				1782
				1783	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1784	* htmlNewDocNoDtD:
				1785	* @URI: URI for the dtd, or NULL
				1786	* @ExternalID: the external ID of the DTD, or NULL
				1787	*
				1788	* Returns a new document, do not intialize the DTD if not provided
				1789	*/
				1790	htmlDocPtr
				1791	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1792	xmlDocPtr cur;
				1793
				1794	/*
				1795	* Allocate a new document and fill the fields.
				1796	*/
				1797	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1798	if (cur == NULL) {
				1799	xmlGenericError(xmlGenericErrorContext,
				1800	"xmlNewDoc : malloc failed\n");
				1801	return(NULL);
				1802	}
				1803	memset(cur, 0, sizeof(xmlDoc));
				1804
				1805	cur->type = XML_HTML_DOCUMENT_NODE;
				1806	cur->version = NULL;
				1807	cur->intSubset = NULL;
				1808	if ((ExternalID != NULL) \|\|
				1809	(URI != NULL))
				1810	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
				1811	cur->doc = cur;
				1812	cur->name = NULL;
				1813	cur->children = NULL;
				1814	cur->extSubset = NULL;
				1815	cur->oldNs = NULL;
				1816	cur->encoding = NULL;
				1817	cur->standalone = 1;
				1818	cur->compression = 0;
				1819	cur->ids = NULL;
				1820	cur->refs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1821	cur->_private = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1822	return(cur);
				1823	}
				1824
				1825	/**
				1826	* htmlNewDoc:
				1827	* @URI: URI for the dtd, or NULL
				1828	* @ExternalID: the external ID of the DTD, or NULL
				1829	*
				1830	* Returns a new document
				1831	*/
				1832	htmlDocPtr
				1833	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1834	if ((URI == NULL) && (ExternalID == NULL))
				1835	return(htmlNewDocNoDtD(
Daniel Veillard	6426935	2001-05-04 17:52:34 +0000	[diff] [blame]	1836	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				1837	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1838
				1839	return(htmlNewDocNoDtD(URI, ExternalID));
				1840	}
				1841
				1842
				1843	/************************************************************************
				1844	* *
				1845	* The parser itself *
				1846	* Relates to http://www.w3.org/TR/html40 *
				1847	* *
				1848	************************************************************************/
				1849
				1850	/************************************************************************
				1851	* *
				1852	* The parser itself *
				1853	* *
				1854	************************************************************************/
				1855
				1856	/**
				1857	* htmlParseHTMLName:
				1858	* @ctxt: an HTML parser context
				1859	*
				1860	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1861	* since HTML names are not case-sensitive.
				1862	*
				1863	* Returns the Tag Name parsed or NULL
				1864	*/
				1865
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1866	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1867	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1868	xmlChar *ret = NULL;
				1869	int i = 0;
				1870	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1871
				1872	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1873	(CUR != ':')) return(NULL);
				1874
				1875	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1876	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1877	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1878	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1879	else loc[i] = CUR;
				1880	i++;
				1881
				1882	NEXT;
				1883	}
				1884
				1885	ret = xmlStrndup(loc, i);
				1886
				1887	return(ret);
				1888	}
				1889
				1890	/**
				1891	* htmlParseName:
				1892	* @ctxt: an HTML parser context
				1893	*
				1894	* parse an HTML name, this routine is case sensistive.
				1895	*
				1896	* Returns the Name parsed or NULL
				1897	*/
				1898
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1899	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1900	htmlParseName(htmlParserCtxtPtr ctxt) {
				1901	xmlChar buf[HTML_MAX_NAMELEN];
				1902	int len = 0;
				1903
				1904	GROW;
				1905	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1906	return(NULL);
				1907	}
				1908
				1909	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1910	(CUR == '.') \|\| (CUR == '-') \|\|
				1911	(CUR == '_') \|\| (CUR == ':') \|\|
				1912	(IS_COMBINING(CUR)) \|\|
				1913	(IS_EXTENDER(CUR))) {
				1914	buf[len++] = CUR;
				1915	NEXT;
				1916	if (len >= HTML_MAX_NAMELEN) {
				1917	xmlGenericError(xmlGenericErrorContext,
				1918	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1919	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1920	(CUR == '.') \|\| (CUR == '-') \|\|
				1921	(CUR == '_') \|\| (CUR == ':') \|\|
				1922	(IS_COMBINING(CUR)) \|\|
				1923	(IS_EXTENDER(CUR)))
				1924	NEXT;
				1925	break;
				1926	}
				1927	}
				1928	return(xmlStrndup(buf, len));
				1929	}
				1930
				1931	/**
				1932	* htmlParseHTMLAttribute:
				1933	* @ctxt: an HTML parser context
				1934	* @stop: a char stop value
				1935	*
				1936	* parse an HTML attribute value till the stop (quote), if
				1937	* stop is 0 then it stops at the first space
				1938	*
				1939	* Returns the attribute parsed or NULL
				1940	*/
				1941
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1942	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1943	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1944	xmlChar *buffer = NULL;
				1945	int buffer_size = 0;
				1946	xmlChar *out = NULL;
				1947	xmlChar *name = NULL;
				1948
				1949	xmlChar *cur = NULL;
				1950	htmlEntityDescPtr ent;
				1951
				1952	/*
				1953	* allocate a translation buffer.
				1954	*/
				1955	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1956	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1957	if (buffer == NULL) {
				1958	perror("htmlParseHTMLAttribute: malloc failed");
				1959	return(NULL);
				1960	}
				1961	out = buffer;
				1962
				1963	/*
				1964	* Ok loop until we reach one of the ending chars
				1965	*/
				1966	while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
				1967	if ((stop == 0) && (IS_BLANK(CUR))) break;
				1968	if (CUR == '&') {
				1969	if (NXT(1) == '#') {
				1970	unsigned int c;
				1971	int bits;
				1972
				1973	c = htmlParseCharRef(ctxt);
				1974	if (c < 0x80)
				1975	{ *out++ = c; bits= -6; }
				1976	else if (c < 0x800)
				1977	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1978	else if (c < 0x10000)
				1979	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1980	else
				1981	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1982
				1983	for ( ; bits >= 0; bits-= 6) {
				1984	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				1985	}
				1986	} else {
				1987	ent = htmlParseEntityRef(ctxt, &name);
				1988	if (name == NULL) {
				1989	*out++ = '&';
				1990	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1991	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1992
				1993	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1994	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1995	}
				1996	} else if (ent == NULL) {
				1997	*out++ = '&';
				1998	cur = name;
				1999	while (*cur != 0) {
				2000	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2001	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2002
				2003	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2004	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2005	}
				2006	out++ = cur++;
				2007	}
				2008	xmlFree(name);
				2009	} else {
				2010	unsigned int c;
				2011	int bits;
				2012
				2013	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2014	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2015
				2016	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2017	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2018	}
				2019	c = (xmlChar)ent->value;
				2020	if (c < 0x80)
				2021	{ *out++ = c; bits= -6; }
				2022	else if (c < 0x800)
				2023	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2024	else if (c < 0x10000)
				2025	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2026	else
				2027	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2028
				2029	for ( ; bits >= 0; bits-= 6) {
				2030	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2031	}
				2032	xmlFree(name);
				2033	}
				2034	}
				2035	} else {
				2036	unsigned int c;
				2037	int bits, l;
				2038
				2039	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2040	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2041
				2042	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2043	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2044	}
				2045	c = CUR_CHAR(l);
				2046	if (c < 0x80)
				2047	{ *out++ = c; bits= -6; }
				2048	else if (c < 0x800)
				2049	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2050	else if (c < 0x10000)
				2051	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2052	else
				2053	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2054
				2055	for ( ; bits >= 0; bits-= 6) {
				2056	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2057	}
				2058	NEXT;
				2059	}
				2060	}
				2061	*out++ = 0;
				2062	return(buffer);
				2063	}
				2064
				2065	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2066	* htmlParseEntityRef:
				2067	* @ctxt: an HTML parser context
				2068	* @str: location to store the entity name
				2069	*
				2070	* parse an HTML ENTITY references
				2071	*
				2072	* [68] EntityRef ::= '&' Name ';'
				2073	*
				2074	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2075	* if non-NULL *str will have to be freed by the caller.
				2076	*/
				2077	htmlEntityDescPtr
				2078	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2079	xmlChar *name;
				2080	htmlEntityDescPtr ent = NULL;
				2081	*str = NULL;
				2082
				2083	if (CUR == '&') {
				2084	NEXT;
				2085	name = htmlParseName(ctxt);
				2086	if (name == NULL) {
				2087	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2088	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2089	ctxt->wellFormed = 0;
				2090	} else {
				2091	GROW;
				2092	if (CUR == ';') {
				2093	*str = name;
				2094
				2095	/*
				2096	* Lookup the entity in the table.
				2097	*/
				2098	ent = htmlEntityLookup(name);
				2099	if (ent != NULL) /* OK that's ugly !!! */
				2100	NEXT;
				2101	} else {
				2102	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2103	ctxt->sax->error(ctxt->userData,
				2104	"htmlParseEntityRef: expecting ';'\n");
				2105	*str = name;
				2106	}
				2107	}
				2108	}
				2109	return(ent);
				2110	}
				2111
				2112	/**
				2113	* htmlParseAttValue:
				2114	* @ctxt: an HTML parser context
				2115	*
				2116	* parse a value for an attribute
				2117	* Note: the parser won't do substitution of entities here, this
				2118	* will be handled later in xmlStringGetNodeList, unless it was
				2119	* asked for ctxt->replaceEntities != 0
				2120	*
				2121	* Returns the AttValue parsed or NULL.
				2122	*/
				2123
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2124	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2125	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2126	xmlChar *ret = NULL;
				2127
				2128	if (CUR == '"') {
				2129	NEXT;
				2130	ret = htmlParseHTMLAttribute(ctxt, '"');
				2131	if (CUR != '"') {
				2132	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2133	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2134	ctxt->wellFormed = 0;
				2135	} else
				2136	NEXT;
				2137	} else if (CUR == '\'') {
				2138	NEXT;
				2139	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2140	if (CUR != '\'') {
				2141	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2142	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2143	ctxt->wellFormed = 0;
				2144	} else
				2145	NEXT;
				2146	} else {
				2147	/*
				2148	* That's an HTMLism, the attribute value may not be quoted
				2149	*/
				2150	ret = htmlParseHTMLAttribute(ctxt, 0);
				2151	if (ret == NULL) {
				2152	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2153	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2154	ctxt->wellFormed = 0;
				2155	}
				2156	}
				2157	return(ret);
				2158	}
				2159
				2160	/**
				2161	* htmlParseSystemLiteral:
				2162	* @ctxt: an HTML parser context
				2163	*
				2164	* parse an HTML Literal
				2165	*
				2166	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2167	*
				2168	* Returns the SystemLiteral parsed or NULL
				2169	*/
				2170
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2171	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2172	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2173	const xmlChar *q;
				2174	xmlChar *ret = NULL;
				2175
				2176	if (CUR == '"') {
				2177	NEXT;
				2178	q = CUR_PTR;
				2179	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2180	NEXT;
				2181	if (!IS_CHAR(CUR)) {
				2182	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2183	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2184	ctxt->wellFormed = 0;
				2185	} else {
				2186	ret = xmlStrndup(q, CUR_PTR - q);
				2187	NEXT;
				2188	}
				2189	} else if (CUR == '\'') {
				2190	NEXT;
				2191	q = CUR_PTR;
				2192	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2193	NEXT;
				2194	if (!IS_CHAR(CUR)) {
				2195	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2196	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2197	ctxt->wellFormed = 0;
				2198	} else {
				2199	ret = xmlStrndup(q, CUR_PTR - q);
				2200	NEXT;
				2201	}
				2202	} else {
				2203	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2204	ctxt->sax->error(ctxt->userData,
				2205	"SystemLiteral \" or ' expected\n");
				2206	ctxt->wellFormed = 0;
				2207	}
				2208
				2209	return(ret);
				2210	}
				2211
				2212	/**
				2213	* htmlParsePubidLiteral:
				2214	* @ctxt: an HTML parser context
				2215	*
				2216	* parse an HTML public literal
				2217	*
				2218	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2219	*
				2220	* Returns the PubidLiteral parsed or NULL.
				2221	*/
				2222
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2223	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2224	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2225	const xmlChar *q;
				2226	xmlChar *ret = NULL;
				2227	/*
				2228	* Name ::= (Letter \| '_') (NameChar)*
				2229	*/
				2230	if (CUR == '"') {
				2231	NEXT;
				2232	q = CUR_PTR;
				2233	while (IS_PUBIDCHAR(CUR)) NEXT;
				2234	if (CUR != '"') {
				2235	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2236	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2237	ctxt->wellFormed = 0;
				2238	} else {
				2239	ret = xmlStrndup(q, CUR_PTR - q);
				2240	NEXT;
				2241	}
				2242	} else if (CUR == '\'') {
				2243	NEXT;
				2244	q = CUR_PTR;
				2245	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2246	NEXT;
				2247	if (!IS_LETTER(CUR)) {
				2248	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2249	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2250	ctxt->wellFormed = 0;
				2251	} else {
				2252	ret = xmlStrndup(q, CUR_PTR - q);
				2253	NEXT;
				2254	}
				2255	} else {
				2256	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2257	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2258	ctxt->wellFormed = 0;
				2259	}
				2260
				2261	return(ret);
				2262	}
				2263
				2264	/**
				2265	* htmlParseScript:
				2266	* @ctxt: an HTML parser context
				2267	*
				2268	* parse the content of an HTML SCRIPT or STYLE element
				2269	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2270	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2271	* http://www.w3.org/TR/html4/types.html#type-script
				2272	* http://www.w3.org/TR/html4/types.html#h-6.15
				2273	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2274	*
				2275	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2276	* element and the value of intrinsic event attributes. User agents must
				2277	* not evaluate script data as HTML markup but instead must pass it on as
				2278	* data to a script engine.
				2279	* NOTES:
				2280	* - The content is passed like CDATA
				2281	* - the attributes for style and scripting "onXXX" are also described
				2282	* as CDATA but SGML allows entities references in attributes so their
				2283	* processing is identical as other attributes
				2284	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2285	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2286	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2287	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2288	int nbchar = 0;
				2289	xmlChar cur;
				2290
				2291	SHRINK;
				2292	cur = CUR;
				2293	while (IS_CHAR(cur)) {
				2294	if ((cur == '<') && (NXT(1) == '/')) {
				2295	/*
				2296	* One should break here, the specification is clear:
				2297	* Authors should therefore escape "</" within the content.
				2298	* Escape mechanisms are specific to each scripting or
				2299	* style sheet language.
				2300	*/
				2301	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2302	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2303	break; /* while */
				2304	}
				2305	buf[nbchar++] = cur;
				2306	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2307	if (ctxt->sax->cdataBlock!= NULL) {
				2308	/*
				2309	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2310	*/
				2311	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2312	}
				2313	nbchar = 0;
				2314	}
				2315	NEXT;
				2316	cur = CUR;
				2317	}
				2318	if (!(IS_CHAR(cur))) {
				2319	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2320	ctxt->sax->error(ctxt->userData,
				2321	"Invalid char in CDATA 0x%X\n", cur);
				2322	ctxt->wellFormed = 0;
				2323	NEXT;
				2324	}
				2325
				2326	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2327	if (ctxt->sax->cdataBlock!= NULL) {
				2328	/*
				2329	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2330	*/
				2331	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2332	}
				2333	}
				2334	}
				2335
				2336
				2337	/**
				2338	* htmlParseCharData:
				2339	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2340	*
				2341	* parse a CharData section.
				2342	* if we are within a CDATA section ']]>' marks an end of section.
				2343	*
				2344	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2345	*/
				2346
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2347	static void
				2348	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2349	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2350	int nbchar = 0;
				2351	int cur, l;
				2352
				2353	SHRINK;
				2354	cur = CUR_CHAR(l);
				2355	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2356	((cur != '&') \|\| (ctxt->token == '&')) &&
				2357	(IS_CHAR(cur))) {
				2358	COPY_BUF(l,buf,nbchar,cur);
				2359	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2360	/*
				2361	* Ok the segment is to be consumed as chars.
				2362	*/
				2363	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2364	if (areBlanks(ctxt, buf, nbchar)) {
				2365	if (ctxt->sax->ignorableWhitespace != NULL)
				2366	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2367	buf, nbchar);
				2368	} else {
				2369	htmlCheckParagraph(ctxt);
				2370	if (ctxt->sax->characters != NULL)
				2371	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2372	}
				2373	}
				2374	nbchar = 0;
				2375	}
				2376	NEXTL(l);
				2377	cur = CUR_CHAR(l);
				2378	}
				2379	if (nbchar != 0) {
				2380	/*
				2381	* Ok the segment is to be consumed as chars.
				2382	*/
				2383	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2384	if (areBlanks(ctxt, buf, nbchar)) {
				2385	if (ctxt->sax->ignorableWhitespace != NULL)
				2386	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2387	} else {
				2388	htmlCheckParagraph(ctxt);
				2389	if (ctxt->sax->characters != NULL)
				2390	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2391	}
				2392	}
				2393	}
				2394	}
				2395
				2396	/**
				2397	* htmlParseExternalID:
				2398	* @ctxt: an HTML parser context
				2399	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2400	*
				2401	* Parse an External ID or a Public ID
				2402	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2403	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2404	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2405	*
				2406	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2407	*
				2408	* Returns the function returns SystemLiteral and in the second
				2409	* case publicID receives PubidLiteral, is strict is off
				2410	* it is possible to return NULL and have publicID set.
				2411	*/
				2412
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2413	static xmlChar *
				2414	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2415	xmlChar *URI = NULL;
				2416
				2417	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2418	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2419	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2420	SKIP(6);
				2421	if (!IS_BLANK(CUR)) {
				2422	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2423	ctxt->sax->error(ctxt->userData,
				2424	"Space required after 'SYSTEM'\n");
				2425	ctxt->wellFormed = 0;
				2426	}
				2427	SKIP_BLANKS;
				2428	URI = htmlParseSystemLiteral(ctxt);
				2429	if (URI == NULL) {
				2430	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2431	ctxt->sax->error(ctxt->userData,
				2432	"htmlParseExternalID: SYSTEM, no URI\n");
				2433	ctxt->wellFormed = 0;
				2434	}
				2435	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2436	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2437	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2438	SKIP(6);
				2439	if (!IS_BLANK(CUR)) {
				2440	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2441	ctxt->sax->error(ctxt->userData,
				2442	"Space required after 'PUBLIC'\n");
				2443	ctxt->wellFormed = 0;
				2444	}
				2445	SKIP_BLANKS;
				2446	*publicID = htmlParsePubidLiteral(ctxt);
				2447	if (*publicID == NULL) {
				2448	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2449	ctxt->sax->error(ctxt->userData,
				2450	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2451	ctxt->wellFormed = 0;
				2452	}
				2453	SKIP_BLANKS;
				2454	if ((CUR == '"') \|\| (CUR == '\'')) {
				2455	URI = htmlParseSystemLiteral(ctxt);
				2456	}
				2457	}
				2458	return(URI);
				2459	}
				2460
				2461	/**
				2462	* htmlParseComment:
				2463	* @ctxt: an HTML parser context
				2464	*
				2465	* Parse an XML (SGML) comment <!-- .... -->
				2466	*
				2467	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2468	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2469	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2470	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2471	xmlChar *buf = NULL;
				2472	int len;
				2473	int size = HTML_PARSER_BUFFER_SIZE;
				2474	int q, ql;
				2475	int r, rl;
				2476	int cur, l;
				2477	xmlParserInputState state;
				2478
				2479	/*
				2480	* Check that there is a comment right here.
				2481	*/
				2482	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2483	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2484
				2485	state = ctxt->instate;
				2486	ctxt->instate = XML_PARSER_COMMENT;
				2487	SHRINK;
				2488	SKIP(4);
				2489	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2490	if (buf == NULL) {
				2491	xmlGenericError(xmlGenericErrorContext,
				2492	"malloc of %d byte failed\n", size);
				2493	ctxt->instate = state;
				2494	return;
				2495	}
				2496	q = CUR_CHAR(ql);
				2497	NEXTL(ql);
				2498	r = CUR_CHAR(rl);
				2499	NEXTL(rl);
				2500	cur = CUR_CHAR(l);
				2501	len = 0;
				2502	while (IS_CHAR(cur) &&
				2503	((cur != '>') \|\|
				2504	(r != '-') \|\| (q != '-'))) {
				2505	if (len + 5 >= size) {
				2506	size *= 2;
				2507	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2508	if (buf == NULL) {
				2509	xmlGenericError(xmlGenericErrorContext,
				2510	"realloc of %d byte failed\n", size);
				2511	ctxt->instate = state;
				2512	return;
				2513	}
				2514	}
				2515	COPY_BUF(ql,buf,len,q);
				2516	q = r;
				2517	ql = rl;
				2518	r = cur;
				2519	rl = l;
				2520	NEXTL(l);
				2521	cur = CUR_CHAR(l);
				2522	if (cur == 0) {
				2523	SHRINK;
				2524	GROW;
				2525	cur = CUR_CHAR(l);
				2526	}
				2527	}
				2528	buf[len] = 0;
				2529	if (!IS_CHAR(cur)) {
				2530	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2531	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2532	ctxt->sax->error(ctxt->userData,
				2533	"Comment not terminated \n<!--%.50s\n", buf);
				2534	ctxt->wellFormed = 0;
				2535	xmlFree(buf);
				2536	} else {
				2537	NEXT;
				2538	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2539	(!ctxt->disableSAX))
				2540	ctxt->sax->comment(ctxt->userData, buf);
				2541	xmlFree(buf);
				2542	}
				2543	ctxt->instate = state;
				2544	}
				2545
				2546	/**
				2547	* htmlParseCharRef:
				2548	* @ctxt: an HTML parser context
				2549	*
				2550	* parse Reference declarations
				2551	*
				2552	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2553	* '&#x' [0-9a-fA-F]+ ';'
				2554	*
				2555	* Returns the value parsed (as an int)
				2556	*/
				2557	int
				2558	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2559	int val = 0;
				2560
				2561	if ((CUR == '&') && (NXT(1) == '#') &&
				2562	(NXT(2) == 'x')) {
				2563	SKIP(3);
				2564	while (CUR != ';') {
				2565	if ((CUR >= '0') && (CUR <= '9'))
				2566	val = val * 16 + (CUR - '0');
				2567	else if ((CUR >= 'a') && (CUR <= 'f'))
				2568	val = val * 16 + (CUR - 'a') + 10;
				2569	else if ((CUR >= 'A') && (CUR <= 'F'))
				2570	val = val * 16 + (CUR - 'A') + 10;
				2571	else {
				2572	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2573	ctxt->sax->error(ctxt->userData,
				2574	"htmlParseCharRef: invalid hexadecimal value\n");
				2575	ctxt->wellFormed = 0;
				2576	return(0);
				2577	}
				2578	NEXT;
				2579	}
				2580	if (CUR == ';')
				2581	NEXT;
				2582	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2583	SKIP(2);
				2584	while (CUR != ';') {
				2585	if ((CUR >= '0') && (CUR <= '9'))
				2586	val = val * 10 + (CUR - '0');
				2587	else {
				2588	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2589	ctxt->sax->error(ctxt->userData,
				2590	"htmlParseCharRef: invalid decimal value\n");
				2591	ctxt->wellFormed = 0;
				2592	return(0);
				2593	}
				2594	NEXT;
				2595	}
				2596	if (CUR == ';')
				2597	NEXT;
				2598	} else {
				2599	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2600	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2601	ctxt->wellFormed = 0;
				2602	}
				2603	/*
				2604	* Check the value IS_CHAR ...
				2605	*/
				2606	if (IS_CHAR(val)) {
				2607	return(val);
				2608	} else {
				2609	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2610	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2611	val);
				2612	ctxt->wellFormed = 0;
				2613	}
				2614	return(0);
				2615	}
				2616
				2617
				2618	/**
				2619	* htmlParseDocTypeDecl :
				2620	* @ctxt: an HTML parser context
				2621	*
				2622	* parse a DOCTYPE declaration
				2623	*
				2624	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2625	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2626	*/
				2627
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2628	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2629	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2630	xmlChar *name;
				2631	xmlChar *ExternalID = NULL;
				2632	xmlChar *URI = NULL;
				2633
				2634	/*
				2635	* We know that '<!DOCTYPE' has been detected.
				2636	*/
				2637	SKIP(9);
				2638
				2639	SKIP_BLANKS;
				2640
				2641	/*
				2642	* Parse the DOCTYPE name.
				2643	*/
				2644	name = htmlParseName(ctxt);
				2645	if (name == NULL) {
				2646	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2647	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2648	ctxt->wellFormed = 0;
				2649	}
				2650	/*
				2651	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2652	*/
				2653
				2654	SKIP_BLANKS;
				2655
				2656	/*
				2657	* Check for SystemID and ExternalID
				2658	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2659	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2660	SKIP_BLANKS;
				2661
				2662	/*
				2663	* We should be at the end of the DOCTYPE declaration.
				2664	*/
				2665	if (CUR != '>') {
				2666	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2667	ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
				2668	ctxt->wellFormed = 0;
				2669	/* We shouldn't try to resynchronize ... */
				2670	}
				2671	NEXT;
				2672
				2673	/*
				2674	* Create or update the document accordingly to the DOCTYPE
				2675	*/
				2676	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2677	(!ctxt->disableSAX))
				2678	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2679
				2680	/*
				2681	* Cleanup, since we don't use all those identifiers
				2682	*/
				2683	if (URI != NULL) xmlFree(URI);
				2684	if (ExternalID != NULL) xmlFree(ExternalID);
				2685	if (name != NULL) xmlFree(name);
				2686	}
				2687
				2688	/**
				2689	* htmlParseAttribute:
				2690	* @ctxt: an HTML parser context
				2691	* @value: a xmlChar ** used to store the value of the attribute
				2692	*
				2693	* parse an attribute
				2694	*
				2695	* [41] Attribute ::= Name Eq AttValue
				2696	*
				2697	* [25] Eq ::= S? '=' S?
				2698	*
				2699	* With namespace:
				2700	*
				2701	* [NS 11] Attribute ::= QName Eq AttValue
				2702	*
				2703	* Also the case QName == xmlns:??? is handled independently as a namespace
				2704	* definition.
				2705	*
				2706	* Returns the attribute name, and the value in *value.
				2707	*/
				2708
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2709	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2710	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2711	xmlChar name, val = NULL;
				2712
				2713	*value = NULL;
				2714	name = htmlParseHTMLName(ctxt);
				2715	if (name == NULL) {
				2716	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2717	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2718	ctxt->wellFormed = 0;
				2719	return(NULL);
				2720	}
				2721
				2722	/*
				2723	* read the value
				2724	*/
				2725	SKIP_BLANKS;
				2726	if (CUR == '=') {
				2727	NEXT;
				2728	SKIP_BLANKS;
				2729	val = htmlParseAttValue(ctxt);
				2730	/******
				2731	} else {
				2732	* TODO : some attribute must have values, some may not
				2733	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2734	ctxt->sax->warning(ctxt->userData,
				2735	"No value for attribute %s\n", name); */
				2736	}
				2737
				2738	*value = val;
				2739	return(name);
				2740	}
				2741
				2742	/**
				2743	* htmlCheckEncoding:
				2744	* @ctxt: an HTML parser context
				2745	* @attvalue: the attribute value
				2746	*
				2747	* Checks an http-equiv attribute from a Meta tag to detect
				2748	* the encoding
				2749	* If a new encoding is detected the parser is switched to decode
				2750	* it and pass UTF8
				2751	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2752	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2753	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2754	const xmlChar *encoding;
				2755
				2756	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2757	return;
				2758
				2759	/* do not change encoding */
				2760	if (ctxt->input->encoding != NULL)
				2761	return;
				2762
				2763	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2764	if (encoding != NULL) {
				2765	encoding += 8;
				2766	} else {
				2767	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2768	if (encoding != NULL)
				2769	encoding += 9;
				2770	}
				2771	if (encoding != NULL) {
				2772	xmlCharEncoding enc;
				2773	xmlCharEncodingHandlerPtr handler;
				2774
				2775	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2776
				2777	if (ctxt->input->encoding != NULL)
				2778	xmlFree((xmlChar *) ctxt->input->encoding);
				2779	ctxt->input->encoding = xmlStrdup(encoding);
				2780
				2781	enc = xmlParseCharEncoding((const char *) encoding);
				2782	/*
				2783	* registered set of known encodings
				2784	*/
				2785	if (enc != XML_CHAR_ENCODING_ERROR) {
				2786	xmlSwitchEncoding(ctxt, enc);
				2787	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2788	} else {
				2789	/*
				2790	* fallback for unknown encodings
				2791	*/
				2792	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2793	if (handler != NULL) {
				2794	xmlSwitchToEncoding(ctxt, handler);
				2795	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2796	} else {
				2797	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2798	}
				2799	}
				2800
				2801	if ((ctxt->input->buf != NULL) &&
				2802	(ctxt->input->buf->encoder != NULL) &&
				2803	(ctxt->input->buf->raw != NULL) &&
				2804	(ctxt->input->buf->buffer != NULL)) {
				2805	int nbchars;
				2806	int processed;
				2807
				2808	/*
				2809	* convert as much as possible to the parser reading buffer.
				2810	*/
				2811	processed = ctxt->input->cur - ctxt->input->base;
				2812	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2813	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2814	ctxt->input->buf->buffer,
				2815	ctxt->input->buf->raw);
				2816	if (nbchars < 0) {
				2817	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2818	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2819	ctxt->sax->error(ctxt->userData,
				2820	"htmlCheckEncoding: encoder error\n");
				2821	}
				2822	ctxt->input->base =
				2823	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2824	}
				2825	}
				2826	}
				2827
				2828	/**
				2829	* htmlCheckMeta:
				2830	* @ctxt: an HTML parser context
				2831	* @atts: the attributes values
				2832	*
				2833	* Checks an attributes from a Meta tag
				2834	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2835	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2836	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2837	int i;
				2838	const xmlChar att, value;
				2839	int http = 0;
				2840	const xmlChar *content = NULL;
				2841
				2842	if ((ctxt == NULL) \|\| (atts == NULL))
				2843	return;
				2844
				2845	i = 0;
				2846	att = atts[i++];
				2847	while (att != NULL) {
				2848	value = atts[i++];
				2849	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2850	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2851	http = 1;
				2852	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2853	content = value;
				2854	att = atts[i++];
				2855	}
				2856	if ((http) && (content != NULL))
				2857	htmlCheckEncoding(ctxt, content);
				2858
				2859	}
				2860
				2861	/**
				2862	* htmlParseStartTag:
				2863	* @ctxt: an HTML parser context
				2864	*
				2865	* parse a start of tag either for rule element or
				2866	* EmptyElement. In both case we don't parse the tag closing chars.
				2867	*
				2868	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2869	*
				2870	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2871	*
				2872	* With namespace:
				2873	*
				2874	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2875	*
				2876	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2877	*
				2878	*/
				2879
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2880	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2881	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2882	xmlChar *name;
				2883	xmlChar *attname;
				2884	xmlChar *attvalue;
				2885	const xmlChar **atts = NULL;
				2886	int nbatts = 0;
				2887	int maxatts = 0;
				2888	int meta = 0;
				2889	int i;
				2890
				2891	if (CUR != '<') return;
				2892	NEXT;
				2893
				2894	GROW;
				2895	name = htmlParseHTMLName(ctxt);
				2896	if (name == NULL) {
				2897	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2898	ctxt->sax->error(ctxt->userData,
				2899	"htmlParseStartTag: invalid element name\n");
				2900	ctxt->wellFormed = 0;
				2901	/* Dump the bogus tag like browsers do */
				2902	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2903	NEXT;
				2904	return;
				2905	}
				2906	if (xmlStrEqual(name, BAD_CAST"meta"))
				2907	meta = 1;
				2908
				2909	/*
				2910	* Check for auto-closure of HTML elements.
				2911	*/
				2912	htmlAutoClose(ctxt, name);
				2913
				2914	/*
				2915	* Check for implied HTML elements.
				2916	*/
				2917	htmlCheckImplied(ctxt, name);
				2918
				2919	/*
				2920	* Avoid html at any level > 0, head at any level != 1
				2921	* or any attempt to recurse body
				2922	*/
				2923	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2924	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2925	ctxt->sax->error(ctxt->userData,
				2926	"htmlParseStartTag: misplaced <html> tag\n");
				2927	ctxt->wellFormed = 0;
				2928	xmlFree(name);
				2929	return;
				2930	}
				2931	if ((ctxt->nameNr != 1) &&
				2932	(xmlStrEqual(name, BAD_CAST"head"))) {
				2933	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2934	ctxt->sax->error(ctxt->userData,
				2935	"htmlParseStartTag: misplaced <head> tag\n");
				2936	ctxt->wellFormed = 0;
				2937	xmlFree(name);
				2938	return;
				2939	}
				2940	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2941	int indx;
				2942	for (indx = 0;indx < ctxt->nameNr;indx++) {
				2943	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2944	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2945	ctxt->sax->error(ctxt->userData,
				2946	"htmlParseStartTag: misplaced <body> tag\n");
				2947	ctxt->wellFormed = 0;
				2948	xmlFree(name);
				2949	return;
				2950	}
				2951	}
				2952	}
				2953
				2954	/*
				2955	* Now parse the attributes, it ends up with the ending
				2956	*
				2957	* (S Attribute)* S?
				2958	*/
				2959	SKIP_BLANKS;
				2960	while ((IS_CHAR(CUR)) &&
				2961	(CUR != '>') &&
				2962	((CUR != '/') \|\| (NXT(1) != '>'))) {
				2963	long cons = ctxt->nbChars;
				2964
				2965	GROW;
				2966	attname = htmlParseAttribute(ctxt, &attvalue);
				2967	if (attname != NULL) {
				2968
				2969	/*
				2970	* Well formedness requires at most one declaration of an attribute
				2971	*/
				2972	for (i = 0; i < nbatts;i += 2) {
				2973	if (xmlStrEqual(atts[i], attname)) {
				2974	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2975	ctxt->sax->error(ctxt->userData,
				2976	"Attribute %s redefined\n",
				2977	attname);
				2978	ctxt->wellFormed = 0;
				2979	xmlFree(attname);
				2980	if (attvalue != NULL)
				2981	xmlFree(attvalue);
				2982	goto failed;
				2983	}
				2984	}
				2985
				2986	/*
				2987	* Add the pair to atts
				2988	*/
				2989	if (atts == NULL) {
				2990	maxatts = 10;
				2991	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				2992	if (atts == NULL) {
				2993	xmlGenericError(xmlGenericErrorContext,
				2994	"malloc of %ld byte failed\n",
				2995	maxatts * (long)sizeof(xmlChar *));
				2996	if (name != NULL) xmlFree(name);
				2997	return;
				2998	}
				2999	} else if (nbatts + 4 > maxatts) {
				3000	maxatts *= 2;
				3001	atts = (const xmlChar *) xmlRealloc((void ) atts,
				3002	maxatts * sizeof(xmlChar *));
				3003	if (atts == NULL) {
				3004	xmlGenericError(xmlGenericErrorContext,
				3005	"realloc of %ld byte failed\n",
				3006	maxatts * (long)sizeof(xmlChar *));
				3007	if (name != NULL) xmlFree(name);
				3008	return;
				3009	}
				3010	}
				3011	atts[nbatts++] = attname;
				3012	atts[nbatts++] = attvalue;
				3013	atts[nbatts] = NULL;
				3014	atts[nbatts + 1] = NULL;
				3015	}
				3016	else {
				3017	/* Dump the bogus attribute string up to the next blank or
				3018	* the end of the tag. */
				3019	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				3020	&& ((CUR != '/') \|\| (NXT(1) != '>')))
				3021	NEXT;
				3022	}
				3023
				3024	failed:
				3025	SKIP_BLANKS;
				3026	if (cons == ctxt->nbChars) {
				3027	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3028	ctxt->sax->error(ctxt->userData,
				3029	"htmlParseStartTag: problem parsing attributes\n");
				3030	ctxt->wellFormed = 0;
				3031	break;
				3032	}
				3033	}
				3034
				3035	/*
				3036	* Handle specific association to the META tag
				3037	*/
				3038	if (meta)
				3039	htmlCheckMeta(ctxt, atts);
				3040
				3041	/*
				3042	* SAX: Start of Element !
				3043	*/
				3044	htmlnamePush(ctxt, xmlStrdup(name));
				3045	#ifdef DEBUG
				3046	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3047	#endif
				3048	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3049	ctxt->sax->startElement(ctxt->userData, name, atts);
				3050
				3051	if (atts != NULL) {
				3052	for (i = 0;i < nbatts;i++) {
				3053	if (atts[i] != NULL)
				3054	xmlFree((xmlChar *) atts[i]);
				3055	}
				3056	xmlFree((void *) atts);
				3057	}
				3058	if (name != NULL) xmlFree(name);
				3059	}
				3060
				3061	/**
				3062	* htmlParseEndTag:
				3063	* @ctxt: an HTML parser context
				3064	*
				3065	* parse an end of tag
				3066	*
				3067	* [42] ETag ::= '</' Name S? '>'
				3068	*
				3069	* With namespace
				3070	*
				3071	* [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3072	*
				3073	* Returns 1 if the current level should be closed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3074	*/
				3075
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3076	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3077	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3078	xmlChar *name;
				3079	xmlChar *oldname;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3080	int i, ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3081
				3082	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3083	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3084	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3085	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3086	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3087	}
				3088	SKIP(2);
				3089
				3090	name = htmlParseHTMLName(ctxt);
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3091	if (name == NULL) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3092
				3093	/*
				3094	* We should definitely be at the ending "S? '>'" part
				3095	*/
				3096	SKIP_BLANKS;
				3097	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3098	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3099	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3100	ctxt->wellFormed = 0;
				3101	} else
				3102	NEXT;
				3103
				3104	/*
				3105	* If the name read is not one of the element in the parsing stack
				3106	* then return, it's just an error.
				3107	*/
				3108	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3109	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3110	}
				3111	if (i < 0) {
				3112	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3113	ctxt->sax->error(ctxt->userData,
				3114	"Unexpected end tag : %s\n", name);
				3115	xmlFree(name);
				3116	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3117	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3118	}
				3119
				3120
				3121	/*
				3122	* Check for auto-closure of HTML elements.
				3123	*/
				3124
				3125	htmlAutoCloseOnClose(ctxt, name);
				3126
				3127	/*
				3128	* Well formedness constraints, opening and closing must match.
				3129	* With the exception that the autoclose may have popped stuff out
				3130	* of the stack.
				3131	*/
				3132	if (!xmlStrEqual(name, ctxt->name)) {
				3133	#ifdef DEBUG
				3134	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3135	#endif
				3136	if ((ctxt->name != NULL) &&
				3137	(!xmlStrEqual(ctxt->name, name))) {
				3138	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3139	ctxt->sax->error(ctxt->userData,
				3140	"Opening and ending tag mismatch: %s and %s\n",
				3141	name, ctxt->name);
				3142	ctxt->wellFormed = 0;
				3143	}
				3144	}
				3145
				3146	/*
				3147	* SAX: End of Tag
				3148	*/
				3149	oldname = ctxt->name;
				3150	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3151	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3152	ctxt->sax->endElement(ctxt->userData, name);
				3153	oldname = htmlnamePop(ctxt);
				3154	if (oldname != NULL) {
				3155	#ifdef DEBUG
				3156	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3157	#endif
				3158	xmlFree(oldname);
				3159	#ifdef DEBUG
				3160	} else {
				3161	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3162	#endif
				3163	}
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3164	ret = 1;
				3165	} else {
				3166	ret = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3167	}
				3168
				3169	if (name != NULL)
				3170	xmlFree(name);
				3171
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3172	return(ret);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3173	}
				3174
				3175
				3176	/**
				3177	* htmlParseReference:
				3178	* @ctxt: an HTML parser context
				3179	*
				3180	* parse and handle entity references in content,
				3181	* this will end-up in a call to character() since this is either a
				3182	* CharRef, or a predefined entity.
				3183	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3184	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3185	htmlParseReference(htmlParserCtxtPtr ctxt) {
				3186	htmlEntityDescPtr ent;
				3187	xmlChar out[6];
				3188	xmlChar *name;
				3189	if (CUR != '&') return;
				3190
				3191	if (NXT(1) == '#') {
				3192	unsigned int c;
				3193	int bits, i = 0;
				3194
				3195	c = htmlParseCharRef(ctxt);
				3196	if (c == 0)
				3197	return;
				3198
				3199	if (c < 0x80) { out[i++]= c; bits= -6; }
				3200	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3201	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3202	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3203
				3204	for ( ; bits >= 0; bits-= 6) {
				3205	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3206	}
				3207	out[i] = 0;
				3208
				3209	htmlCheckParagraph(ctxt);
				3210	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3211	ctxt->sax->characters(ctxt->userData, out, i);
				3212	} else {
				3213	ent = htmlParseEntityRef(ctxt, &name);
				3214	if (name == NULL) {
				3215	htmlCheckParagraph(ctxt);
				3216	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3217	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3218	return;
				3219	}
				3220	if ((ent == NULL) \|\| (ent->value <= 0)) {
				3221	htmlCheckParagraph(ctxt);
				3222	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3223	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3224	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3225	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3226	}
				3227	} else {
				3228	unsigned int c;
				3229	int bits, i = 0;
				3230
				3231	c = ent->value;
				3232	if (c < 0x80)
				3233	{ out[i++]= c; bits= -6; }
				3234	else if (c < 0x800)
				3235	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3236	else if (c < 0x10000)
				3237	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3238	else
				3239	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3240
				3241	for ( ; bits >= 0; bits-= 6) {
				3242	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3243	}
				3244	out[i] = 0;
				3245
				3246	htmlCheckParagraph(ctxt);
				3247	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3248	ctxt->sax->characters(ctxt->userData, out, i);
				3249	}
				3250	xmlFree(name);
				3251	}
				3252	}
				3253
				3254	/**
				3255	* htmlParseContent:
				3256	* @ctxt: an HTML parser context
				3257	* @name: the node name
				3258	*
				3259	* Parse a content: comment, sub-element, reference or text.
				3260	*
				3261	*/
				3262
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3263	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3264	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3265	xmlChar *currentNode;
				3266	int depth;
				3267
				3268	currentNode = xmlStrdup(ctxt->name);
				3269	depth = ctxt->nameNr;
				3270	while (1) {
				3271	long cons = ctxt->nbChars;
				3272
				3273	GROW;
				3274	/*
				3275	* Our tag or one of it's parent or children is ending.
				3276	*/
				3277	if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3278	if (htmlParseEndTag(ctxt) &&
				3279	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				3280	if (currentNode != NULL)
				3281	xmlFree(currentNode);
				3282	return;
				3283	}
				3284	continue; /* while */
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3285	}
				3286
				3287	/*
				3288	* Has this node been popped out during parsing of
				3289	* the next element
				3290	*/
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3291	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				3292	(!xmlStrEqual(currentNode, ctxt->name)))
				3293	{
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3294	if (currentNode != NULL) xmlFree(currentNode);
				3295	return;
				3296	}
				3297
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3298	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3299	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3300	/*
				3301	* Handle SCRIPT/STYLE separately
				3302	*/
				3303	htmlParseScript(ctxt);
				3304	} else {
				3305	/*
				3306	* Sometimes DOCTYPE arrives in the middle of the document
				3307	*/
				3308	if ((CUR == '<') && (NXT(1) == '!') &&
				3309	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3310	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3311	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3312	(UPP(8) == 'E')) {
				3313	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3314	ctxt->sax->error(ctxt->userData,
				3315	"Misplaced DOCTYPE declaration\n");
				3316	ctxt->wellFormed = 0;
				3317	htmlParseDocTypeDecl(ctxt);
				3318	}
				3319
				3320	/*
				3321	* First case : a comment
				3322	*/
				3323	if ((CUR == '<') && (NXT(1) == '!') &&
				3324	(NXT(2) == '-') && (NXT(3) == '-')) {
				3325	htmlParseComment(ctxt);
				3326	}
				3327
				3328	/*
				3329	* Second case : a sub-element.
				3330	*/
				3331	else if (CUR == '<') {
				3332	htmlParseElement(ctxt);
				3333	}
				3334
				3335	/*
				3336	* Third case : a reference. If if has not been resolved,
				3337	* parsing returns it's Name, create the node
				3338	*/
				3339	else if (CUR == '&') {
				3340	htmlParseReference(ctxt);
				3341	}
				3342
				3343	/*
				3344	* Fourth : end of the resource
				3345	*/
				3346	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3347	htmlAutoCloseOnEnd(ctxt);
				3348	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3349	}
				3350
				3351	/*
				3352	* Last case, text. Note that References are handled directly.
				3353	*/
				3354	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3355	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3356	}
				3357
				3358	if (cons == ctxt->nbChars) {
				3359	if (ctxt->node != NULL) {
				3360	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3361	ctxt->sax->error(ctxt->userData,
				3362	"detected an error in element content\n");
				3363	ctxt->wellFormed = 0;
				3364	}
				3365	break;
				3366	}
				3367	}
				3368	GROW;
				3369	}
				3370	if (currentNode != NULL) xmlFree(currentNode);
				3371	}
				3372
				3373	/**
				3374	* htmlParseElement:
				3375	* @ctxt: an HTML parser context
				3376	*
				3377	* parse an HTML element, this is highly recursive
				3378	*
				3379	* [39] element ::= EmptyElemTag \| STag content ETag
				3380	*
				3381	* [41] Attribute ::= Name Eq AttValue
				3382	*/
				3383
				3384	void
				3385	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3386	xmlChar *name;
				3387	xmlChar *currentNode = NULL;
				3388	htmlElemDescPtr info;
				3389	htmlParserNodeInfo node_info;
				3390	xmlChar *oldname;
				3391	int depth = ctxt->nameNr;
				3392
				3393	/* Capture start position */
				3394	if (ctxt->record_info) {
				3395	node_info.begin_pos = ctxt->input->consumed +
				3396	(CUR_PTR - ctxt->input->base);
				3397	node_info.begin_line = ctxt->input->line;
				3398	}
				3399
				3400	oldname = xmlStrdup(ctxt->name);
				3401	htmlParseStartTag(ctxt);
				3402	name = ctxt->name;
				3403	#ifdef DEBUG
				3404	if (oldname == NULL)
				3405	xmlGenericError(xmlGenericErrorContext,
				3406	"Start of element %s\n", name);
				3407	else if (name == NULL)
				3408	xmlGenericError(xmlGenericErrorContext,
				3409	"Start of element failed, was %s\n", oldname);
				3410	else
				3411	xmlGenericError(xmlGenericErrorContext,
				3412	"Start of element %s, was %s\n", name, oldname);
				3413	#endif
				3414	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3415	(name == NULL)) {
				3416	if (CUR == '>')
				3417	NEXT;
				3418	if (oldname != NULL)
				3419	xmlFree(oldname);
				3420	return;
				3421	}
				3422	if (oldname != NULL)
				3423	xmlFree(oldname);
				3424
				3425	/*
				3426	* Lookup the info for that element.
				3427	*/
				3428	info = htmlTagLookup(name);
				3429	if (info == NULL) {
				3430	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3431	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3432	name);
				3433	ctxt->wellFormed = 0;
				3434	} else if (info->depr) {
				3435	/***************************
				3436	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3437	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3438	name);
				3439	***************************/
				3440	}
				3441
				3442	/*
				3443	* Check for an Empty Element labelled the XML/SGML way
				3444	*/
				3445	if ((CUR == '/') && (NXT(1) == '>')) {
				3446	SKIP(2);
				3447	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3448	ctxt->sax->endElement(ctxt->userData, name);
				3449	oldname = htmlnamePop(ctxt);
				3450	#ifdef DEBUG
				3451	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3452	#endif
				3453	if (oldname != NULL)
				3454	xmlFree(oldname);
				3455	return;
				3456	}
				3457
				3458	if (CUR == '>') {
				3459	NEXT;
				3460	} else {
				3461	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3462	ctxt->sax->error(ctxt->userData,
				3463	"Couldn't find end of Start Tag %s\n",
				3464	name);
				3465	ctxt->wellFormed = 0;
				3466
				3467	/*
				3468	* end of parsing of this node.
				3469	*/
				3470	if (xmlStrEqual(name, ctxt->name)) {
				3471	nodePop(ctxt);
				3472	oldname = htmlnamePop(ctxt);
				3473	#ifdef DEBUG
				3474	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3475	#endif
				3476	if (oldname != NULL)
				3477	xmlFree(oldname);
				3478	}
				3479
				3480	/*
				3481	* Capture end position and add node
				3482	*/
				3483	if ( currentNode != NULL && ctxt->record_info ) {
				3484	node_info.end_pos = ctxt->input->consumed +
				3485	(CUR_PTR - ctxt->input->base);
				3486	node_info.end_line = ctxt->input->line;
				3487	node_info.node = ctxt->node;
				3488	xmlParserAddNodeInfo(ctxt, &node_info);
				3489	}
				3490	return;
				3491	}
				3492
				3493	/*
				3494	* Check for an Empty Element from DTD definition
				3495	*/
				3496	if ((info != NULL) && (info->empty)) {
				3497	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3498	ctxt->sax->endElement(ctxt->userData, name);
				3499	oldname = htmlnamePop(ctxt);
				3500	#ifdef DEBUG
				3501	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3502	#endif
				3503	if (oldname != NULL)
				3504	xmlFree(oldname);
				3505	return;
				3506	}
				3507
				3508	/*
				3509	* Parse the content of the element:
				3510	*/
				3511	currentNode = xmlStrdup(ctxt->name);
				3512	depth = ctxt->nameNr;
				3513	while (IS_CHAR(CUR)) {
				3514	htmlParseContent(ctxt);
				3515	if (ctxt->nameNr < depth) break;
				3516	}
				3517
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3518	/*
				3519	* Capture end position and add node
				3520	*/
				3521	if ( currentNode != NULL && ctxt->record_info ) {
				3522	node_info.end_pos = ctxt->input->consumed +
				3523	(CUR_PTR - ctxt->input->base);
				3524	node_info.end_line = ctxt->input->line;
				3525	node_info.node = ctxt->node;
				3526	xmlParserAddNodeInfo(ctxt, &node_info);
				3527	}
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3528	if (!IS_CHAR(CUR)) {
				3529	htmlAutoCloseOnEnd(ctxt);
				3530	}
				3531
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3532	if (currentNode != NULL)
				3533	xmlFree(currentNode);
				3534	}
				3535
				3536	/**
				3537	* htmlParseDocument :
				3538	* @ctxt: an HTML parser context
				3539	*
				3540	* parse an HTML document (and build a tree if using the standard SAX
				3541	* interface).
				3542	*
				3543	* Returns 0, -1 in case of error. the parser context is augmented
				3544	* as a result of the parsing.
				3545	*/
				3546
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3547	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3548	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3549	xmlDtdPtr dtd;
				3550
				3551	htmlDefaultSAXHandlerInit();
				3552	ctxt->html = 1;
				3553
				3554	GROW;
				3555	/*
				3556	* SAX: beginning of the document processing.
				3557	*/
				3558	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3559	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3560
				3561	/*
				3562	* Wipe out everything which is before the first '<'
				3563	*/
				3564	SKIP_BLANKS;
				3565	if (CUR == 0) {
				3566	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3567	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3568	ctxt->wellFormed = 0;
				3569	}
				3570
				3571	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3572	ctxt->sax->startDocument(ctxt->userData);
				3573
				3574
				3575	/*
				3576	* Parse possible comments before any content
				3577	*/
				3578	while ((CUR == '<') && (NXT(1) == '!') &&
				3579	(NXT(2) == '-') && (NXT(3) == '-')) {
				3580	htmlParseComment(ctxt);
				3581	SKIP_BLANKS;
				3582	}
				3583
				3584
				3585	/*
				3586	* Then possibly doc type declaration(s) and more Misc
				3587	* (doctypedecl Misc*)?
				3588	*/
				3589	if ((CUR == '<') && (NXT(1) == '!') &&
				3590	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3591	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3592	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3593	(UPP(8) == 'E')) {
				3594	htmlParseDocTypeDecl(ctxt);
				3595	}
				3596	SKIP_BLANKS;
				3597
				3598	/*
				3599	* Parse possible comments before any content
				3600	*/
				3601	while ((CUR == '<') && (NXT(1) == '!') &&
				3602	(NXT(2) == '-') && (NXT(3) == '-')) {
				3603	htmlParseComment(ctxt);
				3604	SKIP_BLANKS;
				3605	}
				3606
				3607	/*
				3608	* Time to start parsing the tree itself
				3609	*/
				3610	htmlParseContent(ctxt);
				3611
				3612	/*
				3613	* autoclose
				3614	*/
				3615	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3616	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3617
				3618
				3619	/*
				3620	* SAX: end of the document processing.
				3621	*/
				3622	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3623	ctxt->sax->endDocument(ctxt->userData);
				3624
				3625	if (ctxt->myDoc != NULL) {
				3626	dtd = xmlGetIntSubset(ctxt->myDoc);
				3627	if (dtd == NULL)
				3628	ctxt->myDoc->intSubset =
				3629	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3630	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3631	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3632	}
				3633	if (! ctxt->wellFormed) return(-1);
				3634	return(0);
				3635	}
				3636
				3637
				3638	/************************************************************************
				3639	* *
				3640	* Parser contexts handling *
				3641	* *
				3642	************************************************************************/
				3643
				3644	/**
				3645	* xmlInitParserCtxt:
				3646	* @ctxt: an HTML parser context
				3647	*
				3648	* Initialize a parser context
				3649	*/
				3650
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3651	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3652	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3653	{
				3654	htmlSAXHandler *sax;
				3655
				3656	if (ctxt == NULL) return;
				3657	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3658
				3659	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3660	if (sax == NULL) {
				3661	xmlGenericError(xmlGenericErrorContext,
				3662	"htmlInitParserCtxt: out of memory\n");
				3663	}
				3664	else
				3665	memset(sax, 0, sizeof(htmlSAXHandler));
				3666
				3667	/* Allocate the Input stack */
				3668	ctxt->inputTab = (htmlParserInputPtr *)
				3669	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3670	if (ctxt->inputTab == NULL) {
				3671	xmlGenericError(xmlGenericErrorContext,
				3672	"htmlInitParserCtxt: out of memory\n");
				3673	ctxt->inputNr = 0;
				3674	ctxt->inputMax = 0;
				3675	ctxt->input = NULL;
				3676	return;
				3677	}
				3678	ctxt->inputNr = 0;
				3679	ctxt->inputMax = 5;
				3680	ctxt->input = NULL;
				3681	ctxt->version = NULL;
				3682	ctxt->encoding = NULL;
				3683	ctxt->standalone = -1;
				3684	ctxt->instate = XML_PARSER_START;
				3685
				3686	/* Allocate the Node stack */
				3687	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3688	if (ctxt->nodeTab == NULL) {
				3689	xmlGenericError(xmlGenericErrorContext,
				3690	"htmlInitParserCtxt: out of memory\n");
				3691	ctxt->nodeNr = 0;
				3692	ctxt->nodeMax = 0;
				3693	ctxt->node = NULL;
				3694	ctxt->inputNr = 0;
				3695	ctxt->inputMax = 0;
				3696	ctxt->input = NULL;
				3697	return;
				3698	}
				3699	ctxt->nodeNr = 0;
				3700	ctxt->nodeMax = 10;
				3701	ctxt->node = NULL;
				3702
				3703	/* Allocate the Name stack */
				3704	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3705	if (ctxt->nameTab == NULL) {
				3706	xmlGenericError(xmlGenericErrorContext,
				3707	"htmlInitParserCtxt: out of memory\n");
				3708	ctxt->nameNr = 0;
				3709	ctxt->nameMax = 10;
				3710	ctxt->name = NULL;
				3711	ctxt->nodeNr = 0;
				3712	ctxt->nodeMax = 0;
				3713	ctxt->node = NULL;
				3714	ctxt->inputNr = 0;
				3715	ctxt->inputMax = 0;
				3716	ctxt->input = NULL;
				3717	return;
				3718	}
				3719	ctxt->nameNr = 0;
				3720	ctxt->nameMax = 10;
				3721	ctxt->name = NULL;
				3722
				3723	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3724	else {
				3725	ctxt->sax = sax;
				3726	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3727	}
				3728	ctxt->userData = ctxt;
				3729	ctxt->myDoc = NULL;
				3730	ctxt->wellFormed = 1;
				3731	ctxt->replaceEntities = 0;
				3732	ctxt->html = 1;
				3733	ctxt->record_info = 0;
				3734	ctxt->validate = 0;
				3735	ctxt->nbChars = 0;
				3736	ctxt->checkIndex = 0;
				3737	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3738	}
				3739
				3740	/**
				3741	* htmlFreeParserCtxt:
				3742	* @ctxt: an HTML parser context
				3743	*
				3744	* Free all the memory used by a parser context. However the parsed
				3745	* document in ctxt->myDoc is not freed.
				3746	*/
				3747
				3748	void
				3749	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3750	{
				3751	xmlFreeParserCtxt(ctxt);
				3752	}
				3753
				3754	/**
				3755	* htmlCreateDocParserCtxt :
				3756	* @cur: a pointer to an array of xmlChar
				3757	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3758	*
				3759	* Create a parser context for an HTML document.
				3760	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3761	* TODO: check the need to add encoding handling there
				3762	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3763	* Returns the new parser context or NULL
				3764	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3765	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3766	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3767	htmlParserCtxtPtr ctxt;
				3768	htmlParserInputPtr input;
				3769	/* htmlCharEncoding enc; */
				3770
				3771	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				3772	if (ctxt == NULL) {
				3773	perror("malloc");
				3774	return(NULL);
				3775	}
				3776	htmlInitParserCtxt(ctxt);
				3777	input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				3778	if (input == NULL) {
				3779	perror("malloc");
				3780	xmlFree(ctxt);
				3781	return(NULL);
				3782	}
				3783	memset(input, 0, sizeof(htmlParserInput));
				3784
				3785	input->line = 1;
				3786	input->col = 1;
				3787	input->base = cur;
				3788	input->cur = cur;
				3789
				3790	inputPush(ctxt, input);
				3791	return(ctxt);
				3792	}
				3793
				3794	/************************************************************************
				3795	* *
				3796	* Progressive parsing interfaces *
				3797	* *
				3798	************************************************************************/
				3799
				3800	/**
				3801	* htmlParseLookupSequence:
				3802	* @ctxt: an HTML parser context
				3803	* @first: the first char to lookup
				3804	* @next: the next char to lookup or zero
				3805	* @third: the next char to lookup or zero
				3806	*
				3807	* Try to find if a sequence (first, next, third) or just (first next) or
				3808	* (first) is available in the input stream.
				3809	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3810	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3811	* parser, do not use liberally.
				3812	* This is basically similar to xmlParseLookupSequence()
				3813	*
				3814	* Returns the index to the current parsing point if the full sequence
				3815	* is available, -1 otherwise.
				3816	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3817	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3818	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3819	xmlChar next, xmlChar third) {
				3820	int base, len;
				3821	htmlParserInputPtr in;
				3822	const xmlChar *buf;
				3823
				3824	in = ctxt->input;
				3825	if (in == NULL) return(-1);
				3826	base = in->cur - in->base;
				3827	if (base < 0) return(-1);
				3828	if (ctxt->checkIndex > base)
				3829	base = ctxt->checkIndex;
				3830	if (in->buf == NULL) {
				3831	buf = in->base;
				3832	len = in->length;
				3833	} else {
				3834	buf = in->buf->buffer->content;
				3835	len = in->buf->buffer->use;
				3836	}
				3837	/* take into account the sequence length */
				3838	if (third) len -= 2;
				3839	else if (next) len --;
				3840	for (;base < len;base++) {
				3841	if (buf[base] == first) {
				3842	if (third != 0) {
				3843	if ((buf[base + 1] != next) \|\|
				3844	(buf[base + 2] != third)) continue;
				3845	} else if (next != 0) {
				3846	if (buf[base + 1] != next) continue;
				3847	}
				3848	ctxt->checkIndex = 0;
				3849	#ifdef DEBUG_PUSH
				3850	if (next == 0)
				3851	xmlGenericError(xmlGenericErrorContext,
				3852	"HPP: lookup '%c' found at %d\n",
				3853	first, base);
				3854	else if (third == 0)
				3855	xmlGenericError(xmlGenericErrorContext,
				3856	"HPP: lookup '%c%c' found at %d\n",
				3857	first, next, base);
				3858	else
				3859	xmlGenericError(xmlGenericErrorContext,
				3860	"HPP: lookup '%c%c%c' found at %d\n",
				3861	first, next, third, base);
				3862	#endif
				3863	return(base - (in->cur - in->base));
				3864	}
				3865	}
				3866	ctxt->checkIndex = base;
				3867	#ifdef DEBUG_PUSH
				3868	if (next == 0)
				3869	xmlGenericError(xmlGenericErrorContext,
				3870	"HPP: lookup '%c' failed\n", first);
				3871	else if (third == 0)
				3872	xmlGenericError(xmlGenericErrorContext,
				3873	"HPP: lookup '%c%c' failed\n", first, next);
				3874	else
				3875	xmlGenericError(xmlGenericErrorContext,
				3876	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				3877	#endif
				3878	return(-1);
				3879	}
				3880
				3881	/**
				3882	* htmlParseTryOrFinish:
				3883	* @ctxt: an HTML parser context
				3884	* @terminate: last chunk indicator
				3885	*
				3886	* Try to progress on parsing
				3887	*
				3888	* Returns zero if no parsing was possible
				3889	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3890	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3891	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				3892	int ret = 0;
				3893	htmlParserInputPtr in;
				3894	int avail = 0;
				3895	xmlChar cur, next;
				3896
				3897	#ifdef DEBUG_PUSH
				3898	switch (ctxt->instate) {
				3899	case XML_PARSER_EOF:
				3900	xmlGenericError(xmlGenericErrorContext,
				3901	"HPP: try EOF\n"); break;
				3902	case XML_PARSER_START:
				3903	xmlGenericError(xmlGenericErrorContext,
				3904	"HPP: try START\n"); break;
				3905	case XML_PARSER_MISC:
				3906	xmlGenericError(xmlGenericErrorContext,
				3907	"HPP: try MISC\n");break;
				3908	case XML_PARSER_COMMENT:
				3909	xmlGenericError(xmlGenericErrorContext,
				3910	"HPP: try COMMENT\n");break;
				3911	case XML_PARSER_PROLOG:
				3912	xmlGenericError(xmlGenericErrorContext,
				3913	"HPP: try PROLOG\n");break;
				3914	case XML_PARSER_START_TAG:
				3915	xmlGenericError(xmlGenericErrorContext,
				3916	"HPP: try START_TAG\n");break;
				3917	case XML_PARSER_CONTENT:
				3918	xmlGenericError(xmlGenericErrorContext,
				3919	"HPP: try CONTENT\n");break;
				3920	case XML_PARSER_CDATA_SECTION:
				3921	xmlGenericError(xmlGenericErrorContext,
				3922	"HPP: try CDATA_SECTION\n");break;
				3923	case XML_PARSER_END_TAG:
				3924	xmlGenericError(xmlGenericErrorContext,
				3925	"HPP: try END_TAG\n");break;
				3926	case XML_PARSER_ENTITY_DECL:
				3927	xmlGenericError(xmlGenericErrorContext,
				3928	"HPP: try ENTITY_DECL\n");break;
				3929	case XML_PARSER_ENTITY_VALUE:
				3930	xmlGenericError(xmlGenericErrorContext,
				3931	"HPP: try ENTITY_VALUE\n");break;
				3932	case XML_PARSER_ATTRIBUTE_VALUE:
				3933	xmlGenericError(xmlGenericErrorContext,
				3934	"HPP: try ATTRIBUTE_VALUE\n");break;
				3935	case XML_PARSER_DTD:
				3936	xmlGenericError(xmlGenericErrorContext,
				3937	"HPP: try DTD\n");break;
				3938	case XML_PARSER_EPILOG:
				3939	xmlGenericError(xmlGenericErrorContext,
				3940	"HPP: try EPILOG\n");break;
				3941	case XML_PARSER_PI:
				3942	xmlGenericError(xmlGenericErrorContext,
				3943	"HPP: try PI\n");break;
				3944	case XML_PARSER_SYSTEM_LITERAL:
				3945	xmlGenericError(xmlGenericErrorContext,
				3946	"HPP: try SYSTEM_LITERAL\n");break;
				3947	}
				3948	#endif
				3949
				3950	while (1) {
				3951
				3952	in = ctxt->input;
				3953	if (in == NULL) break;
				3954	if (in->buf == NULL)
				3955	avail = in->length - (in->cur - in->base);
				3956	else
				3957	avail = in->buf->buffer->use - (in->cur - in->base);
				3958	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3959	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3960	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				3961	/*
				3962	* SAX: end of the document processing.
				3963	*/
				3964	ctxt->instate = XML_PARSER_EOF;
				3965	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3966	ctxt->sax->endDocument(ctxt->userData);
				3967	}
				3968	}
				3969	if (avail < 1)
				3970	goto done;
				3971	switch (ctxt->instate) {
				3972	case XML_PARSER_EOF:
				3973	/*
				3974	* Document parsing is done !
				3975	*/
				3976	goto done;
				3977	case XML_PARSER_START:
				3978	/*
				3979	* Very first chars read from the document flow.
				3980	*/
				3981	cur = in->cur[0];
				3982	if (IS_BLANK(cur)) {
				3983	SKIP_BLANKS;
				3984	if (in->buf == NULL)
				3985	avail = in->length - (in->cur - in->base);
				3986	else
				3987	avail = in->buf->buffer->use - (in->cur - in->base);
				3988	}
				3989	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3990	ctxt->sax->setDocumentLocator(ctxt->userData,
				3991	&xmlDefaultSAXLocator);
				3992	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				3993	(!ctxt->disableSAX))
				3994	ctxt->sax->startDocument(ctxt->userData);
				3995
				3996	cur = in->cur[0];
				3997	next = in->cur[1];
				3998	if ((cur == '<') && (next == '!') &&
				3999	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4000	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4001	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4002	(UPP(8) == 'E')) {
				4003	if ((!terminate) &&
				4004	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4005	goto done;
				4006	#ifdef DEBUG_PUSH
				4007	xmlGenericError(xmlGenericErrorContext,
				4008	"HPP: Parsing internal subset\n");
				4009	#endif
				4010	htmlParseDocTypeDecl(ctxt);
				4011	ctxt->instate = XML_PARSER_PROLOG;
				4012	#ifdef DEBUG_PUSH
				4013	xmlGenericError(xmlGenericErrorContext,
				4014	"HPP: entering PROLOG\n");
				4015	#endif
				4016	} else {
				4017	ctxt->instate = XML_PARSER_MISC;
				4018	}
				4019	#ifdef DEBUG_PUSH
				4020	xmlGenericError(xmlGenericErrorContext,
				4021	"HPP: entering MISC\n");
				4022	#endif
				4023	break;
				4024	case XML_PARSER_MISC:
				4025	SKIP_BLANKS;
				4026	if (in->buf == NULL)
				4027	avail = in->length - (in->cur - in->base);
				4028	else
				4029	avail = in->buf->buffer->use - (in->cur - in->base);
				4030	if (avail < 2)
				4031	goto done;
				4032	cur = in->cur[0];
				4033	next = in->cur[1];
				4034	if ((cur == '<') && (next == '!') &&
				4035	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4036	if ((!terminate) &&
				4037	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4038	goto done;
				4039	#ifdef DEBUG_PUSH
				4040	xmlGenericError(xmlGenericErrorContext,
				4041	"HPP: Parsing Comment\n");
				4042	#endif
				4043	htmlParseComment(ctxt);
				4044	ctxt->instate = XML_PARSER_MISC;
				4045	} else if ((cur == '<') && (next == '!') &&
				4046	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4047	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4048	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4049	(UPP(8) == 'E')) {
				4050	if ((!terminate) &&
				4051	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4052	goto done;
				4053	#ifdef DEBUG_PUSH
				4054	xmlGenericError(xmlGenericErrorContext,
				4055	"HPP: Parsing internal subset\n");
				4056	#endif
				4057	htmlParseDocTypeDecl(ctxt);
				4058	ctxt->instate = XML_PARSER_PROLOG;
				4059	#ifdef DEBUG_PUSH
				4060	xmlGenericError(xmlGenericErrorContext,
				4061	"HPP: entering PROLOG\n");
				4062	#endif
				4063	} else if ((cur == '<') && (next == '!') &&
				4064	(avail < 9)) {
				4065	goto done;
				4066	} else {
				4067	ctxt->instate = XML_PARSER_START_TAG;
				4068	#ifdef DEBUG_PUSH
				4069	xmlGenericError(xmlGenericErrorContext,
				4070	"HPP: entering START_TAG\n");
				4071	#endif
				4072	}
				4073	break;
				4074	case XML_PARSER_PROLOG:
				4075	SKIP_BLANKS;
				4076	if (in->buf == NULL)
				4077	avail = in->length - (in->cur - in->base);
				4078	else
				4079	avail = in->buf->buffer->use - (in->cur - in->base);
				4080	if (avail < 2)
				4081	goto done;
				4082	cur = in->cur[0];
				4083	next = in->cur[1];
				4084	if ((cur == '<') && (next == '!') &&
				4085	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4086	if ((!terminate) &&
				4087	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4088	goto done;
				4089	#ifdef DEBUG_PUSH
				4090	xmlGenericError(xmlGenericErrorContext,
				4091	"HPP: Parsing Comment\n");
				4092	#endif
				4093	htmlParseComment(ctxt);
				4094	ctxt->instate = XML_PARSER_PROLOG;
				4095	} else if ((cur == '<') && (next == '!') &&
				4096	(avail < 4)) {
				4097	goto done;
				4098	} else {
				4099	ctxt->instate = XML_PARSER_START_TAG;
				4100	#ifdef DEBUG_PUSH
				4101	xmlGenericError(xmlGenericErrorContext,
				4102	"HPP: entering START_TAG\n");
				4103	#endif
				4104	}
				4105	break;
				4106	case XML_PARSER_EPILOG:
				4107	if (in->buf == NULL)
				4108	avail = in->length - (in->cur - in->base);
				4109	else
				4110	avail = in->buf->buffer->use - (in->cur - in->base);
				4111	if (avail < 1)
				4112	goto done;
				4113	cur = in->cur[0];
				4114	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4115	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4116	goto done;
				4117	}
				4118	if (avail < 2)
				4119	goto done;
				4120	next = in->cur[1];
				4121	if ((cur == '<') && (next == '!') &&
				4122	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4123	if ((!terminate) &&
				4124	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4125	goto done;
				4126	#ifdef DEBUG_PUSH
				4127	xmlGenericError(xmlGenericErrorContext,
				4128	"HPP: Parsing Comment\n");
				4129	#endif
				4130	htmlParseComment(ctxt);
				4131	ctxt->instate = XML_PARSER_EPILOG;
				4132	} else if ((cur == '<') && (next == '!') &&
				4133	(avail < 4)) {
				4134	goto done;
				4135	} else {
				4136	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4137	ctxt->wellFormed = 0;
				4138	ctxt->instate = XML_PARSER_EOF;
				4139	#ifdef DEBUG_PUSH
				4140	xmlGenericError(xmlGenericErrorContext,
				4141	"HPP: entering EOF\n");
				4142	#endif
				4143	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4144	ctxt->sax->endDocument(ctxt->userData);
				4145	goto done;
				4146	}
				4147	break;
				4148	case XML_PARSER_START_TAG: {
				4149	xmlChar name, oldname;
				4150	int depth = ctxt->nameNr;
				4151	htmlElemDescPtr info;
				4152
				4153	if (avail < 2)
				4154	goto done;
				4155	cur = in->cur[0];
				4156	if (cur != '<') {
				4157	ctxt->instate = XML_PARSER_CONTENT;
				4158	#ifdef DEBUG_PUSH
				4159	xmlGenericError(xmlGenericErrorContext,
				4160	"HPP: entering CONTENT\n");
				4161	#endif
				4162	break;
				4163	}
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	4164	if (in->cur[1] == '/') {
				4165	ctxt->instate = XML_PARSER_END_TAG;
				4166	ctxt->checkIndex = 0;
				4167	#ifdef DEBUG_PUSH
				4168	xmlGenericError(xmlGenericErrorContext,
				4169	"HPP: entering END_TAG\n");
				4170	#endif
				4171	break;
				4172	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4173	if ((!terminate) &&
				4174	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4175	goto done;
				4176
				4177	oldname = xmlStrdup(ctxt->name);
				4178	htmlParseStartTag(ctxt);
				4179	name = ctxt->name;
				4180	#ifdef DEBUG
				4181	if (oldname == NULL)
				4182	xmlGenericError(xmlGenericErrorContext,
				4183	"Start of element %s\n", name);
				4184	else if (name == NULL)
				4185	xmlGenericError(xmlGenericErrorContext,
				4186	"Start of element failed, was %s\n",
				4187	oldname);
				4188	else
				4189	xmlGenericError(xmlGenericErrorContext,
				4190	"Start of element %s, was %s\n",
				4191	name, oldname);
				4192	#endif
				4193	if (((depth == ctxt->nameNr) &&
				4194	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4195	(name == NULL)) {
				4196	if (CUR == '>')
				4197	NEXT;
				4198	if (oldname != NULL)
				4199	xmlFree(oldname);
				4200	break;
				4201	}
				4202	if (oldname != NULL)
				4203	xmlFree(oldname);
				4204
				4205	/*
				4206	* Lookup the info for that element.
				4207	*/
				4208	info = htmlTagLookup(name);
				4209	if (info == NULL) {
				4210	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4211	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4212	name);
				4213	ctxt->wellFormed = 0;
				4214	} else if (info->depr) {
				4215	/***************************
				4216	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4217	ctxt->sax->warning(ctxt->userData,
				4218	"Tag %s is deprecated\n",
				4219	name);
				4220	***************************/
				4221	}
				4222
				4223	/*
				4224	* Check for an Empty Element labelled the XML/SGML way
				4225	*/
				4226	if ((CUR == '/') && (NXT(1) == '>')) {
				4227	SKIP(2);
				4228	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4229	ctxt->sax->endElement(ctxt->userData, name);
				4230	oldname = htmlnamePop(ctxt);
				4231	#ifdef DEBUG
				4232	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4233	oldname);
				4234	#endif
				4235	if (oldname != NULL)
				4236	xmlFree(oldname);
				4237	ctxt->instate = XML_PARSER_CONTENT;
				4238	#ifdef DEBUG_PUSH
				4239	xmlGenericError(xmlGenericErrorContext,
				4240	"HPP: entering CONTENT\n");
				4241	#endif
				4242	break;
				4243	}
				4244
				4245	if (CUR == '>') {
				4246	NEXT;
				4247	} else {
				4248	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4249	ctxt->sax->error(ctxt->userData,
				4250	"Couldn't find end of Start Tag %s\n",
				4251	name);
				4252	ctxt->wellFormed = 0;
				4253
				4254	/*
				4255	* end of parsing of this node.
				4256	*/
				4257	if (xmlStrEqual(name, ctxt->name)) {
				4258	nodePop(ctxt);
				4259	oldname = htmlnamePop(ctxt);
				4260	#ifdef DEBUG
				4261	xmlGenericError(xmlGenericErrorContext,
				4262	"End of start tag problem: popping out %s\n", oldname);
				4263	#endif
				4264	if (oldname != NULL)
				4265	xmlFree(oldname);
				4266	}
				4267
				4268	ctxt->instate = XML_PARSER_CONTENT;
				4269	#ifdef DEBUG_PUSH
				4270	xmlGenericError(xmlGenericErrorContext,
				4271	"HPP: entering CONTENT\n");
				4272	#endif
				4273	break;
				4274	}
				4275
				4276	/*
				4277	* Check for an Empty Element from DTD definition
				4278	*/
				4279	if ((info != NULL) && (info->empty)) {
				4280	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4281	ctxt->sax->endElement(ctxt->userData, name);
				4282	oldname = htmlnamePop(ctxt);
				4283	#ifdef DEBUG
				4284	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4285	#endif
				4286	if (oldname != NULL)
				4287	xmlFree(oldname);
				4288	}
				4289	ctxt->instate = XML_PARSER_CONTENT;
				4290	#ifdef DEBUG_PUSH
				4291	xmlGenericError(xmlGenericErrorContext,
				4292	"HPP: entering CONTENT\n");
				4293	#endif
				4294	break;
				4295	}
				4296	case XML_PARSER_CONTENT: {
				4297	long cons;
				4298	/*
				4299	* Handle preparsed entities and charRef
				4300	*/
				4301	if (ctxt->token != 0) {
				4302	xmlChar chr[2] = { 0 , 0 } ;
				4303
				4304	chr[0] = (xmlChar) ctxt->token;
				4305	htmlCheckParagraph(ctxt);
				4306	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4307	ctxt->sax->characters(ctxt->userData, chr, 1);
				4308	ctxt->token = 0;
				4309	ctxt->checkIndex = 0;
				4310	}
				4311	if ((avail == 1) && (terminate)) {
				4312	cur = in->cur[0];
				4313	if ((cur != '<') && (cur != '&')) {
				4314	if (ctxt->sax != NULL) {
				4315	if (IS_BLANK(cur)) {
				4316	if (ctxt->sax->ignorableWhitespace != NULL)
				4317	ctxt->sax->ignorableWhitespace(
				4318	ctxt->userData, &cur, 1);
				4319	} else {
				4320	htmlCheckParagraph(ctxt);
				4321	if (ctxt->sax->characters != NULL)
				4322	ctxt->sax->characters(
				4323	ctxt->userData, &cur, 1);
				4324	}
				4325	}
				4326	ctxt->token = 0;
				4327	ctxt->checkIndex = 0;
				4328	NEXT;
				4329	}
				4330	break;
				4331	}
				4332	if (avail < 2)
				4333	goto done;
				4334	cur = in->cur[0];
				4335	next = in->cur[1];
				4336	cons = ctxt->nbChars;
				4337	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4338	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4339	/*
				4340	* Handle SCRIPT/STYLE separately
				4341	*/
				4342	if ((!terminate) &&
				4343	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4344	goto done;
				4345	htmlParseScript(ctxt);
				4346	if ((cur == '<') && (next == '/')) {
				4347	ctxt->instate = XML_PARSER_END_TAG;
				4348	ctxt->checkIndex = 0;
				4349	#ifdef DEBUG_PUSH
				4350	xmlGenericError(xmlGenericErrorContext,
				4351	"HPP: entering END_TAG\n");
				4352	#endif
				4353	break;
				4354	}
				4355	} else {
				4356	/*
				4357	* Sometimes DOCTYPE arrives in the middle of the document
				4358	*/
				4359	if ((cur == '<') && (next == '!') &&
				4360	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4361	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4362	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4363	(UPP(8) == 'E')) {
				4364	if ((!terminate) &&
				4365	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4366	goto done;
				4367	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4368	ctxt->sax->error(ctxt->userData,
				4369	"Misplaced DOCTYPE declaration\n");
				4370	ctxt->wellFormed = 0;
				4371	htmlParseDocTypeDecl(ctxt);
				4372	} else if ((cur == '<') && (next == '!') &&
				4373	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4374	if ((!terminate) &&
				4375	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4376	goto done;
				4377	#ifdef DEBUG_PUSH
				4378	xmlGenericError(xmlGenericErrorContext,
				4379	"HPP: Parsing Comment\n");
				4380	#endif
				4381	htmlParseComment(ctxt);
				4382	ctxt->instate = XML_PARSER_CONTENT;
				4383	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4384	goto done;
				4385	} else if ((cur == '<') && (next == '/')) {
				4386	ctxt->instate = XML_PARSER_END_TAG;
				4387	ctxt->checkIndex = 0;
				4388	#ifdef DEBUG_PUSH
				4389	xmlGenericError(xmlGenericErrorContext,
				4390	"HPP: entering END_TAG\n");
				4391	#endif
				4392	break;
				4393	} else if (cur == '<') {
				4394	ctxt->instate = XML_PARSER_START_TAG;
				4395	ctxt->checkIndex = 0;
				4396	#ifdef DEBUG_PUSH
				4397	xmlGenericError(xmlGenericErrorContext,
				4398	"HPP: entering START_TAG\n");
				4399	#endif
				4400	break;
				4401	} else if (cur == '&') {
				4402	if ((!terminate) &&
				4403	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4404	goto done;
				4405	#ifdef DEBUG_PUSH
				4406	xmlGenericError(xmlGenericErrorContext,
				4407	"HPP: Parsing Reference\n");
				4408	#endif
				4409	/* TODO: check generation of subtrees if noent !!! */
				4410	htmlParseReference(ctxt);
				4411	} else {
				4412	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4413	/*
				4414	* Goal of the following test is :
				4415	* - minimize calls to the SAX 'character' callback
				4416	* when they are mergeable
				4417	*/
				4418	if ((ctxt->inputNr == 1) &&
				4419	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4420	if ((!terminate) &&
				4421	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4422	goto done;
				4423	}
				4424	ctxt->checkIndex = 0;
				4425	#ifdef DEBUG_PUSH
				4426	xmlGenericError(xmlGenericErrorContext,
				4427	"HPP: Parsing char data\n");
				4428	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4429	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4430	}
				4431	}
				4432	if (cons == ctxt->nbChars) {
				4433	if (ctxt->node != NULL) {
				4434	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4435	ctxt->sax->error(ctxt->userData,
				4436	"detected an error in element content\n");
				4437	ctxt->wellFormed = 0;
				4438	}
				4439	NEXT;
				4440	break;
				4441	}
				4442
				4443	break;
				4444	}
				4445	case XML_PARSER_END_TAG:
				4446	if (avail < 2)
				4447	goto done;
				4448	if ((!terminate) &&
				4449	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4450	goto done;
				4451	htmlParseEndTag(ctxt);
				4452	if (ctxt->nameNr == 0) {
				4453	ctxt->instate = XML_PARSER_EPILOG;
				4454	} else {
				4455	ctxt->instate = XML_PARSER_CONTENT;
				4456	}
				4457	ctxt->checkIndex = 0;
				4458	#ifdef DEBUG_PUSH
				4459	xmlGenericError(xmlGenericErrorContext,
				4460	"HPP: entering CONTENT\n");
				4461	#endif
				4462	break;
				4463	case XML_PARSER_CDATA_SECTION:
				4464	xmlGenericError(xmlGenericErrorContext,
				4465	"HPP: internal error, state == CDATA\n");
				4466	ctxt->instate = XML_PARSER_CONTENT;
				4467	ctxt->checkIndex = 0;
				4468	#ifdef DEBUG_PUSH
				4469	xmlGenericError(xmlGenericErrorContext,
				4470	"HPP: entering CONTENT\n");
				4471	#endif
				4472	break;
				4473	case XML_PARSER_DTD:
				4474	xmlGenericError(xmlGenericErrorContext,
				4475	"HPP: internal error, state == DTD\n");
				4476	ctxt->instate = XML_PARSER_CONTENT;
				4477	ctxt->checkIndex = 0;
				4478	#ifdef DEBUG_PUSH
				4479	xmlGenericError(xmlGenericErrorContext,
				4480	"HPP: entering CONTENT\n");
				4481	#endif
				4482	break;
				4483	case XML_PARSER_COMMENT:
				4484	xmlGenericError(xmlGenericErrorContext,
				4485	"HPP: internal error, state == COMMENT\n");
				4486	ctxt->instate = XML_PARSER_CONTENT;
				4487	ctxt->checkIndex = 0;
				4488	#ifdef DEBUG_PUSH
				4489	xmlGenericError(xmlGenericErrorContext,
				4490	"HPP: entering CONTENT\n");
				4491	#endif
				4492	break;
				4493	case XML_PARSER_PI:
				4494	xmlGenericError(xmlGenericErrorContext,
				4495	"HPP: internal error, state == PI\n");
				4496	ctxt->instate = XML_PARSER_CONTENT;
				4497	ctxt->checkIndex = 0;
				4498	#ifdef DEBUG_PUSH
				4499	xmlGenericError(xmlGenericErrorContext,
				4500	"HPP: entering CONTENT\n");
				4501	#endif
				4502	break;
				4503	case XML_PARSER_ENTITY_DECL:
				4504	xmlGenericError(xmlGenericErrorContext,
				4505	"HPP: internal error, state == ENTITY_DECL\n");
				4506	ctxt->instate = XML_PARSER_CONTENT;
				4507	ctxt->checkIndex = 0;
				4508	#ifdef DEBUG_PUSH
				4509	xmlGenericError(xmlGenericErrorContext,
				4510	"HPP: entering CONTENT\n");
				4511	#endif
				4512	break;
				4513	case XML_PARSER_ENTITY_VALUE:
				4514	xmlGenericError(xmlGenericErrorContext,
				4515	"HPP: internal error, state == ENTITY_VALUE\n");
				4516	ctxt->instate = XML_PARSER_CONTENT;
				4517	ctxt->checkIndex = 0;
				4518	#ifdef DEBUG_PUSH
				4519	xmlGenericError(xmlGenericErrorContext,
				4520	"HPP: entering DTD\n");
				4521	#endif
				4522	break;
				4523	case XML_PARSER_ATTRIBUTE_VALUE:
				4524	xmlGenericError(xmlGenericErrorContext,
				4525	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4526	ctxt->instate = XML_PARSER_START_TAG;
				4527	ctxt->checkIndex = 0;
				4528	#ifdef DEBUG_PUSH
				4529	xmlGenericError(xmlGenericErrorContext,
				4530	"HPP: entering START_TAG\n");
				4531	#endif
				4532	break;
				4533	case XML_PARSER_SYSTEM_LITERAL:
				4534	xmlGenericError(xmlGenericErrorContext,
				4535	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4536	ctxt->instate = XML_PARSER_CONTENT;
				4537	ctxt->checkIndex = 0;
				4538	#ifdef DEBUG_PUSH
				4539	xmlGenericError(xmlGenericErrorContext,
				4540	"HPP: entering CONTENT\n");
				4541	#endif
				4542	break;
				4543	case XML_PARSER_IGNORE:
				4544	xmlGenericError(xmlGenericErrorContext,
				4545	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4546	ctxt->instate = XML_PARSER_CONTENT;
				4547	ctxt->checkIndex = 0;
				4548	#ifdef DEBUG_PUSH
				4549	xmlGenericError(xmlGenericErrorContext,
				4550	"HPP: entering CONTENT\n");
				4551	#endif
				4552	break;
				4553	}
				4554	}
				4555	done:
				4556	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4557	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4558	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4559	/*
				4560	* SAX: end of the document processing.
				4561	*/
				4562	ctxt->instate = XML_PARSER_EOF;
				4563	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4564	ctxt->sax->endDocument(ctxt->userData);
				4565	}
				4566	}
				4567	if ((ctxt->myDoc != NULL) &&
				4568	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4569	(ctxt->instate == XML_PARSER_EPILOG))) {
				4570	xmlDtdPtr dtd;
				4571	dtd = xmlGetIntSubset(ctxt->myDoc);
				4572	if (dtd == NULL)
				4573	ctxt->myDoc->intSubset =
				4574	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4575	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4576	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4577	}
				4578	#ifdef DEBUG_PUSH
				4579	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4580	#endif
				4581	return(ret);
				4582	}
				4583
				4584	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4585	* htmlParseChunk:
				4586	* @ctxt: an XML parser context
				4587	* @chunk: an char array
				4588	* @size: the size in byte of the chunk
				4589	* @terminate: last chunk indicator
				4590	*
				4591	* Parse a Chunk of memory
				4592	*
				4593	* Returns zero if no error, the xmlParserErrors otherwise.
				4594	*/
				4595	int
				4596	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4597	int terminate) {
				4598	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4599	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4600	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4601	int cur = ctxt->input->cur - ctxt->input->base;
				4602
				4603	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4604	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4605	ctxt->input->cur = ctxt->input->base + cur;
				4606	#ifdef DEBUG_PUSH
				4607	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4608	#endif
				4609
				4610	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4611	htmlParseTryOrFinish(ctxt, terminate);
				4612	} else if (ctxt->instate != XML_PARSER_EOF) {
				4613	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4614	htmlParseTryOrFinish(ctxt, terminate);
				4615	}
				4616	if (terminate) {
				4617	if ((ctxt->instate != XML_PARSER_EOF) &&
				4618	(ctxt->instate != XML_PARSER_EPILOG) &&
				4619	(ctxt->instate != XML_PARSER_MISC)) {
				4620	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4621	ctxt->wellFormed = 0;
				4622	}
				4623	if (ctxt->instate != XML_PARSER_EOF) {
				4624	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4625	ctxt->sax->endDocument(ctxt->userData);
				4626	}
				4627	ctxt->instate = XML_PARSER_EOF;
				4628	}
				4629	return((xmlParserErrors) ctxt->errNo);
				4630	}
				4631
				4632	/************************************************************************
				4633	* *
				4634	* User entry points *
				4635	* *
				4636	************************************************************************/
				4637
				4638	/**
				4639	* htmlCreatePushParserCtxt :
				4640	* @sax: a SAX handler
				4641	* @user_data: The user data returned on SAX callbacks
				4642	* @chunk: a pointer to an array of chars
				4643	* @size: number of chars in the array
				4644	* @filename: an optional file name or URI
				4645	* @enc: an optional encoding
				4646	*
				4647	* Create a parser context for using the HTML parser in push mode
				4648	* To allow content encoding detection, @size should be >= 4
				4649	* The value of @filename is used for fetching external entities
				4650	* and error/warning reports.
				4651	*
				4652	* Returns the new parser context or NULL
				4653	*/
				4654	htmlParserCtxtPtr
				4655	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4656	const char chunk, int size, const char filename,
				4657	xmlCharEncoding enc) {
				4658	htmlParserCtxtPtr ctxt;
				4659	htmlParserInputPtr inputStream;
				4660	xmlParserInputBufferPtr buf;
				4661
				4662	buf = xmlAllocParserInputBuffer(enc);
				4663	if (buf == NULL) return(NULL);
				4664
				4665	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4666	if (ctxt == NULL) {
				4667	xmlFree(buf);
				4668	return(NULL);
				4669	}
				4670	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4671	htmlInitParserCtxt(ctxt);
				4672	if (sax != NULL) {
				4673	if (ctxt->sax != &htmlDefaultSAXHandler)
				4674	xmlFree(ctxt->sax);
				4675	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4676	if (ctxt->sax == NULL) {
				4677	xmlFree(buf);
				4678	xmlFree(ctxt);
				4679	return(NULL);
				4680	}
				4681	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4682	if (user_data != NULL)
				4683	ctxt->userData = user_data;
				4684	}
				4685	if (filename == NULL) {
				4686	ctxt->directory = NULL;
				4687	} else {
				4688	ctxt->directory = xmlParserGetDirectory(filename);
				4689	}
				4690
				4691	inputStream = htmlNewInputStream(ctxt);
				4692	if (inputStream == NULL) {
				4693	xmlFreeParserCtxt(ctxt);
				4694	return(NULL);
				4695	}
				4696
				4697	if (filename == NULL)
				4698	inputStream->filename = NULL;
				4699	else
				4700	inputStream->filename = xmlMemStrdup(filename);
				4701	inputStream->buf = buf;
				4702	inputStream->base = inputStream->buf->buffer->content;
				4703	inputStream->cur = inputStream->buf->buffer->content;
				4704
				4705	inputPush(ctxt, inputStream);
				4706
				4707	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4708	(ctxt->input->buf != NULL)) {
				4709	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4710	#ifdef DEBUG_PUSH
				4711	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4712	#endif
				4713	}
				4714
				4715	return(ctxt);
				4716	}
				4717
				4718	/**
				4719	* htmlSAXParseDoc :
				4720	* @cur: a pointer to an array of xmlChar
				4721	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4722	* @sax: the SAX handler block
				4723	* @userData: if using SAX, this pointer will be provided on callbacks.
				4724	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4725	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
				4726	* to handle parse events. If sax is NULL, fallback to the default DOM
				4727	* behavior and return a tree.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4728	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4729	* Returns the resulting document tree unless SAX is NULL or the document is
				4730	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4731	*/
				4732
				4733	htmlDocPtr
				4734	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4735	htmlDocPtr ret;
				4736	htmlParserCtxtPtr ctxt;
				4737
				4738	if (cur == NULL) return(NULL);
				4739
				4740
				4741	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4742	if (ctxt == NULL) return(NULL);
				4743	if (sax != NULL) {
				4744	ctxt->sax = sax;
				4745	ctxt->userData = userData;
				4746	}
				4747
				4748	htmlParseDocument(ctxt);
				4749	ret = ctxt->myDoc;
				4750	if (sax != NULL) {
				4751	ctxt->sax = NULL;
				4752	ctxt->userData = NULL;
				4753	}
				4754	htmlFreeParserCtxt(ctxt);
				4755
				4756	return(ret);
				4757	}
				4758
				4759	/**
				4760	* htmlParseDoc :
				4761	* @cur: a pointer to an array of xmlChar
				4762	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4763	*
				4764	* parse an HTML in-memory document and build a tree.
				4765	*
				4766	* Returns the resulting document tree
				4767	*/
				4768
				4769	htmlDocPtr
				4770	htmlParseDoc(xmlChar cur, const char encoding) {
				4771	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4772	}
				4773
				4774
				4775	/**
				4776	* htmlCreateFileParserCtxt :
				4777	* @filename: the filename
				4778	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4779	*
				4780	* Create a parser context for a file content.
				4781	* Automatic support for ZLIB/Compress compressed document is provided
				4782	* by default if found at compile-time.
				4783	*
				4784	* Returns the new parser context or NULL
				4785	*/
				4786	htmlParserCtxtPtr
				4787	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4788	{
				4789	htmlParserCtxtPtr ctxt;
				4790	htmlParserInputPtr inputStream;
				4791	xmlParserInputBufferPtr buf;
				4792	/* htmlCharEncoding enc; */
				4793	xmlChar content, content_line = (xmlChar *) "charset=";
				4794
				4795	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4796	if (buf == NULL) return(NULL);
				4797
				4798	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4799	if (ctxt == NULL) {
				4800	perror("malloc");
				4801	return(NULL);
				4802	}
				4803	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4804	htmlInitParserCtxt(ctxt);
				4805	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4806	if (inputStream == NULL) {
				4807	perror("malloc");
				4808	xmlFree(ctxt);
				4809	return(NULL);
				4810	}
				4811	memset(inputStream, 0, sizeof(htmlParserInput));
				4812
				4813	inputStream->filename = xmlMemStrdup(filename);
				4814	inputStream->line = 1;
				4815	inputStream->col = 1;
				4816	inputStream->buf = buf;
				4817	inputStream->directory = NULL;
				4818
				4819	inputStream->base = inputStream->buf->buffer->content;
				4820	inputStream->cur = inputStream->buf->buffer->content;
				4821	inputStream->free = NULL;
				4822
				4823	inputPush(ctxt, inputStream);
				4824
				4825	/* set encoding */
				4826	if (encoding) {
				4827	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4828	if (content) {
				4829	strcpy ((char )content, (char )content_line);
				4830	strcat ((char )content, (char )encoding);
				4831	htmlCheckEncoding (ctxt, content);
				4832	xmlFree (content);
				4833	}
				4834	}
				4835
				4836	return(ctxt);
				4837	}
				4838
				4839	/**
				4840	* htmlSAXParseFile :
				4841	* @filename: the filename
				4842	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4843	* @sax: the SAX handler block
				4844	* @userData: if using SAX, this pointer will be provided on callbacks.
				4845	*
				4846	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4847	* compressed document is provided by default if found at compile-time.
				4848	* It use the given SAX function block to handle the parsing callback.
				4849	* If sax is NULL, fallback to the default DOM tree building routines.
				4850	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4851	* Returns the resulting document tree unless SAX is NULL or the document is
				4852	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4853	*/
				4854
				4855	htmlDocPtr
				4856	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				4857	void *userData) {
				4858	htmlDocPtr ret;
				4859	htmlParserCtxtPtr ctxt;
				4860	htmlSAXHandlerPtr oldsax = NULL;
				4861
				4862	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				4863	if (ctxt == NULL) return(NULL);
				4864	if (sax != NULL) {
				4865	oldsax = ctxt->sax;
				4866	ctxt->sax = sax;
				4867	ctxt->userData = userData;
				4868	}
				4869
				4870	htmlParseDocument(ctxt);
				4871
				4872	ret = ctxt->myDoc;
				4873	if (sax != NULL) {
				4874	ctxt->sax = oldsax;
				4875	ctxt->userData = NULL;
				4876	}
				4877	htmlFreeParserCtxt(ctxt);
				4878
				4879	return(ret);
				4880	}
				4881
				4882	/**
				4883	* htmlParseFile :
				4884	* @filename: the filename
				4885	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4886	*
				4887	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4888	* compressed document is provided by default if found at compile-time.
				4889	*
				4890	* Returns the resulting document tree
				4891	*/
				4892
				4893	htmlDocPtr
				4894	htmlParseFile(const char filename, const char encoding) {
				4895	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				4896	}
				4897
				4898	/**
				4899	* htmlHandleOmittedElem:
				4900	* @val: int 0 or 1
				4901	*
				4902	* Set and return the previous value for handling HTML omitted tags.
				4903	*
				4904	* Returns the last value for 0 for no handling, 1 for auto insertion.
				4905	*/
				4906
				4907	int
				4908	htmlHandleOmittedElem(int val) {
				4909	int old = htmlOmittedDefaultValue;
				4910
				4911	htmlOmittedDefaultValue = val;
				4912	return(old);
				4913	}
				4914
				4915	#endif /* LIBXML_HTML_ENABLED */