Blame - HTMLparser.c - fp2-dev/platform/external/libxml2

blob: 86d9e5451cf99366a3c37950a35b805c99d5295a [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* HTMLparser.c : an HTML 4.0 non-verifying parser
				3	*
				4	* See Copyright for the status of this software.
				5	*
Daniel Veillard	c5d6434	2001-06-24 12:13:24 +0000	[diff] [blame]	6	* daniel@veillard.com
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	7	*/
				8
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	9	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	10	#ifdef LIBXML_HTML_ENABLED
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	11
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	12	#include <string.h>
				13	#ifdef HAVE_CTYPE_H
				14	#include <ctype.h>
				15	#endif
				16	#ifdef HAVE_STDLIB_H
				17	#include <stdlib.h>
				18	#endif
				19	#ifdef HAVE_SYS_STAT_H
				20	#include <sys/stat.h>
				21	#endif
				22	#ifdef HAVE_FCNTL_H
				23	#include <fcntl.h>
				24	#endif
				25	#ifdef HAVE_UNISTD_H
				26	#include <unistd.h>
				27	#endif
				28	#ifdef HAVE_ZLIB_H
				29	#include <zlib.h>
				30	#endif
				31
				32	#include <libxml/xmlmemory.h>
				33	#include <libxml/tree.h>
				34	#include <libxml/parser.h>
				35	#include <libxml/parserInternals.h>
				36	#include <libxml/xmlerror.h>
				37	#include <libxml/HTMLparser.h>
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	38	#include <libxml/HTMLtree.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	39	#include <libxml/entities.h>
				40	#include <libxml/encoding.h>
				41	#include <libxml/valid.h>
				42	#include <libxml/xmlIO.h>
				43
				44	#define HTML_MAX_NAMELEN 1000
				45	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
				46	#define HTML_PARSER_BUFFER_SIZE 100
				47
				48	/* #define DEBUG */
				49	/* #define DEBUG_PUSH */
				50
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	51	static int htmlOmittedDefaultValue = 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	52
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	53	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
				54	xmlChar end, xmlChar end2, xmlChar end3);
				55
				56	/************************************************************************
				57	* *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	58	* Parser stacks related functions and macros *
				59	* *
				60	************************************************************************/
				61
				62	/*
				63	* Generic function for accessing stacks in the Parser Context
				64	*/
				65
				66	#define PUSH_AND_POP(scope, type, name) \
				67	scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
				68	if (ctxt->name##Nr >= ctxt->name##Max) { \
				69	ctxt->name##Max *= 2; \
				70	ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
				71	ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
				72	if (ctxt->name##Tab == NULL) { \
				73	xmlGenericError(xmlGenericErrorContext, \
				74	"realloc failed !\n"); \
				75	return(0); \
				76	} \
				77	} \
				78	ctxt->name##Tab[ctxt->name##Nr] = value; \
				79	ctxt->name = value; \
				80	return(ctxt->name##Nr++); \
				81	} \
				82	scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
				83	type ret; \
				84	if (ctxt->name##Nr < 0) return(0); \
				85	ctxt->name##Nr--; \
				86	if (ctxt->name##Nr < 0) return(0); \
				87	if (ctxt->name##Nr > 0) \
				88	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
				89	else \
				90	ctxt->name = NULL; \
				91	ret = ctxt->name##Tab[ctxt->name##Nr]; \
				92	ctxt->name##Tab[ctxt->name##Nr] = 0; \
				93	return(ret); \
				94	} \
				95
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	96	/* PUSH_AND_POP(static, xmlNodePtr, node) */
				97	PUSH_AND_POP(static, xmlChar*, name)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	98
				99	/*
				100	* Macros for accessing the content. Those should be used only by the parser,
				101	* and not exported.
				102	*
				103	* Dirty macros, i.e. one need to make assumption on the context to use them
				104	*
				105	* CUR_PTR return the current pointer to the xmlChar to be parsed.
				106	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
				107	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
				108	* in UNICODE mode. This should be used internally by the parser
				109	* only to compare to ASCII values otherwise it would break when
				110	* running with UTF-8 encoding.
				111	* NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only
				112	* to compare on ASCII based substring.
				113	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
				114	* it should be used only to compare on ASCII based substring.
				115	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
				116	* strings within the parser.
				117	*
				118	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
				119	*
				120	* CURRENT Returns the current char value, with the full decoding of
				121	* UTF-8 if we are using this mode. It returns an int.
				122	* NEXT Skip to the next character, this does the proper decoding
				123	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
				124	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
				125	*/
				126
				127	#define UPPER (toupper(*ctxt->input->cur))
				128
				129	#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
				130
				131	#define NXT(val) ctxt->input->cur[(val)]
				132
				133	#define UPP(val) (toupper(ctxt->input->cur[(val)]))
				134
				135	#define CUR_PTR ctxt->input->cur
				136
				137	#define SHRINK xmlParserInputShrink(ctxt->input)
				138
				139	#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
				140
				141	#define CURRENT ((int) (*ctxt->input->cur))
				142
				143	#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
				144
				145	/* Inported from XML */
				146
				147	/* #define CUR (ctxt->token ? ctxt->token : (int) (ctxt->input->cur)) /
				148	#define CUR ((int) (*ctxt->input->cur))
				149	#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
				150
				151	#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
				152	#define NXT(val) ctxt->input->cur[(val)]
				153	#define CUR_PTR ctxt->input->cur
				154
				155
				156	#define NEXTL(l) do { \
				157	if (*(ctxt->input->cur) == '\n') { \
				158	ctxt->input->line++; ctxt->input->col = 1; \
				159	} else ctxt->input->col++; \
				160	ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
				161	} while (0)
				162
				163	/************
				164	\
				165	if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
				166	if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
				167	************/
				168
				169	#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
				170	#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
				171
				172	#define COPY_BUF(l,b,i,v) \
				173	if (l == 1) b[i++] = (xmlChar) v; \
				174	else i += xmlCopyChar(l,&b[i],v)
				175
				176	/**
				177	* htmlCurrentChar:
				178	* @ctxt: the HTML parser context
				179	* @len: pointer to the length of the char read
				180	*
				181	* The current char value, if using UTF-8 this may actaully span multiple
				182	* bytes in the input buffer. Implement the end of line normalization:
				183	* 2.11 End-of-Line Handling
				184	* If the encoding is unspecified, in the case we find an ISO-Latin-1
				185	* char, then the encoding converter is plugged in automatically.
				186	*
				187	* Returns the current char value and its lenght
				188	*/
				189
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	190	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	191	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
				192	if (ctxt->instate == XML_PARSER_EOF)
				193	return(0);
				194
				195	if (ctxt->token != 0) {
				196	*len = 0;
				197	return(ctxt->token);
				198	}
				199	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
				200	/*
				201	* We are supposed to handle UTF8, check it's valid
				202	* From rfc2044: encoding of the Unicode values on UTF-8:
				203	*
				204	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				205	* 0000 0000-0000 007F 0xxxxxxx
				206	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				207	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				208	*
				209	* Check for the 0x110000 limit too
				210	*/
				211	const unsigned char *cur = ctxt->input->cur;
				212	unsigned char c;
				213	unsigned int val;
				214
				215	c = *cur;
				216	if (c & 0x80) {
				217	if (cur[1] == 0)
				218	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				219	if ((cur[1] & 0xc0) != 0x80)
				220	goto encoding_error;
				221	if ((c & 0xe0) == 0xe0) {
				222
				223	if (cur[2] == 0)
				224	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				225	if ((cur[2] & 0xc0) != 0x80)
				226	goto encoding_error;
				227	if ((c & 0xf0) == 0xf0) {
				228	if (cur[3] == 0)
				229	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				230	if (((c & 0xf8) != 0xf0) \|\|
				231	((cur[3] & 0xc0) != 0x80))
				232	goto encoding_error;
				233	/* 4-byte code */
				234	*len = 4;
				235	val = (cur[0] & 0x7) << 18;
				236	val \|= (cur[1] & 0x3f) << 12;
				237	val \|= (cur[2] & 0x3f) << 6;
				238	val \|= cur[3] & 0x3f;
				239	} else {
				240	/* 3-byte code */
				241	*len = 3;
				242	val = (cur[0] & 0xf) << 12;
				243	val \|= (cur[1] & 0x3f) << 6;
				244	val \|= cur[2] & 0x3f;
				245	}
				246	} else {
				247	/* 2-byte code */
				248	*len = 2;
				249	val = (cur[0] & 0x1f) << 6;
				250	val \|= cur[1] & 0x3f;
				251	}
				252	if (!IS_CHAR(val)) {
				253	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				254	if ((ctxt->sax != NULL) &&
				255	(ctxt->sax->error != NULL))
				256	ctxt->sax->error(ctxt->userData,
				257	"Char 0x%X out of allowed range\n", val);
				258	ctxt->wellFormed = 0;
				259	ctxt->disableSAX = 1;
				260	}
				261	return(val);
				262	} else {
				263	/* 1-byte code */
				264	*len = 1;
				265	return((int) *ctxt->input->cur);
				266	}
				267	}
				268	/*
				269	* Assume it's a fixed lenght encoding (1) with
				270	* a compatibke encoding for the ASCII set, since
				271	* XML constructs only use < 128 chars
				272	*/
				273	*len = 1;
				274	if ((int) *ctxt->input->cur < 0x80)
				275	return((int) *ctxt->input->cur);
				276
				277	/*
				278	* Humm this is bad, do an automatic flow conversion
				279	*/
				280	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
				281	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				282	return(xmlCurrentChar(ctxt, len));
				283
				284	encoding_error:
				285	/*
				286	* If we detect an UTF8 error that probably mean that the
				287	* input encoding didn't get properly advertized in the
				288	* declaration header. Report the error and switch the encoding
				289	* to ISO-Latin-1 (if you don't like this policy, just declare the
				290	* encoding !)
				291	*/
				292	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				293	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
				294	ctxt->sax->error(ctxt->userData,
				295	"Input is not proper UTF-8, indicate encoding !\n");
				296	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				297	ctxt->input->cur[0], ctxt->input->cur[1],
				298	ctxt->input->cur[2], ctxt->input->cur[3]);
				299	}
				300
				301	ctxt->charset = XML_CHAR_ENCODING_8859_1;
				302	*len = 1;
				303	return((int) *ctxt->input->cur);
				304	}
				305
				306	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	307	* htmlSkipBlankChars:
				308	* @ctxt: the HTML parser context
				309	*
				310	* skip all blanks character found at that point in the input streams.
				311	*
				312	* Returns the number of space chars skipped
				313	*/
				314
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	315	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	316	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
				317	int res = 0;
				318
				319	while (IS_BLANK(*(ctxt->input->cur))) {
				320	if ((*ctxt->input->cur == 0) &&
				321	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
				322	xmlPopInput(ctxt);
				323	} else {
				324	if (*(ctxt->input->cur) == '\n') {
				325	ctxt->input->line++; ctxt->input->col = 1;
				326	} else ctxt->input->col++;
				327	ctxt->input->cur++;
				328	ctxt->nbChars++;
				329	if (*ctxt->input->cur == 0)
				330	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
				331	}
				332	res++;
				333	}
				334	return(res);
				335	}
				336
				337
				338
				339	/************************************************************************
				340	* *
				341	* The list of HTML elements and their properties *
				342	* *
				343	************************************************************************/
				344
				345	/*
				346	* Start Tag: 1 means the start tag can be ommited
				347	* End Tag: 1 means the end tag can be ommited
				348	* 2 means it's forbidden (empty elements)
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	349	* 3 means the tag is stylistic and should be closed easilly
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	350	* Depr: this element is deprecated
				351	* DTD: 1 means that this element is valid only in the Loose DTD
				352	* 2 means that this element is valid only in the Frameset DTD
				353	*
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	354	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	355	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	356	static const htmlElemDesc
				357	html40ElementTable[] = {
Daniel Veillard	02bb170	2001-06-13 21:11:59 +0000	[diff] [blame]	358	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
				359	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
				360	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
				361	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
				362	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
				363	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
				364	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
				365	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
				366	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
				367	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
				368	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
				369	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
				370	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
				371	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
				372	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
				373	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
				374	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
				375	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
				376	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
				377	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
				378	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
				379	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
				380	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
				381	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
				382	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
				383	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
				384	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
				385	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
				386	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
				387	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
				388	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
				389	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
				390	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
				391	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
				392	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
				393	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
				394	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
				395	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
				396	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
				397	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
				398	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
				399	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
				400	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
				401	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
				402	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
				403	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
				404	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
				405	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
				406	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
				407	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
				408	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
				409	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
				410	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
				411	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
				412	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
				413	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
				414	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
				415	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
				416	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
				417	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
				418	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
				419	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
				420	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
				421	{ "p", 0, 1, 1, 0, 0, 0, 0, "paragraph " },
				422	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
				423	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
				424	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
				425	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
				426	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
				427	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
				428	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
				429	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
				430	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
				431	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
				432	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
				433	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
				434	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
				435	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
				436	{ "table", 0, 0, 0, 0, 0, 0, 0, " " },
				437	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
				438	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
				439	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
				440	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
				441	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
				442	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
				443	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
				444	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
				445	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
				446	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
				447	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
				448	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	449	};
				450
				451	/*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	452	* start tags that imply the end of current element
				453	*/
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	454	static const char *htmlStartClose[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	455	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
				456	"dl", "ul", "ol", "menu", "dir", "address", "pre",
				457	"listing", "xmp", "head", NULL,
				458	"head", "p", NULL,
				459	"title", "p", NULL,
				460	"body", "head", "style", "link", "title", "p", NULL,
				461	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
				462	"pre", "listing", "xmp", "head", "li", NULL,
				463	"hr", "p", "head", NULL,
				464	"h1", "p", "head", NULL,
				465	"h2", "p", "head", NULL,
				466	"h3", "p", "head", NULL,
				467	"h4", "p", "head", NULL,
				468	"h5", "p", "head", NULL,
				469	"h6", "p", "head", NULL,
				470	"dir", "p", "head", NULL,
				471	"address", "p", "head", "ul", NULL,
				472	"pre", "p", "head", "ul", NULL,
				473	"listing", "p", "head", NULL,
				474	"xmp", "p", "head", NULL,
				475	"blockquote", "p", "head", NULL,
				476	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
				477	"xmp", "head", NULL,
				478	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				479	"head", "dd", NULL,
				480	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
				481	"head", "dt", NULL,
				482	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
				483	"listing", "xmp", NULL,
				484	"ol", "p", "head", "ul", NULL,
				485	"menu", "p", "head", "ul", NULL,
				486	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
				487	"div", "p", "head", NULL,
				488	"noscript", "p", "head", NULL,
				489	"center", "font", "b", "i", "p", "head", NULL,
				490	"a", "a", NULL,
				491	"caption", "p", NULL,
				492	"colgroup", "caption", "colgroup", "col", "p", NULL,
				493	"col", "caption", "col", "p", NULL,
				494	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
				495	"listing", "xmp", "a", NULL,
Daniel Veillard	43dadeb	2001-04-24 11:23:35 +0000	[diff] [blame]	496	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
				497	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	498	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
				499	"thead", "caption", "col", "colgroup", NULL,
				500	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				501	"tbody", "p", NULL,
				502	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
				503	"tfoot", "tbody", "p", NULL,
				504	"optgroup", "option", NULL,
				505	"option", "option", NULL,
				506	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
				507	"pre", "listing", "xmp", "a", NULL,
				508	NULL
				509	};
				510
				511	/*
				512	* The list of HTML elements which are supposed not to have
				513	* CDATA content and where a p element will be implied
				514	*
				515	* TODO: extend that list by reading the HTML SGML DtD on
				516	* implied paragraph
				517	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	518	static const char *htmlNoContentElements[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	519	"html",
				520	"head",
				521	"body",
				522	NULL
				523	};
				524
				525	/*
				526	* The list of HTML attributes which are of content %Script;
				527	* NOTE: when adding ones, check htmlIsScriptAttribute() since
				528	* it assumes the name starts with 'on'
				529	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	530	static const char *htmlScriptAttributes[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	531	"onclick",
				532	"ondblclick",
				533	"onmousedown",
				534	"onmouseup",
				535	"onmouseover",
				536	"onmousemove",
				537	"onmouseout",
				538	"onkeypress",
				539	"onkeydown",
				540	"onkeyup",
				541	"onload",
				542	"onunload",
				543	"onfocus",
				544	"onblur",
				545	"onsubmit",
				546	"onrest",
				547	"onchange",
				548	"onselect"
				549	};
				550
/*
 * End-tag priorities, used by the HTML parser to cope with broken pages:
 * an extra end tag may only close open elements whose priority is lower
 * than or equal to its own.
 */

/* One (element name, priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Lookup table for htmlGetEndPriority(). The entry with a NULL name ends
 * the table and carries the priority of every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	578
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	579	static const char** htmlStartCloseIndex[100];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	580	static int htmlStartCloseIndexinitialized = 0;
				581
				582	/************************************************************************
				583	* *
				584	* functions to handle HTML specific data *
				585	* *
				586	************************************************************************/
				587
				588	/**
				589	* htmlInitAutoClose:
				590	*
				591	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				592	* This is not reentrant. Call xmlInitParser() once before processing in
				593	* case of use in multithreaded programs.
				594	*/
				595	void
				596	htmlInitAutoClose(void) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	597	int indx, i = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	598
				599	if (htmlStartCloseIndexinitialized) return;
				600
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	601	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
				602	indx = 0;
				603	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
				604	htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	605	while (htmlStartClose[i] != NULL) i++;
				606	i++;
				607	}
				608	htmlStartCloseIndexinitialized = 1;
				609	}
				610
				611	/**
				612	* htmlTagLookup:
				613	* @tag: The tag name in lowercase
				614	*
				615	* Lookup the HTML tag in the ElementTable
				616	*
				617	* Returns the related htmlElemDescPtr or NULL if not found.
				618	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	619	const htmlElemDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	620	htmlTagLookup(const xmlChar *tag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	621	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	622
				623	for (i = 0; i < (sizeof(html40ElementTable) /
				624	sizeof(html40ElementTable[0]));i++) {
Daniel Veillard	1ed3f88	2001-04-18 09:45:35 +0000	[diff] [blame]	625	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	626	return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	627	}
				628	return(NULL);
				629	}
				630
				631	/**
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	632	* htmlGetEndPriority:
				633	* @name: The name of the element to look up the priority for.
				634	*
				635	* Return value: The "endtag" priority.
				636	**/
				637	static int
				638	htmlGetEndPriority (const xmlChar *name) {
				639	int i = 0;
				640
				641	while ((htmlEndPriority[i].name != NULL) &&
				642	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
				643	i++;
				644
				645	return(htmlEndPriority[i].priority);
				646	}
				647
				648	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	649	* htmlCheckAutoClose:
				650	* @newtag: The new tag name
				651	* @oldtag: The old tag name
				652	*
				653	* Checks wether the new tag is one of the registered valid tags for closing old.
				654	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
				655	*
				656	* Returns 0 if no, 1 if yes.
				657	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	658	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	659	htmlCheckAutoClose(const xmlChar newtag, const xmlChar oldtag) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	660	int i, indx;
				661	const char **closed = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	662
				663	if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
				664
				665	/* inefficient, but not a big deal */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	666	for (indx = 0; indx < 100;indx++) {
				667	closed = htmlStartCloseIndex[indx];
				668	if (closed == NULL) return(0);
				669	if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	670	}
				671
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	672	i = closed - htmlStartClose;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	673	i++;
				674	while (htmlStartClose[i] != NULL) {
				675	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
				676	return(1);
				677	}
				678	i++;
				679	}
				680	return(0);
				681	}
				682
				683	/**
				684	* htmlAutoCloseOnClose:
				685	* @ctxt: an HTML parser context
				686	* @newtag: The new tag name
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	687	* @force: force the tag closure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	688	*
				689	* The HTmL DtD allows an ending tag to implicitely close other tags.
				690	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	691	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	692	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	693	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	694	xmlChar *oldname;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	695	int i, priority;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	696
				697	#ifdef DEBUG
				698	xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
				699	for (i = 0;i < ctxt->nameNr;i++)
				700	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				701	#endif
				702
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	703	priority = htmlGetEndPriority (newtag);
				704
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	705	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	706
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	707	if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
Daniel Veillard	0a2a163	2001-05-11 14:18:03 +0000	[diff] [blame]	708	/*
				709	* A missplaced endtagad can only close elements with lower
				710	* or equal priority, so if we find an element with higher
				711	* priority before we find an element with
				712	* matching name, we just ignore this endtag
				713	*/
				714	if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	715	}
				716	if (i < 0) return;
				717
				718	while (!xmlStrEqual(newtag, ctxt->name)) {
				719	info = htmlTagLookup(ctxt->name);
				720	if ((info == NULL) \|\| (info->endTag == 1)) {
				721	#ifdef DEBUG
				722	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
				723	#endif
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	724	} else if (info->endTag == 3) {
				725	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	726	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
William M. Brack	1633d18	2001-10-05 15:41:19 +0000	[diff] [blame]	727
Daniel Veillard	56098d4	2001-04-24 12:51:09 +0000	[diff] [blame]	728	#endif
				729	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				730	ctxt->sax->error(ctxt->userData,
				731	"Opening and ending tag mismatch: %s and %s\n",
				732	newtag, ctxt->name);
				733	ctxt->wellFormed = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	734	}
				735	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				736	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				737	oldname = htmlnamePop(ctxt);
				738	if (oldname != NULL) {
				739	#ifdef DEBUG
				740	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
				741	#endif
				742	xmlFree(oldname);
				743	}
				744	}
				745	}
				746
				747	/**
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	748	* htmlAutoCloseOnEnd:
				749	* @ctxt: an HTML parser context
				750	*
				751	* Close all remaining tags at the end of the stream
				752	*/
				753	static void
				754	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
				755	xmlChar *oldname;
				756	int i;
				757
				758	if (ctxt->nameNr == 0)
				759	return;
				760	#ifdef DEBUG
				761	xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
				762	#endif
				763
				764	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				765	#ifdef DEBUG
				766	xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
				767	#endif
				768	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				769	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				770	oldname = htmlnamePop(ctxt);
				771	if (oldname != NULL) {
				772	#ifdef DEBUG
				773	xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
				774	#endif
				775	xmlFree(oldname);
				776	}
				777	}
				778	}
				779
				780	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	781	* htmlAutoClose:
				782	* @ctxt: an HTML parser context
				783	* @newtag: The new tag name or NULL
				784	*
				785	* The HTmL DtD allows a tag to implicitely close other tags.
				786	* The list is kept in htmlStartClose array. This function is
				787	* called when a new tag has been detected and generates the
				788	* appropriates closes if possible/needed.
				789	* If newtag is NULL this mean we are at the end of the resource
				790	* and we should check
				791	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	792	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	793	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				794	xmlChar *oldname;
				795	while ((newtag != NULL) && (ctxt->name != NULL) &&
				796	(htmlCheckAutoClose(newtag, ctxt->name))) {
				797	#ifdef DEBUG
				798	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
				799	#endif
				800	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				801	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				802	oldname = htmlnamePop(ctxt);
				803	if (oldname != NULL) {
				804	#ifdef DEBUG
				805	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				806	#endif
				807	xmlFree(oldname);
				808	}
				809	}
				810	if (newtag == NULL) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	811	htmlAutoCloseOnEnd(ctxt);
				812	return;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	813	}
				814	while ((newtag == NULL) && (ctxt->name != NULL) &&
				815	((xmlStrEqual(ctxt->name, BAD_CAST"head")) \|\|
				816	(xmlStrEqual(ctxt->name, BAD_CAST"body")) \|\|
				817	(xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
				818	#ifdef DEBUG
				819	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
				820	#endif
				821	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				822	ctxt->sax->endElement(ctxt->userData, ctxt->name);
				823	oldname = htmlnamePop(ctxt);
				824	if (oldname != NULL) {
				825	#ifdef DEBUG
				826	xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
				827	#endif
				828	xmlFree(oldname);
				829	}
				830	}
				831
				832	}
				833
				834	/**
				835	* htmlAutoCloseTag:
				836	* @doc: the HTML document
				837	* @name: The tag name
				838	* @elem: the HTML element
				839	*
				840	* The HTmL DtD allows a tag to implicitely close other tags.
				841	* The list is kept in htmlStartClose array. This function checks
				842	* if the element or one of it's children would autoclose the
				843	* given tag.
				844	*
				845	* Returns 1 if autoclose, 0 otherwise
				846	*/
				847	int
				848	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
				849	htmlNodePtr child;
				850
				851	if (elem == NULL) return(1);
				852	if (xmlStrEqual(name, elem->name)) return(0);
				853	if (htmlCheckAutoClose(elem->name, name)) return(1);
				854	child = elem->children;
				855	while (child != NULL) {
				856	if (htmlAutoCloseTag(doc, name, child)) return(1);
				857	child = child->next;
				858	}
				859	return(0);
				860	}
				861
				862	/**
				863	* htmlIsAutoClosed:
				864	* @doc: the HTML document
				865	* @elem: the HTML element
				866	*
				867	* The HTmL DtD allows a tag to implicitely close other tags.
				868	* The list is kept in htmlStartClose array. This function checks
				869	* if a tag is autoclosed by one of it's child
				870	*
				871	* Returns 1 if autoclosed, 0 otherwise
				872	*/
				873	int
				874	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
				875	htmlNodePtr child;
				876
				877	if (elem == NULL) return(1);
				878	child = elem->children;
				879	while (child != NULL) {
				880	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
				881	child = child->next;
				882	}
				883	return(0);
				884	}
				885
				886	/**
				887	* htmlCheckImplied:
				888	* @ctxt: an HTML parser context
				889	* @newtag: The new tag name
				890	*
				891	* The HTML DtD allows a tag to exists only implicitely
				892	* called when a new tag has been detected and generates the
				893	* appropriates implicit tags if missing
				894	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	895	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	896	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
				897	if (!htmlOmittedDefaultValue)
				898	return;
				899	if (xmlStrEqual(newtag, BAD_CAST"html"))
				900	return;
				901	if (ctxt->nameNr <= 0) {
				902	#ifdef DEBUG
				903	xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
				904	#endif
				905	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
				906	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				907	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
				908	}
				909	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
				910	return;
				911	if ((ctxt->nameNr <= 1) &&
				912	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
				913	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
				914	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
				915	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
				916	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
				917	(xmlStrEqual(newtag, BAD_CAST"base")))) {
				918	/*
				919	* dropped OBJECT ... i you put it first BODY will be
				920	* assumed !
				921	*/
				922	#ifdef DEBUG
				923	xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
				924	#endif
				925	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
				926	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				927	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
				928	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
				929	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
				930	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
				931	int i;
				932	for (i = 0;i < ctxt->nameNr;i++) {
				933	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
				934	return;
				935	}
				936	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
				937	return;
				938	}
				939	}
				940
				941	#ifdef DEBUG
				942	xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
				943	#endif
				944	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
				945	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				946	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
				947	}
				948	}
				949
				950	/**
				951	* htmlCheckParagraph
				952	* @ctxt: an HTML parser context
				953	*
				954	* Check whether a p element need to be implied before inserting
				955	* characters in the current element.
				956	*
				957	* Returns 1 if a paragraph has been inserted, 0 if not and -1
				958	* in case of error.
				959	*/
				960
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	961	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	962	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
				963	const xmlChar *tag;
				964	int i;
				965
				966	if (ctxt == NULL)
				967	return(-1);
				968	tag = ctxt->name;
				969	if (tag == NULL) {
				970	htmlAutoClose(ctxt, BAD_CAST"p");
				971	htmlCheckImplied(ctxt, BAD_CAST"p");
				972	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				973	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				974	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				975	return(1);
				976	}
				977	if (!htmlOmittedDefaultValue)
				978	return(0);
				979	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
				980	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
				981	#ifdef DEBUG
				982	xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
				983	#endif
				984	htmlAutoClose(ctxt, BAD_CAST"p");
				985	htmlCheckImplied(ctxt, BAD_CAST"p");
				986	htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
				987	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				988	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
				989	return(1);
				990	}
				991	}
				992	return(0);
				993	}
				994
				995	/**
				996	* htmlIsScriptAttribute:
				997	* @name: an attribute name
				998	*
				999	* Check if an attribute is of content type Script
				1000	*
				1001	* Returns 1 is the attribute is a script 0 otherwise
				1002	*/
				1003	int
				1004	htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1005	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1006
				1007	if (name == NULL)
				1008	return(0);
				1009	/*
				1010	* all script attributes start with 'on'
				1011	*/
				1012	if ((name[0] != 'o') \|\| (name[1] != 'n'))
				1013	return(0);
				1014	for (i = 0;
				1015	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
				1016	i++) {
				1017	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
				1018	return(1);
				1019	}
				1020	return(0);
				1021	}
				1022
				1023	/************************************************************************
				1024	* *
				1025	* The list of HTML predefined entities *
				1026	* *
				1027	************************************************************************/
				1028
				1029
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1030	static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1031	/*
				1032	* the 4 absolute ones, plus apostrophe.
				1033	*/
				1034	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
				1035	{ 38, "amp", "ampersand, U+0026 ISOnum" },
				1036	{ 39, "apos", "single quote" },
				1037	{ 60, "lt", "less-than sign, U+003C ISOnum" },
				1038	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
				1039
				1040	/*
				1041	* A bunch still in the 128-255 range
				1042	* Replacing them depend really on the charset used.
				1043	*/
				1044	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
				1045	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
				1046	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
				1047	{ 163, "pound","pound sign, U+00A3 ISOnum" },
				1048	{ 164, "curren","currency sign, U+00A4 ISOnum" },
				1049	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
				1050	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
				1051	{ 167, "sect", "section sign, U+00A7 ISOnum" },
				1052	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
				1053	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
				1054	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
				1055	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
				1056	{ 172, "not", "not sign, U+00AC ISOnum" },
				1057	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
				1058	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
				1059	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
				1060	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
				1061	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
				1062	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
				1063	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
				1064	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
				1065	{ 181, "micro","micro sign, U+00B5 ISOnum" },
				1066	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
				1067	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
				1068	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
				1069	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
				1070	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
				1071	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
				1072	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
				1073	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
				1074	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
				1075	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
				1076	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
				1077	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
				1078	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
				1079	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
				1080	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
				1081	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
				1082	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
				1083	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
				1084	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
				1085	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
				1086	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
				1087	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
				1088	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
				1089	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
				1090	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
				1091	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
				1092	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
				1093	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
				1094	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
				1095	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
				1096	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
				1097	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
				1098	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
				1099	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
				1100	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
				1101	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
				1102	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
				1103	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
				1104	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
				1105	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
				1106	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
				1107	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
				1108	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
				1109	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
				1110	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
				1111	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
				1112	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
				1113	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
				1114	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
				1115	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
				1116	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
				1117	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
				1118	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
				1119	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
				1120	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
				1121	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
				1122	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
				1123	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
				1124	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
				1125	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
				1126	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
				1127	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
				1128	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
				1129	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
				1130	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
				1131	{ 247, "divide","division sign, U+00F7 ISOnum" },
				1132	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
				1133	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
				1134	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
				1135	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
				1136	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
				1137	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
				1138	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
				1139	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
				1140
				1141	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
				1142	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
				1143	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
				1144	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
				1145	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
				1146
				1147	/*
				1148	* Anything below should really be kept as entities references
				1149	*/
				1150	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
				1151
				1152	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
				1153	{ 732, "tilde","small tilde, U+02DC ISOdia" },
				1154
				1155	{ 913, "Alpha","greek capital letter alpha, U+0391" },
				1156	{ 914, "Beta", "greek capital letter beta, U+0392" },
				1157	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
				1158	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
				1159	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
				1160	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
				1161	{ 919, "Eta", "greek capital letter eta, U+0397" },
				1162	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
				1163	{ 921, "Iota", "greek capital letter iota, U+0399" },
				1164	{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1165	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1166	{ 924, "Mu", "greek capital letter mu, U+039C" },
				1167	{ 925, "Nu", "greek capital letter nu, U+039D" },
				1168	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
				1169	{ 927, "Omicron","greek capital letter omicron, U+039F" },
				1170	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
				1171	{ 929, "Rho", "greek capital letter rho, U+03A1" },
				1172	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
				1173	{ 932, "Tau", "greek capital letter tau, U+03A4" },
				1174	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
				1175	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
				1176	{ 935, "Chi", "greek capital letter chi, U+03A7" },
				1177	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
				1178	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
				1179
				1180	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
				1181	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
				1182	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
				1183	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
				1184	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
				1185	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
				1186	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
				1187	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
				1188	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
				1189	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
				1190	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
				1191	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
				1192	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
				1193	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
				1194	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
				1195	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
				1196	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
				1197	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
				1198	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
				1199	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
				1200	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
				1201	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
				1202	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
				1203	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
				1204	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
				1205	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
				1206	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
				1207	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
				1208
				1209	{ 8194, "ensp", "en space, U+2002 ISOpub" },
				1210	{ 8195, "emsp", "em space, U+2003 ISOpub" },
				1211	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
				1212	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
				1213	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
				1214	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
				1215	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
				1216	{ 8211, "ndash","en dash, U+2013 ISOpub" },
				1217	{ 8212, "mdash","em dash, U+2014 ISOpub" },
				1218	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
				1219	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
				1220	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
				1221	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
				1222	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
				1223	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
				1224	{ 8224, "dagger","dagger, U+2020 ISOpub" },
				1225	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
				1226
				1227	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
				1228	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
				1229
				1230	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
				1231
				1232	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
				1233	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
				1234
				1235	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
				1236	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
				1237
				1238	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
				1239	{ 8260, "frasl","fraction slash, U+2044 NEW" },
				1240
				1241	{ 8364, "euro", "euro sign, U+20AC NEW" },
				1242
				1243	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
				1244	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
				1245	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
				1246	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
				1247	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
				1248	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
				1249	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
				1250	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
				1251	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
				1252	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
				1253	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
				1254	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
				1255	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
				1256	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
				1257	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
				1258	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
				1259
				1260	{ 8704, "forall","for all, U+2200 ISOtech" },
				1261	{ 8706, "part", "partial differential, U+2202 ISOtech" },
				1262	{ 8707, "exist","there exists, U+2203 ISOtech" },
				1263	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
				1264	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
				1265	{ 8712, "isin", "element of, U+2208 ISOtech" },
				1266	{ 8713, "notin","not an element of, U+2209 ISOtech" },
				1267	{ 8715, "ni", "contains as member, U+220B ISOtech" },
				1268	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
				1269	{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
				1270	{ 8722, "minus","minus sign, U+2212 ISOtech" },
				1271	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
				1272	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
				1273	{ 8733, "prop", "proportional to, U+221D ISOtech" },
				1274	{ 8734, "infin","infinity, U+221E ISOtech" },
				1275	{ 8736, "ang", "angle, U+2220 ISOamso" },
				1276	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
				1277	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
				1278	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
				1279	{ 8746, "cup", "union = cup, U+222A ISOtech" },
				1280	{ 8747, "int", "integral, U+222B ISOtech" },
				1281	{ 8756, "there4","therefore, U+2234 ISOtech" },
				1282	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
				1283	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
				1284	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
				1285	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
				1286	{ 8801, "equiv","identical to, U+2261 ISOtech" },
				1287	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
				1288	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
				1289	{ 8834, "sub", "subset of, U+2282 ISOtech" },
				1290	{ 8835, "sup", "superset of, U+2283 ISOtech" },
				1291	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
				1292	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
				1293	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
				1294	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
				1295	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
				1296	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
				1297	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
				1298	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
				1299	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
				1300	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
				1301	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
				1302	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
				1303	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
				1304	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
				1305
				1306	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
				1307	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
				1308	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
				1309	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
				1310
				1311	};
				1312
				1313	/************************************************************************
				1314	* *
				1315	* Commodity functions to handle entities *
				1316	* *
				1317	************************************************************************/
				1318
				1319	/*
				1320	* Macro used to grow the current buffer.
				1321	*/
				1322	#define growBuffer(buffer) { \
				1323	buffer##_size *= 2; \
				1324	buffer = (xmlChar ) xmlRealloc(buffer, buffer##_size sizeof(xmlChar)); \
				1325	if (buffer == NULL) { \
				1326	perror("realloc failed"); \
				1327	return(NULL); \
				1328	} \
				1329	}
				1330
				1331	/**
				1332	* htmlEntityLookup:
				1333	* @name: the entity name
				1334	*
				1335	* Lookup the given entity in EntitiesTable
				1336	*
				1337	* TODO: the linear scan is really ugly, an hash table is really needed.
				1338	*
				1339	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1340	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1341	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1342	htmlEntityLookup(const xmlChar *name) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1343	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1344
				1345	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1346	sizeof(html40EntitiesTable[0]));i++) {
				1347	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
				1348	#ifdef DEBUG
				1349	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
				1350	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1351	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1352	}
				1353	}
				1354	return(NULL);
				1355	}
				1356
				1357	/**
				1358	* htmlEntityValueLookup:
				1359	* @value: the entity's unicode value
				1360	*
				1361	* Lookup the given entity in EntitiesTable
				1362	*
				1363	* TODO: the linear scan is really ugly, an hash table is really needed.
				1364	*
				1365	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
				1366	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1367	const htmlEntityDesc *
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1368	htmlEntityValueLookup(unsigned int value) {
				1369	unsigned int i;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1370	#ifdef DEBUG
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	1371	unsigned int lv = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1372	#endif
				1373
				1374	for (i = 0;i < (sizeof(html40EntitiesTable)/
				1375	sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1376	if (html40EntitiesTable[i].value >= value) {
				1377	if (html40EntitiesTable[i].value > value)
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1378	break;
				1379	#ifdef DEBUG
				1380	xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
				1381	#endif
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	1382	return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1383	}
				1384	#ifdef DEBUG
				1385	if (lv > html40EntitiesTable[i].value) {
				1386	xmlGenericError(xmlGenericErrorContext,
				1387	"html40EntitiesTable[] is not sorted (%d > %d)!\n",
				1388	lv, html40EntitiesTable[i].value);
				1389	}
				1390	lv = html40EntitiesTable[i].value;
				1391	#endif
				1392	}
				1393	return(NULL);
				1394	}
				1395
				1396	/**
				1397	* UTF8ToHtml:
				1398	* @out: a pointer to an array of bytes to store the result
				1399	* @outlen: the length of @out
				1400	* @in: a pointer to an array of UTF-8 chars
				1401	* @inlen: the length of @in
				1402	*
				1403	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1404	* plus HTML entities block of chars out.
				1405	*
				1406	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1407	* The value of @inlen after return is the number of octets consumed
				1408	* as the return value is positive, else unpredictiable.
				1409	* The value of @outlen after return is the number of octets consumed.
				1410	*/
				1411	int
				1412	UTF8ToHtml(unsigned char* out, int *outlen,
				1413	const unsigned char* in, int *inlen) {
				1414	const unsigned char* processed = in;
				1415	const unsigned char* outend;
				1416	const unsigned char* outstart = out;
				1417	const unsigned char* instart = in;
				1418	const unsigned char* inend;
				1419	unsigned int c, d;
				1420	int trailing;
				1421
				1422	if (in == NULL) {
				1423	/*
				1424	* initialization nothing to do
				1425	*/
				1426	*outlen = 0;
				1427	*inlen = 0;
				1428	return(0);
				1429	}
				1430	inend = in + (*inlen);
				1431	outend = out + (*outlen);
				1432	while (in < inend) {
				1433	d = *in++;
				1434	if (d < 0x80) { c= d; trailing= 0; }
				1435	else if (d < 0xC0) {
				1436	/* trailing byte in leading position */
				1437	*outlen = out - outstart;
				1438	*inlen = processed - instart;
				1439	return(-2);
				1440	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1441	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1442	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1443	else {
				1444	/* no chance for this in Ascii */
				1445	*outlen = out - outstart;
				1446	*inlen = processed - instart;
				1447	return(-2);
				1448	}
				1449
				1450	if (inend - in < trailing) {
				1451	break;
				1452	}
				1453
				1454	for ( ; trailing; trailing--) {
				1455	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				1456	break;
				1457	c <<= 6;
				1458	c \|= d & 0x3F;
				1459	}
				1460
				1461	/* assertion: c is a single UTF-4 value */
				1462	if (c < 0x80) {
				1463	if (out + 1 >= outend)
				1464	break;
				1465	*out++ = c;
				1466	} else {
				1467	int len;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1468	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1469
				1470	/*
				1471	* Try to lookup a predefined HTML entity for it
				1472	*/
				1473
				1474	ent = htmlEntityValueLookup(c);
				1475	if (ent == NULL) {
				1476	/* no chance for this in Ascii */
				1477	*outlen = out - outstart;
				1478	*inlen = processed - instart;
				1479	return(-2);
				1480	}
				1481	len = strlen(ent->name);
				1482	if (out + 2 + len >= outend)
				1483	break;
				1484	*out++ = '&';
				1485	memcpy(out, ent->name, len);
				1486	out += len;
				1487	*out++ = ';';
				1488	}
				1489	processed = in;
				1490	}
				1491	*outlen = out - outstart;
				1492	*inlen = processed - instart;
				1493	return(0);
				1494	}
				1495
				1496	/**
				1497	* htmlEncodeEntities:
				1498	* @out: a pointer to an array of bytes to store the result
				1499	* @outlen: the length of @out
				1500	* @in: a pointer to an array of UTF-8 chars
				1501	* @inlen: the length of @in
				1502	* @quoteChar: the quote character to escape (' or ") or zero.
				1503	*
				1504	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				1505	* plus HTML entities block of chars out.
				1506	*
				1507	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				1508	* The value of @inlen after return is the number of octets consumed
				1509	* as the return value is positive, else unpredictiable.
				1510	* The value of @outlen after return is the number of octets consumed.
				1511	*/
				1512	int
				1513	htmlEncodeEntities(unsigned char* out, int *outlen,
				1514	const unsigned char* in, int *inlen, int quoteChar) {
				1515	const unsigned char* processed = in;
				1516	const unsigned char* outend = out + (*outlen);
				1517	const unsigned char* outstart = out;
				1518	const unsigned char* instart = in;
				1519	const unsigned char* inend = in + (*inlen);
				1520	unsigned int c, d;
				1521	int trailing;
				1522
				1523	while (in < inend) {
				1524	d = *in++;
				1525	if (d < 0x80) { c= d; trailing= 0; }
				1526	else if (d < 0xC0) {
				1527	/* trailing byte in leading position */
				1528	*outlen = out - outstart;
				1529	*inlen = processed - instart;
				1530	return(-2);
				1531	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1532	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1533	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1534	else {
				1535	/* no chance for this in Ascii */
				1536	*outlen = out - outstart;
				1537	*inlen = processed - instart;
				1538	return(-2);
				1539	}
				1540
				1541	if (inend - in < trailing)
				1542	break;
				1543
				1544	while (trailing--) {
				1545	if (((d= *in++) & 0xC0) != 0x80) {
				1546	*outlen = out - outstart;
				1547	*inlen = processed - instart;
				1548	return(-2);
				1549	}
				1550	c <<= 6;
				1551	c \|= d & 0x3F;
				1552	}
				1553
				1554	/* assertion: c is a single UTF-4 value */
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1555	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
				1556	(c != '&') && (c != '<') && (c != '>')) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1557	if (out >= outend)
				1558	break;
				1559	*out++ = c;
				1560	} else {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1561	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1562	const char *cp;
				1563	char nbuf[16];
				1564	int len;
				1565
				1566	/*
				1567	* Try to lookup a predefined HTML entity for it
				1568	*/
				1569	ent = htmlEntityValueLookup(c);
				1570	if (ent == NULL) {
				1571	sprintf(nbuf, "#%u", c);
				1572	cp = nbuf;
				1573	}
				1574	else
				1575	cp = ent->name;
				1576	len = strlen(cp);
				1577	if (out + 2 + len > outend)
				1578	break;
				1579	*out++ = '&';
				1580	memcpy(out, cp, len);
				1581	out += len;
				1582	*out++ = ';';
				1583	}
				1584	processed = in;
				1585	}
				1586	*outlen = out - outstart;
				1587	*inlen = processed - instart;
				1588	return(0);
				1589	}
				1590
				1591	/**
				1592	* htmlDecodeEntities:
				1593	* @ctxt: the parser context
				1594	* @len: the len to decode (in bytes !), -1 for no size limit
				1595	* @end: an end marker xmlChar, 0 if none
				1596	* @end2: an end marker xmlChar, 0 if none
				1597	* @end3: an end marker xmlChar, 0 if none
				1598	*
				1599	* Subtitute the HTML entities by their value
				1600	*
				1601	* DEPRECATED !!!!
				1602	*
				1603	* Returns A newly allocated string with the substitution done. The caller
				1604	* must deallocate it !
				1605	*/
				1606	xmlChar *
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	1607	htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
				1608	xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1609	static int deprecated = 0;
				1610	if (!deprecated) {
				1611	xmlGenericError(xmlGenericErrorContext,
				1612	"htmlDecodeEntities() deprecated function reached\n");
				1613	deprecated = 1;
				1614	}
				1615	return(NULL);
				1616	#if 0
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1617	xmlChar *name = NULL;
				1618	xmlChar *buffer = NULL;
				1619	unsigned int buffer_size = 0;
				1620	unsigned int nbchars = 0;
				1621	htmlEntityDescPtr ent;
				1622	unsigned int max = (unsigned int) len;
				1623	int c,l;
				1624
				1625	if (ctxt->depth > 40) {
				1626	ctxt->errNo = XML_ERR_ENTITY_LOOP;
				1627	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1628	ctxt->sax->error(ctxt->userData,
				1629	"Detected entity reference loop\n");
				1630	ctxt->wellFormed = 0;
				1631	ctxt->disableSAX = 1;
				1632	return(NULL);
				1633	}
				1634
				1635	/*
				1636	* allocate a translation buffer.
				1637	*/
				1638	buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
				1639	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1640	if (buffer == NULL) {
				1641	perror("xmlDecodeEntities: malloc failed");
				1642	return(NULL);
				1643	}
				1644
				1645	/*
				1646	* Ok loop until we reach one of the ending char or a size limit.
				1647	*/
				1648	c = CUR_CHAR(l);
				1649	while ((nbchars < max) && (c != end) &&
				1650	(c != end2) && (c != end3)) {
				1651
				1652	if (c == 0) break;
				1653	if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
				1654	int val = htmlParseCharRef(ctxt);
				1655	COPY_BUF(0,buffer,nbchars,val);
				1656	NEXTL(l);
				1657	} else if ((c == '&') && (ctxt->token != '&')) {
				1658	ent = htmlParseEntityRef(ctxt, &name);
				1659	if (name != NULL) {
				1660	if (ent != NULL) {
				1661	int val = ent->value;
				1662	COPY_BUF(0,buffer,nbchars,val);
				1663	NEXTL(l);
				1664	} else {
				1665	const xmlChar *cur = name;
				1666
				1667	buffer[nbchars++] = '&';
				1668	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1669	growBuffer(buffer);
				1670	}
				1671	while (*cur != 0) {
				1672	buffer[nbchars++] = *cur++;
				1673	}
				1674	buffer[nbchars++] = ';';
				1675	}
				1676	}
				1677	} else {
				1678	COPY_BUF(l,buffer,nbchars,c);
				1679	NEXTL(l);
				1680	if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
				1681	growBuffer(buffer);
				1682	}
				1683	}
				1684	c = CUR_CHAR(l);
				1685	}
				1686	buffer[nbchars++] = 0;
				1687	return(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1688	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1689	}
				1690
				1691	/************************************************************************
				1692	* *
				1693	* Commodity functions to handle streams *
				1694	* *
				1695	************************************************************************/
				1696
				1697	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1698	* htmlNewInputStream:
				1699	* @ctxt: an HTML parser context
				1700	*
				1701	* Create a new input stream structure
				1702	* Returns the new input stream or NULL
				1703	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1704	static htmlParserInputPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1705	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
				1706	htmlParserInputPtr input;
				1707
				1708	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				1709	if (input == NULL) {
				1710	ctxt->errNo = XML_ERR_NO_MEMORY;
				1711	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				1712	ctxt->sax->error(ctxt->userData,
				1713	"malloc: couldn't allocate a new input stream\n");
				1714	return(NULL);
				1715	}
				1716	memset(input, 0, sizeof(htmlParserInput));
				1717	input->filename = NULL;
				1718	input->directory = NULL;
				1719	input->base = NULL;
				1720	input->cur = NULL;
				1721	input->buf = NULL;
				1722	input->line = 1;
				1723	input->col = 1;
				1724	input->buf = NULL;
				1725	input->free = NULL;
				1726	input->version = NULL;
				1727	input->consumed = 0;
				1728	input->length = 0;
				1729	return(input);
				1730	}
				1731
				1732
				1733	/************************************************************************
				1734	* *
				1735	* Commodity functions, cleanup needed ? *
				1736	* *
				1737	************************************************************************/
				1738
				1739	/**
				1740	* areBlanks:
				1741	* @ctxt: an HTML parser context
				1742	* @str: a xmlChar *
				1743	* @len: the size of @str
				1744	*
				1745	* Is this a sequence of blank chars that one can ignore ?
				1746	*
				1747	* Returns 1 if ignorable 0 otherwise.
				1748	*/
				1749
				1750	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
				1751	int i;
				1752	xmlNodePtr lastChild;
				1753
				1754	for (i = 0;i < len;i++)
				1755	if (!(IS_BLANK(str[i]))) return(0);
				1756
				1757	if (CUR == 0) return(1);
				1758	if (CUR != '<') return(0);
				1759	if (ctxt->name == NULL)
				1760	return(1);
				1761	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
				1762	return(1);
				1763	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
				1764	return(1);
				1765	if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
				1766	return(1);
				1767	if (ctxt->node == NULL) return(0);
				1768	lastChild = xmlGetLastChild(ctxt->node);
				1769	if (lastChild == NULL) {
Daniel Veillard	7db3773	2001-07-12 01:20:08 +0000	[diff] [blame]	1770	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
				1771	(ctxt->node->content != NULL)) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1772	} else if (xmlNodeIsText(lastChild)) {
				1773	return(0);
				1774	} else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
				1775	return(0);
				1776	} else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
				1777	return(0);
				1778	} else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
				1779	return(0);
				1780	}
				1781	return(1);
				1782	}
				1783
				1784	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1785	* htmlNewDocNoDtD:
				1786	* @URI: URI for the dtd, or NULL
				1787	* @ExternalID: the external ID of the DTD, or NULL
				1788	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1789	* Creates a new HTML document without a DTD node if @URI and @ExternalID
				1790	* are NULL
				1791	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1792	* Returns a new document, do not intialize the DTD if not provided
				1793	*/
				1794	htmlDocPtr
				1795	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
				1796	xmlDocPtr cur;
				1797
				1798	/*
				1799	* Allocate a new document and fill the fields.
				1800	*/
				1801	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
				1802	if (cur == NULL) {
				1803	xmlGenericError(xmlGenericErrorContext,
				1804	"xmlNewDoc : malloc failed\n");
				1805	return(NULL);
				1806	}
				1807	memset(cur, 0, sizeof(xmlDoc));
				1808
				1809	cur->type = XML_HTML_DOCUMENT_NODE;
				1810	cur->version = NULL;
				1811	cur->intSubset = NULL;
				1812	if ((ExternalID != NULL) \|\|
				1813	(URI != NULL))
				1814	xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
				1815	cur->doc = cur;
				1816	cur->name = NULL;
				1817	cur->children = NULL;
				1818	cur->extSubset = NULL;
				1819	cur->oldNs = NULL;
				1820	cur->encoding = NULL;
				1821	cur->standalone = 1;
				1822	cur->compression = 0;
				1823	cur->ids = NULL;
				1824	cur->refs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1825	cur->_private = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1826	return(cur);
				1827	}
				1828
				1829	/**
				1830	* htmlNewDoc:
				1831	* @URI: URI for the dtd, or NULL
				1832	* @ExternalID: the external ID of the DTD, or NULL
				1833	*
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1834	* Creates a new HTML document
				1835	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1836	* Returns a new document
				1837	*/
				1838	htmlDocPtr
				1839	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
				1840	if ((URI == NULL) && (ExternalID == NULL))
				1841	return(htmlNewDocNoDtD(
Daniel Veillard	6426935	2001-05-04 17:52:34 +0000	[diff] [blame]	1842	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
				1843	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1844
				1845	return(htmlNewDocNoDtD(URI, ExternalID));
				1846	}
				1847
				1848
				1849	/************************************************************************
				1850	* *
				1851	* The parser itself *
				1852	* Relates to http://www.w3.org/TR/html40 *
				1853	* *
				1854	************************************************************************/
				1855
				1856	/************************************************************************
				1857	* *
				1858	* The parser itself *
				1859	* *
				1860	************************************************************************/
				1861
				1862	/**
				1863	* htmlParseHTMLName:
				1864	* @ctxt: an HTML parser context
				1865	*
				1866	* parse an HTML tag or attribute name, note that we convert it to lowercase
				1867	* since HTML names are not case-sensitive.
				1868	*
				1869	* Returns the Tag Name parsed or NULL
				1870	*/
				1871
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1872	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1873	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
				1874	xmlChar *ret = NULL;
				1875	int i = 0;
				1876	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
				1877
				1878	if (!IS_LETTER(CUR) && (CUR != '_') &&
				1879	(CUR != ':')) return(NULL);
				1880
				1881	while ((i < HTML_PARSER_BUFFER_SIZE) &&
				1882	((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1883	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
				1884	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
				1885	else loc[i] = CUR;
				1886	i++;
				1887
				1888	NEXT;
				1889	}
				1890
				1891	ret = xmlStrndup(loc, i);
				1892
				1893	return(ret);
				1894	}
				1895
				1896	/**
				1897	* htmlParseName:
				1898	* @ctxt: an HTML parser context
				1899	*
				1900	* parse an HTML name, this routine is case sensistive.
				1901	*
				1902	* Returns the Name parsed or NULL
				1903	*/
				1904
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1905	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1906	htmlParseName(htmlParserCtxtPtr ctxt) {
				1907	xmlChar buf[HTML_MAX_NAMELEN];
				1908	int len = 0;
				1909
				1910	GROW;
				1911	if (!IS_LETTER(CUR) && (CUR != '_')) {
				1912	return(NULL);
				1913	}
				1914
				1915	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1916	(CUR == '.') \|\| (CUR == '-') \|\|
				1917	(CUR == '_') \|\| (CUR == ':') \|\|
				1918	(IS_COMBINING(CUR)) \|\|
				1919	(IS_EXTENDER(CUR))) {
				1920	buf[len++] = CUR;
				1921	NEXT;
				1922	if (len >= HTML_MAX_NAMELEN) {
				1923	xmlGenericError(xmlGenericErrorContext,
				1924	"htmlParseName: reached HTML_MAX_NAMELEN limit\n");
				1925	while ((IS_LETTER(CUR)) \|\| (IS_DIGIT(CUR)) \|\|
				1926	(CUR == '.') \|\| (CUR == '-') \|\|
				1927	(CUR == '_') \|\| (CUR == ':') \|\|
				1928	(IS_COMBINING(CUR)) \|\|
				1929	(IS_EXTENDER(CUR)))
				1930	NEXT;
				1931	break;
				1932	}
				1933	}
				1934	return(xmlStrndup(buf, len));
				1935	}
				1936
				1937	/**
				1938	* htmlParseHTMLAttribute:
				1939	* @ctxt: an HTML parser context
				1940	* @stop: a char stop value
				1941	*
				1942	* parse an HTML attribute value till the stop (quote), if
				1943	* stop is 0 then it stops at the first space
				1944	*
				1945	* Returns the attribute parsed or NULL
				1946	*/
				1947
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1948	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1949	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
				1950	xmlChar *buffer = NULL;
				1951	int buffer_size = 0;
				1952	xmlChar *out = NULL;
				1953	xmlChar *name = NULL;
				1954
				1955	xmlChar *cur = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	1956	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1957
				1958	/*
				1959	* allocate a translation buffer.
				1960	*/
				1961	buffer_size = HTML_PARSER_BUFFER_SIZE;
				1962	buffer = (xmlChar ) xmlMalloc(buffer_size sizeof(xmlChar));
				1963	if (buffer == NULL) {
				1964	perror("htmlParseHTMLAttribute: malloc failed");
				1965	return(NULL);
				1966	}
				1967	out = buffer;
				1968
				1969	/*
				1970	* Ok loop until we reach one of the ending chars
				1971	*/
				1972	while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
				1973	if ((stop == 0) && (IS_BLANK(CUR))) break;
				1974	if (CUR == '&') {
				1975	if (NXT(1) == '#') {
				1976	unsigned int c;
				1977	int bits;
				1978
				1979	c = htmlParseCharRef(ctxt);
				1980	if (c < 0x80)
				1981	{ *out++ = c; bits= -6; }
				1982	else if (c < 0x800)
				1983	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1984	else if (c < 0x10000)
				1985	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1986	else
				1987	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1988
				1989	for ( ; bits >= 0; bits-= 6) {
				1990	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				1991	}
				1992	} else {
				1993	ent = htmlParseEntityRef(ctxt, &name);
				1994	if (name == NULL) {
				1995	*out++ = '&';
				1996	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1997	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1998
				1999	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2000	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2001	}
				2002	} else if (ent == NULL) {
				2003	*out++ = '&';
				2004	cur = name;
				2005	while (*cur != 0) {
				2006	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2007	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2008
				2009	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2010	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2011	}
				2012	out++ = cur++;
				2013	}
				2014	xmlFree(name);
				2015	} else {
				2016	unsigned int c;
				2017	int bits;
				2018
				2019	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2020	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2021
				2022	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2023	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2024	}
				2025	c = (xmlChar)ent->value;
				2026	if (c < 0x80)
				2027	{ *out++ = c; bits= -6; }
				2028	else if (c < 0x800)
				2029	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2030	else if (c < 0x10000)
				2031	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2032	else
				2033	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2034
				2035	for ( ; bits >= 0; bits-= 6) {
				2036	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2037	}
				2038	xmlFree(name);
				2039	}
				2040	}
				2041	} else {
				2042	unsigned int c;
				2043	int bits, l;
				2044
				2045	if (out - buffer > buffer_size - 100) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2046	int indx = out - buffer;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2047
				2048	growBuffer(buffer);
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2049	out = &buffer[indx];
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2050	}
				2051	c = CUR_CHAR(l);
				2052	if (c < 0x80)
				2053	{ *out++ = c; bits= -6; }
				2054	else if (c < 0x800)
				2055	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				2056	else if (c < 0x10000)
				2057	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				2058	else
				2059	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				2060
				2061	for ( ; bits >= 0; bits-= 6) {
				2062	*out++ = ((c >> bits) & 0x3F) \| 0x80;
				2063	}
				2064	NEXT;
				2065	}
				2066	}
				2067	*out++ = 0;
				2068	return(buffer);
				2069	}
				2070
				2071	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2072	* htmlParseEntityRef:
				2073	* @ctxt: an HTML parser context
				2074	* @str: location to store the entity name
				2075	*
				2076	* parse an HTML ENTITY references
				2077	*
				2078	* [68] EntityRef ::= '&' Name ';'
				2079	*
				2080	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
				2081	* if non-NULL *str will have to be freed by the caller.
				2082	*/
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2083	const htmlEntityDesc *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2084	htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
				2085	xmlChar *name;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	2086	const htmlEntityDesc * ent = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2087	*str = NULL;
				2088
				2089	if (CUR == '&') {
				2090	NEXT;
				2091	name = htmlParseName(ctxt);
				2092	if (name == NULL) {
				2093	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2094	ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
				2095	ctxt->wellFormed = 0;
				2096	} else {
				2097	GROW;
				2098	if (CUR == ';') {
				2099	*str = name;
				2100
				2101	/*
				2102	* Lookup the entity in the table.
				2103	*/
				2104	ent = htmlEntityLookup(name);
				2105	if (ent != NULL) /* OK that's ugly !!! */
				2106	NEXT;
				2107	} else {
				2108	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2109	ctxt->sax->error(ctxt->userData,
				2110	"htmlParseEntityRef: expecting ';'\n");
				2111	*str = name;
				2112	}
				2113	}
				2114	}
				2115	return(ent);
				2116	}
				2117
				2118	/**
				2119	* htmlParseAttValue:
				2120	* @ctxt: an HTML parser context
				2121	*
				2122	* parse a value for an attribute
				2123	* Note: the parser won't do substitution of entities here, this
				2124	* will be handled later in xmlStringGetNodeList, unless it was
				2125	* asked for ctxt->replaceEntities != 0
				2126	*
				2127	* Returns the AttValue parsed or NULL.
				2128	*/
				2129
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2130	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2131	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
				2132	xmlChar *ret = NULL;
				2133
				2134	if (CUR == '"') {
				2135	NEXT;
				2136	ret = htmlParseHTMLAttribute(ctxt, '"');
				2137	if (CUR != '"') {
				2138	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2139	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2140	ctxt->wellFormed = 0;
				2141	} else
				2142	NEXT;
				2143	} else if (CUR == '\'') {
				2144	NEXT;
				2145	ret = htmlParseHTMLAttribute(ctxt, '\'');
				2146	if (CUR != '\'') {
				2147	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2148	ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
				2149	ctxt->wellFormed = 0;
				2150	} else
				2151	NEXT;
				2152	} else {
				2153	/*
				2154	* That's an HTMLism, the attribute value may not be quoted
				2155	*/
				2156	ret = htmlParseHTMLAttribute(ctxt, 0);
				2157	if (ret == NULL) {
				2158	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2159	ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
				2160	ctxt->wellFormed = 0;
				2161	}
				2162	}
				2163	return(ret);
				2164	}
				2165
				2166	/**
				2167	* htmlParseSystemLiteral:
				2168	* @ctxt: an HTML parser context
				2169	*
				2170	* parse an HTML Literal
				2171	*
				2172	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
				2173	*
				2174	* Returns the SystemLiteral parsed or NULL
				2175	*/
				2176
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2177	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2178	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
				2179	const xmlChar *q;
				2180	xmlChar *ret = NULL;
				2181
				2182	if (CUR == '"') {
				2183	NEXT;
				2184	q = CUR_PTR;
				2185	while ((IS_CHAR(CUR)) && (CUR != '"'))
				2186	NEXT;
				2187	if (!IS_CHAR(CUR)) {
				2188	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2189	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2190	ctxt->wellFormed = 0;
				2191	} else {
				2192	ret = xmlStrndup(q, CUR_PTR - q);
				2193	NEXT;
				2194	}
				2195	} else if (CUR == '\'') {
				2196	NEXT;
				2197	q = CUR_PTR;
				2198	while ((IS_CHAR(CUR)) && (CUR != '\''))
				2199	NEXT;
				2200	if (!IS_CHAR(CUR)) {
				2201	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2202	ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
				2203	ctxt->wellFormed = 0;
				2204	} else {
				2205	ret = xmlStrndup(q, CUR_PTR - q);
				2206	NEXT;
				2207	}
				2208	} else {
				2209	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2210	ctxt->sax->error(ctxt->userData,
				2211	"SystemLiteral \" or ' expected\n");
				2212	ctxt->wellFormed = 0;
				2213	}
				2214
				2215	return(ret);
				2216	}
				2217
				2218	/**
				2219	* htmlParsePubidLiteral:
				2220	* @ctxt: an HTML parser context
				2221	*
				2222	* parse an HTML public literal
				2223	*
				2224	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
				2225	*
				2226	* Returns the PubidLiteral parsed or NULL.
				2227	*/
				2228
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2229	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2230	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
				2231	const xmlChar *q;
				2232	xmlChar *ret = NULL;
				2233	/*
				2234	* Name ::= (Letter \| '_') (NameChar)*
				2235	*/
				2236	if (CUR == '"') {
				2237	NEXT;
				2238	q = CUR_PTR;
				2239	while (IS_PUBIDCHAR(CUR)) NEXT;
				2240	if (CUR != '"') {
				2241	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2242	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2243	ctxt->wellFormed = 0;
				2244	} else {
				2245	ret = xmlStrndup(q, CUR_PTR - q);
				2246	NEXT;
				2247	}
				2248	} else if (CUR == '\'') {
				2249	NEXT;
				2250	q = CUR_PTR;
				2251	while ((IS_LETTER(CUR)) && (CUR != '\''))
				2252	NEXT;
				2253	if (!IS_LETTER(CUR)) {
				2254	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2255	ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
				2256	ctxt->wellFormed = 0;
				2257	} else {
				2258	ret = xmlStrndup(q, CUR_PTR - q);
				2259	NEXT;
				2260	}
				2261	} else {
				2262	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2263	ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
				2264	ctxt->wellFormed = 0;
				2265	}
				2266
				2267	return(ret);
				2268	}
				2269
				2270	/**
				2271	* htmlParseScript:
				2272	* @ctxt: an HTML parser context
				2273	*
				2274	* parse the content of an HTML SCRIPT or STYLE element
				2275	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
				2276	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
				2277	* http://www.w3.org/TR/html4/types.html#type-script
				2278	* http://www.w3.org/TR/html4/types.html#h-6.15
				2279	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
				2280	*
				2281	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
				2282	* element and the value of intrinsic event attributes. User agents must
				2283	* not evaluate script data as HTML markup but instead must pass it on as
				2284	* data to a script engine.
				2285	* NOTES:
				2286	* - The content is passed like CDATA
				2287	* - the attributes for style and scripting "onXXX" are also described
				2288	* as CDATA but SGML allows entities references in attributes so their
				2289	* processing is identical as other attributes
				2290	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2291	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2292	htmlParseScript(htmlParserCtxtPtr ctxt) {
				2293	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
				2294	int nbchar = 0;
				2295	xmlChar cur;
				2296
				2297	SHRINK;
				2298	cur = CUR;
				2299	while (IS_CHAR(cur)) {
				2300	if ((cur == '<') && (NXT(1) == '/')) {
				2301	/*
				2302	* One should break here, the specification is clear:
				2303	* Authors should therefore escape "</" within the content.
				2304	* Escape mechanisms are specific to each scripting or
				2305	* style sheet language.
				2306	*/
				2307	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
				2308	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
				2309	break; /* while */
				2310	}
				2311	buf[nbchar++] = cur;
				2312	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2313	if (ctxt->sax->cdataBlock!= NULL) {
				2314	/*
				2315	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2316	*/
				2317	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2318	}
				2319	nbchar = 0;
				2320	}
				2321	NEXT;
				2322	cur = CUR;
				2323	}
				2324	if (!(IS_CHAR(cur))) {
				2325	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2326	ctxt->sax->error(ctxt->userData,
				2327	"Invalid char in CDATA 0x%X\n", cur);
				2328	ctxt->wellFormed = 0;
				2329	NEXT;
				2330	}
				2331
				2332	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2333	if (ctxt->sax->cdataBlock!= NULL) {
				2334	/*
				2335	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
				2336	*/
				2337	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
				2338	}
				2339	}
				2340	}
				2341
				2342
				2343	/**
				2344	* htmlParseCharData:
				2345	* @ctxt: an HTML parser context
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2346	*
				2347	* parse a CharData section.
				2348	* if we are within a CDATA section ']]>' marks an end of section.
				2349	*
				2350	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
				2351	*/
				2352
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2353	static void
				2354	htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2355	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
				2356	int nbchar = 0;
				2357	int cur, l;
				2358
				2359	SHRINK;
				2360	cur = CUR_CHAR(l);
				2361	while (((cur != '<') \|\| (ctxt->token == '<')) &&
				2362	((cur != '&') \|\| (ctxt->token == '&')) &&
				2363	(IS_CHAR(cur))) {
				2364	COPY_BUF(l,buf,nbchar,cur);
				2365	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
				2366	/*
				2367	* Ok the segment is to be consumed as chars.
				2368	*/
				2369	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2370	if (areBlanks(ctxt, buf, nbchar)) {
				2371	if (ctxt->sax->ignorableWhitespace != NULL)
				2372	ctxt->sax->ignorableWhitespace(ctxt->userData,
				2373	buf, nbchar);
				2374	} else {
				2375	htmlCheckParagraph(ctxt);
				2376	if (ctxt->sax->characters != NULL)
				2377	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2378	}
				2379	}
				2380	nbchar = 0;
				2381	}
				2382	NEXTL(l);
				2383	cur = CUR_CHAR(l);
				2384	}
				2385	if (nbchar != 0) {
				2386	/*
				2387	* Ok the segment is to be consumed as chars.
				2388	*/
				2389	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
				2390	if (areBlanks(ctxt, buf, nbchar)) {
				2391	if (ctxt->sax->ignorableWhitespace != NULL)
				2392	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
				2393	} else {
				2394	htmlCheckParagraph(ctxt);
				2395	if (ctxt->sax->characters != NULL)
				2396	ctxt->sax->characters(ctxt->userData, buf, nbchar);
				2397	}
				2398	}
				2399	}
				2400	}
				2401
				2402	/**
				2403	* htmlParseExternalID:
				2404	* @ctxt: an HTML parser context
				2405	* @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2406	*
				2407	* Parse an External ID or a Public ID
				2408	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2409	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
				2410	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
				2411	*
				2412	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
				2413	*
				2414	* Returns the function returns SystemLiteral and in the second
				2415	* case publicID receives PubidLiteral, is strict is off
				2416	* it is possible to return NULL and have publicID set.
				2417	*/
				2418
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2419	static xmlChar *
				2420	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2421	xmlChar *URI = NULL;
				2422
				2423	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
				2424	(UPP(2) == 'S') && (UPP(3) == 'T') &&
				2425	(UPP(4) == 'E') && (UPP(5) == 'M')) {
				2426	SKIP(6);
				2427	if (!IS_BLANK(CUR)) {
				2428	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2429	ctxt->sax->error(ctxt->userData,
				2430	"Space required after 'SYSTEM'\n");
				2431	ctxt->wellFormed = 0;
				2432	}
				2433	SKIP_BLANKS;
				2434	URI = htmlParseSystemLiteral(ctxt);
				2435	if (URI == NULL) {
				2436	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2437	ctxt->sax->error(ctxt->userData,
				2438	"htmlParseExternalID: SYSTEM, no URI\n");
				2439	ctxt->wellFormed = 0;
				2440	}
				2441	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
				2442	(UPP(2) == 'B') && (UPP(3) == 'L') &&
				2443	(UPP(4) == 'I') && (UPP(5) == 'C')) {
				2444	SKIP(6);
				2445	if (!IS_BLANK(CUR)) {
				2446	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2447	ctxt->sax->error(ctxt->userData,
				2448	"Space required after 'PUBLIC'\n");
				2449	ctxt->wellFormed = 0;
				2450	}
				2451	SKIP_BLANKS;
				2452	*publicID = htmlParsePubidLiteral(ctxt);
				2453	if (*publicID == NULL) {
				2454	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2455	ctxt->sax->error(ctxt->userData,
				2456	"htmlParseExternalID: PUBLIC, no Public Identifier\n");
				2457	ctxt->wellFormed = 0;
				2458	}
				2459	SKIP_BLANKS;
				2460	if ((CUR == '"') \|\| (CUR == '\'')) {
				2461	URI = htmlParseSystemLiteral(ctxt);
				2462	}
				2463	}
				2464	return(URI);
				2465	}
				2466
				2467	/**
				2468	* htmlParseComment:
				2469	* @ctxt: an HTML parser context
				2470	*
				2471	* Parse an XML (SGML) comment <!-- .... -->
				2472	*
				2473	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
				2474	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2475	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2476	htmlParseComment(htmlParserCtxtPtr ctxt) {
				2477	xmlChar *buf = NULL;
				2478	int len;
				2479	int size = HTML_PARSER_BUFFER_SIZE;
				2480	int q, ql;
				2481	int r, rl;
				2482	int cur, l;
				2483	xmlParserInputState state;
				2484
				2485	/*
				2486	* Check that there is a comment right here.
				2487	*/
				2488	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
				2489	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
				2490
				2491	state = ctxt->instate;
				2492	ctxt->instate = XML_PARSER_COMMENT;
				2493	SHRINK;
				2494	SKIP(4);
				2495	buf = (xmlChar ) xmlMalloc(size sizeof(xmlChar));
				2496	if (buf == NULL) {
				2497	xmlGenericError(xmlGenericErrorContext,
				2498	"malloc of %d byte failed\n", size);
				2499	ctxt->instate = state;
				2500	return;
				2501	}
				2502	q = CUR_CHAR(ql);
				2503	NEXTL(ql);
				2504	r = CUR_CHAR(rl);
				2505	NEXTL(rl);
				2506	cur = CUR_CHAR(l);
				2507	len = 0;
				2508	while (IS_CHAR(cur) &&
				2509	((cur != '>') \|\|
				2510	(r != '-') \|\| (q != '-'))) {
				2511	if (len + 5 >= size) {
				2512	size *= 2;
				2513	buf = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
				2514	if (buf == NULL) {
				2515	xmlGenericError(xmlGenericErrorContext,
				2516	"realloc of %d byte failed\n", size);
				2517	ctxt->instate = state;
				2518	return;
				2519	}
				2520	}
				2521	COPY_BUF(ql,buf,len,q);
				2522	q = r;
				2523	ql = rl;
				2524	r = cur;
				2525	rl = l;
				2526	NEXTL(l);
				2527	cur = CUR_CHAR(l);
				2528	if (cur == 0) {
				2529	SHRINK;
				2530	GROW;
				2531	cur = CUR_CHAR(l);
				2532	}
				2533	}
				2534	buf[len] = 0;
				2535	if (!IS_CHAR(cur)) {
				2536	ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
				2537	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2538	ctxt->sax->error(ctxt->userData,
				2539	"Comment not terminated \n<!--%.50s\n", buf);
				2540	ctxt->wellFormed = 0;
				2541	xmlFree(buf);
				2542	} else {
				2543	NEXT;
				2544	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
				2545	(!ctxt->disableSAX))
				2546	ctxt->sax->comment(ctxt->userData, buf);
				2547	xmlFree(buf);
				2548	}
				2549	ctxt->instate = state;
				2550	}
				2551
				2552	/**
				2553	* htmlParseCharRef:
				2554	* @ctxt: an HTML parser context
				2555	*
				2556	* parse Reference declarations
				2557	*
				2558	* [66] CharRef ::= '&#' [0-9]+ ';' \|
				2559	* '&#x' [0-9a-fA-F]+ ';'
				2560	*
				2561	* Returns the value parsed (as an int)
				2562	*/
				2563	int
				2564	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
				2565	int val = 0;
				2566
				2567	if ((CUR == '&') && (NXT(1) == '#') &&
				2568	(NXT(2) == 'x')) {
				2569	SKIP(3);
				2570	while (CUR != ';') {
				2571	if ((CUR >= '0') && (CUR <= '9'))
				2572	val = val * 16 + (CUR - '0');
				2573	else if ((CUR >= 'a') && (CUR <= 'f'))
				2574	val = val * 16 + (CUR - 'a') + 10;
				2575	else if ((CUR >= 'A') && (CUR <= 'F'))
				2576	val = val * 16 + (CUR - 'A') + 10;
				2577	else {
				2578	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2579	ctxt->sax->error(ctxt->userData,
				2580	"htmlParseCharRef: invalid hexadecimal value\n");
				2581	ctxt->wellFormed = 0;
				2582	return(0);
				2583	}
				2584	NEXT;
				2585	}
				2586	if (CUR == ';')
				2587	NEXT;
				2588	} else if ((CUR == '&') && (NXT(1) == '#')) {
				2589	SKIP(2);
				2590	while (CUR != ';') {
				2591	if ((CUR >= '0') && (CUR <= '9'))
				2592	val = val * 10 + (CUR - '0');
				2593	else {
				2594	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2595	ctxt->sax->error(ctxt->userData,
				2596	"htmlParseCharRef: invalid decimal value\n");
				2597	ctxt->wellFormed = 0;
				2598	return(0);
				2599	}
				2600	NEXT;
				2601	}
				2602	if (CUR == ';')
				2603	NEXT;
				2604	} else {
				2605	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2606	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
				2607	ctxt->wellFormed = 0;
				2608	}
				2609	/*
				2610	* Check the value IS_CHAR ...
				2611	*/
				2612	if (IS_CHAR(val)) {
				2613	return(val);
				2614	} else {
				2615	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2616	ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
				2617	val);
				2618	ctxt->wellFormed = 0;
				2619	}
				2620	return(0);
				2621	}
				2622
				2623
				2624	/**
				2625	* htmlParseDocTypeDecl :
				2626	* @ctxt: an HTML parser context
				2627	*
				2628	* parse a DOCTYPE declaration
				2629	*
				2630	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
				2631	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
				2632	*/
				2633
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2634	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2635	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
				2636	xmlChar *name;
				2637	xmlChar *ExternalID = NULL;
				2638	xmlChar *URI = NULL;
				2639
				2640	/*
				2641	* We know that '<!DOCTYPE' has been detected.
				2642	*/
				2643	SKIP(9);
				2644
				2645	SKIP_BLANKS;
				2646
				2647	/*
				2648	* Parse the DOCTYPE name.
				2649	*/
				2650	name = htmlParseName(ctxt);
				2651	if (name == NULL) {
				2652	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2653	ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
				2654	ctxt->wellFormed = 0;
				2655	}
				2656	/*
				2657	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
				2658	*/
				2659
				2660	SKIP_BLANKS;
				2661
				2662	/*
				2663	* Check for SystemID and ExternalID
				2664	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2665	URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2666	SKIP_BLANKS;
				2667
				2668	/*
				2669	* We should be at the end of the DOCTYPE declaration.
				2670	*/
				2671	if (CUR != '>') {
				2672	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
Daniel Veillard	f6ed8bc	2001-10-02 09:22:47 +0000	[diff] [blame]	2673	ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2674	ctxt->wellFormed = 0;
				2675	/* We shouldn't try to resynchronize ... */
				2676	}
				2677	NEXT;
				2678
				2679	/*
				2680	* Create or update the document accordingly to the DOCTYPE
				2681	*/
				2682	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
				2683	(!ctxt->disableSAX))
				2684	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
				2685
				2686	/*
				2687	* Cleanup, since we don't use all those identifiers
				2688	*/
				2689	if (URI != NULL) xmlFree(URI);
				2690	if (ExternalID != NULL) xmlFree(ExternalID);
				2691	if (name != NULL) xmlFree(name);
				2692	}
				2693
				2694	/**
				2695	* htmlParseAttribute:
				2696	* @ctxt: an HTML parser context
				2697	* @value: a xmlChar ** used to store the value of the attribute
				2698	*
				2699	* parse an attribute
				2700	*
				2701	* [41] Attribute ::= Name Eq AttValue
				2702	*
				2703	* [25] Eq ::= S? '=' S?
				2704	*
				2705	* With namespace:
				2706	*
				2707	* [NS 11] Attribute ::= QName Eq AttValue
				2708	*
				2709	* Also the case QName == xmlns:??? is handled independently as a namespace
				2710	* definition.
				2711	*
				2712	* Returns the attribute name, and the value in *value.
				2713	*/
				2714
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2715	static xmlChar *
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2716	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
				2717	xmlChar name, val = NULL;
				2718
				2719	*value = NULL;
				2720	name = htmlParseHTMLName(ctxt);
				2721	if (name == NULL) {
				2722	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2723	ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
				2724	ctxt->wellFormed = 0;
				2725	return(NULL);
				2726	}
				2727
				2728	/*
				2729	* read the value
				2730	*/
				2731	SKIP_BLANKS;
				2732	if (CUR == '=') {
				2733	NEXT;
				2734	SKIP_BLANKS;
				2735	val = htmlParseAttValue(ctxt);
				2736	/******
				2737	} else {
				2738	* TODO : some attribute must have values, some may not
				2739	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2740	ctxt->sax->warning(ctxt->userData,
				2741	"No value for attribute %s\n", name); */
				2742	}
				2743
				2744	*value = val;
				2745	return(name);
				2746	}
				2747
				2748	/**
				2749	* htmlCheckEncoding:
				2750	* @ctxt: an HTML parser context
				2751	* @attvalue: the attribute value
				2752	*
				2753	* Checks an http-equiv attribute from a Meta tag to detect
				2754	* the encoding
				2755	* If a new encoding is detected the parser is switched to decode
				2756	* it and pass UTF8
				2757	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2758	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2759	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
				2760	const xmlChar *encoding;
				2761
				2762	if ((ctxt == NULL) \|\| (attvalue == NULL))
				2763	return;
				2764
				2765	/* do not change encoding */
				2766	if (ctxt->input->encoding != NULL)
				2767	return;
				2768
				2769	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
				2770	if (encoding != NULL) {
				2771	encoding += 8;
				2772	} else {
				2773	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
				2774	if (encoding != NULL)
				2775	encoding += 9;
				2776	}
				2777	if (encoding != NULL) {
				2778	xmlCharEncoding enc;
				2779	xmlCharEncodingHandlerPtr handler;
				2780
				2781	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
				2782
				2783	if (ctxt->input->encoding != NULL)
				2784	xmlFree((xmlChar *) ctxt->input->encoding);
				2785	ctxt->input->encoding = xmlStrdup(encoding);
				2786
				2787	enc = xmlParseCharEncoding((const char *) encoding);
				2788	/*
				2789	* registered set of known encodings
				2790	*/
				2791	if (enc != XML_CHAR_ENCODING_ERROR) {
				2792	xmlSwitchEncoding(ctxt, enc);
				2793	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2794	} else {
				2795	/*
				2796	* fallback for unknown encodings
				2797	*/
				2798	handler = xmlFindCharEncodingHandler((const char *) encoding);
				2799	if (handler != NULL) {
				2800	xmlSwitchToEncoding(ctxt, handler);
				2801	ctxt->charset = XML_CHAR_ENCODING_UTF8;
				2802	} else {
				2803	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
				2804	}
				2805	}
				2806
				2807	if ((ctxt->input->buf != NULL) &&
				2808	(ctxt->input->buf->encoder != NULL) &&
				2809	(ctxt->input->buf->raw != NULL) &&
				2810	(ctxt->input->buf->buffer != NULL)) {
				2811	int nbchars;
				2812	int processed;
				2813
				2814	/*
				2815	* convert as much as possible to the parser reading buffer.
				2816	*/
				2817	processed = ctxt->input->cur - ctxt->input->base;
				2818	xmlBufferShrink(ctxt->input->buf->buffer, processed);
				2819	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
				2820	ctxt->input->buf->buffer,
				2821	ctxt->input->buf->raw);
				2822	if (nbchars < 0) {
				2823	ctxt->errNo = XML_ERR_INVALID_ENCODING;
				2824	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2825	ctxt->sax->error(ctxt->userData,
				2826	"htmlCheckEncoding: encoder error\n");
				2827	}
				2828	ctxt->input->base =
				2829	ctxt->input->cur = ctxt->input->buf->buffer->content;
				2830	}
				2831	}
				2832	}
				2833
				2834	/**
				2835	* htmlCheckMeta:
				2836	* @ctxt: an HTML parser context
				2837	* @atts: the attributes values
				2838	*
				2839	* Checks an attributes from a Meta tag
				2840	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2841	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2842	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
				2843	int i;
				2844	const xmlChar att, value;
				2845	int http = 0;
				2846	const xmlChar *content = NULL;
				2847
				2848	if ((ctxt == NULL) \|\| (atts == NULL))
				2849	return;
				2850
				2851	i = 0;
				2852	att = atts[i++];
				2853	while (att != NULL) {
				2854	value = atts[i++];
				2855	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
				2856	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
				2857	http = 1;
				2858	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
				2859	content = value;
				2860	att = atts[i++];
				2861	}
				2862	if ((http) && (content != NULL))
				2863	htmlCheckEncoding(ctxt, content);
				2864
				2865	}
				2866
				2867	/**
				2868	* htmlParseStartTag:
				2869	* @ctxt: an HTML parser context
				2870	*
				2871	* parse a start of tag either for rule element or
				2872	* EmptyElement. In both case we don't parse the tag closing chars.
				2873	*
				2874	* [40] STag ::= '<' Name (S Attribute)* S? '>'
				2875	*
				2876	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
				2877	*
				2878	* With namespace:
				2879	*
				2880	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
				2881	*
				2882	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
				2883	*
				2884	*/
				2885
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2886	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2887	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
				2888	xmlChar *name;
				2889	xmlChar *attname;
				2890	xmlChar *attvalue;
				2891	const xmlChar **atts = NULL;
				2892	int nbatts = 0;
				2893	int maxatts = 0;
				2894	int meta = 0;
				2895	int i;
				2896
				2897	if (CUR != '<') return;
				2898	NEXT;
				2899
				2900	GROW;
				2901	name = htmlParseHTMLName(ctxt);
				2902	if (name == NULL) {
				2903	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2904	ctxt->sax->error(ctxt->userData,
				2905	"htmlParseStartTag: invalid element name\n");
				2906	ctxt->wellFormed = 0;
				2907	/* Dump the bogus tag like browsers do */
				2908	while ((IS_CHAR(CUR)) && (CUR != '>'))
				2909	NEXT;
				2910	return;
				2911	}
				2912	if (xmlStrEqual(name, BAD_CAST"meta"))
				2913	meta = 1;
				2914
				2915	/*
				2916	* Check for auto-closure of HTML elements.
				2917	*/
				2918	htmlAutoClose(ctxt, name);
				2919
				2920	/*
				2921	* Check for implied HTML elements.
				2922	*/
				2923	htmlCheckImplied(ctxt, name);
				2924
				2925	/*
				2926	* Avoid html at any level > 0, head at any level != 1
				2927	* or any attempt to recurse body
				2928	*/
				2929	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
				2930	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2931	ctxt->sax->error(ctxt->userData,
				2932	"htmlParseStartTag: misplaced <html> tag\n");
				2933	ctxt->wellFormed = 0;
				2934	xmlFree(name);
				2935	return;
				2936	}
				2937	if ((ctxt->nameNr != 1) &&
				2938	(xmlStrEqual(name, BAD_CAST"head"))) {
				2939	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2940	ctxt->sax->error(ctxt->userData,
				2941	"htmlParseStartTag: misplaced <head> tag\n");
				2942	ctxt->wellFormed = 0;
				2943	xmlFree(name);
				2944	return;
				2945	}
				2946	if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	2947	int indx;
				2948	for (indx = 0;indx < ctxt->nameNr;indx++) {
				2949	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2950	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2951	ctxt->sax->error(ctxt->userData,
				2952	"htmlParseStartTag: misplaced <body> tag\n");
				2953	ctxt->wellFormed = 0;
				2954	xmlFree(name);
				2955	return;
				2956	}
				2957	}
				2958	}
				2959
				2960	/*
				2961	* Now parse the attributes, it ends up with the ending
				2962	*
				2963	* (S Attribute)* S?
				2964	*/
				2965	SKIP_BLANKS;
				2966	while ((IS_CHAR(CUR)) &&
				2967	(CUR != '>') &&
				2968	((CUR != '/') \|\| (NXT(1) != '>'))) {
				2969	long cons = ctxt->nbChars;
				2970
				2971	GROW;
				2972	attname = htmlParseAttribute(ctxt, &attvalue);
				2973	if (attname != NULL) {
				2974
				2975	/*
				2976	* Well formedness requires at most one declaration of an attribute
				2977	*/
				2978	for (i = 0; i < nbatts;i += 2) {
				2979	if (xmlStrEqual(atts[i], attname)) {
				2980	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				2981	ctxt->sax->error(ctxt->userData,
				2982	"Attribute %s redefined\n",
				2983	attname);
				2984	ctxt->wellFormed = 0;
				2985	xmlFree(attname);
				2986	if (attvalue != NULL)
				2987	xmlFree(attvalue);
				2988	goto failed;
				2989	}
				2990	}
				2991
				2992	/*
				2993	* Add the pair to atts
				2994	*/
				2995	if (atts == NULL) {
				2996	maxatts = 10;
				2997	atts = (const xmlChar *) xmlMalloc(maxatts sizeof(xmlChar *));
				2998	if (atts == NULL) {
				2999	xmlGenericError(xmlGenericErrorContext,
				3000	"malloc of %ld byte failed\n",
				3001	maxatts * (long)sizeof(xmlChar *));
				3002	if (name != NULL) xmlFree(name);
				3003	return;
				3004	}
				3005	} else if (nbatts + 4 > maxatts) {
				3006	maxatts *= 2;
				3007	atts = (const xmlChar *) xmlRealloc((void ) atts,
				3008	maxatts * sizeof(xmlChar *));
				3009	if (atts == NULL) {
				3010	xmlGenericError(xmlGenericErrorContext,
				3011	"realloc of %ld byte failed\n",
				3012	maxatts * (long)sizeof(xmlChar *));
				3013	if (name != NULL) xmlFree(name);
				3014	return;
				3015	}
				3016	}
				3017	atts[nbatts++] = attname;
				3018	atts[nbatts++] = attvalue;
				3019	atts[nbatts] = NULL;
				3020	atts[nbatts + 1] = NULL;
				3021	}
				3022	else {
				3023	/* Dump the bogus attribute string up to the next blank or
				3024	* the end of the tag. */
				3025	while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
				3026	&& ((CUR != '/') \|\| (NXT(1) != '>')))
				3027	NEXT;
				3028	}
				3029
				3030	failed:
				3031	SKIP_BLANKS;
				3032	if (cons == ctxt->nbChars) {
				3033	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3034	ctxt->sax->error(ctxt->userData,
				3035	"htmlParseStartTag: problem parsing attributes\n");
				3036	ctxt->wellFormed = 0;
				3037	break;
				3038	}
				3039	}
				3040
				3041	/*
				3042	* Handle specific association to the META tag
				3043	*/
				3044	if (meta)
				3045	htmlCheckMeta(ctxt, atts);
				3046
				3047	/*
				3048	* SAX: Start of Element !
				3049	*/
				3050	htmlnamePush(ctxt, xmlStrdup(name));
				3051	#ifdef DEBUG
				3052	xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
				3053	#endif
				3054	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
				3055	ctxt->sax->startElement(ctxt->userData, name, atts);
				3056
				3057	if (atts != NULL) {
				3058	for (i = 0;i < nbatts;i++) {
				3059	if (atts[i] != NULL)
				3060	xmlFree((xmlChar *) atts[i]);
				3061	}
				3062	xmlFree((void *) atts);
				3063	}
				3064	if (name != NULL) xmlFree(name);
				3065	}
				3066
				3067	/**
				3068	* htmlParseEndTag:
				3069	* @ctxt: an HTML parser context
				3070	*
				3071	* parse an end of tag
				3072	*
				3073	* [42] ETag ::= '</' Name S? '>'
				3074	*
				3075	* With namespace
				3076	*
				3077	* [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3078	*
				3079	* Returns 1 if the current level should be closed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3080	*/
				3081
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3082	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3083	htmlParseEndTag(htmlParserCtxtPtr ctxt) {
				3084	xmlChar *name;
				3085	xmlChar *oldname;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3086	int i, ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3087
				3088	if ((CUR != '<') \|\| (NXT(1) != '/')) {
				3089	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3090	ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
				3091	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3092	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3093	}
				3094	SKIP(2);
				3095
				3096	name = htmlParseHTMLName(ctxt);
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3097	if (name == NULL) return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3098
				3099	/*
				3100	* We should definitely be at the ending "S? '>'" part
				3101	*/
				3102	SKIP_BLANKS;
				3103	if ((!IS_CHAR(CUR)) \|\| (CUR != '>')) {
				3104	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3105	ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
				3106	ctxt->wellFormed = 0;
				3107	} else
				3108	NEXT;
				3109
				3110	/*
				3111	* If the name read is not one of the element in the parsing stack
				3112	* then return, it's just an error.
				3113	*/
				3114	for (i = (ctxt->nameNr - 1);i >= 0;i--) {
				3115	if (xmlStrEqual(name, ctxt->nameTab[i])) break;
				3116	}
				3117	if (i < 0) {
				3118	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3119	ctxt->sax->error(ctxt->userData,
				3120	"Unexpected end tag : %s\n", name);
				3121	xmlFree(name);
				3122	ctxt->wellFormed = 0;
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3123	return(0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3124	}
				3125
				3126
				3127	/*
				3128	* Check for auto-closure of HTML elements.
				3129	*/
				3130
				3131	htmlAutoCloseOnClose(ctxt, name);
				3132
				3133	/*
				3134	* Well formedness constraints, opening and closing must match.
				3135	* With the exception that the autoclose may have popped stuff out
				3136	* of the stack.
				3137	*/
				3138	if (!xmlStrEqual(name, ctxt->name)) {
				3139	#ifdef DEBUG
				3140	xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
				3141	#endif
				3142	if ((ctxt->name != NULL) &&
				3143	(!xmlStrEqual(ctxt->name, name))) {
				3144	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3145	ctxt->sax->error(ctxt->userData,
				3146	"Opening and ending tag mismatch: %s and %s\n",
				3147	name, ctxt->name);
				3148	ctxt->wellFormed = 0;
				3149	}
				3150	}
				3151
				3152	/*
				3153	* SAX: End of Tag
				3154	*/
				3155	oldname = ctxt->name;
				3156	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
				3157	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3158	ctxt->sax->endElement(ctxt->userData, name);
				3159	oldname = htmlnamePop(ctxt);
				3160	if (oldname != NULL) {
				3161	#ifdef DEBUG
				3162	xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
				3163	#endif
				3164	xmlFree(oldname);
				3165	#ifdef DEBUG
				3166	} else {
				3167	xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
				3168	#endif
				3169	}
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3170	ret = 1;
				3171	} else {
				3172	ret = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3173	}
				3174
				3175	if (name != NULL)
				3176	xmlFree(name);
				3177
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3178	return(ret);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3179	}
				3180
				3181
				3182	/**
				3183	* htmlParseReference:
				3184	* @ctxt: an HTML parser context
				3185	*
				3186	* parse and handle entity references in content,
				3187	* this will end-up in a call to character() since this is either a
				3188	* CharRef, or a predefined entity.
				3189	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3190	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3191	htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	3192	const htmlEntityDesc * ent;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3193	xmlChar out[6];
				3194	xmlChar *name;
				3195	if (CUR != '&') return;
				3196
				3197	if (NXT(1) == '#') {
				3198	unsigned int c;
				3199	int bits, i = 0;
				3200
				3201	c = htmlParseCharRef(ctxt);
				3202	if (c == 0)
				3203	return;
				3204
				3205	if (c < 0x80) { out[i++]= c; bits= -6; }
				3206	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3207	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3208	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3209
				3210	for ( ; bits >= 0; bits-= 6) {
				3211	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3212	}
				3213	out[i] = 0;
				3214
				3215	htmlCheckParagraph(ctxt);
				3216	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3217	ctxt->sax->characters(ctxt->userData, out, i);
				3218	} else {
				3219	ent = htmlParseEntityRef(ctxt, &name);
				3220	if (name == NULL) {
				3221	htmlCheckParagraph(ctxt);
				3222	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3223	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3224	return;
				3225	}
				3226	if ((ent == NULL) \|\| (ent->value <= 0)) {
				3227	htmlCheckParagraph(ctxt);
				3228	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
				3229	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
				3230	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
				3231	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
				3232	}
				3233	} else {
				3234	unsigned int c;
				3235	int bits, i = 0;
				3236
				3237	c = ent->value;
				3238	if (c < 0x80)
				3239	{ out[i++]= c; bits= -6; }
				3240	else if (c < 0x800)
				3241	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				3242	else if (c < 0x10000)
				3243	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				3244	else
				3245	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				3246
				3247	for ( ; bits >= 0; bits-= 6) {
				3248	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
				3249	}
				3250	out[i] = 0;
				3251
				3252	htmlCheckParagraph(ctxt);
				3253	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				3254	ctxt->sax->characters(ctxt->userData, out, i);
				3255	}
				3256	xmlFree(name);
				3257	}
				3258	}
				3259
				3260	/**
				3261	* htmlParseContent:
				3262	* @ctxt: an HTML parser context
				3263	* @name: the node name
				3264	*
				3265	* Parse a content: comment, sub-element, reference or text.
				3266	*
				3267	*/
				3268
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3269	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3270	htmlParseContent(htmlParserCtxtPtr ctxt) {
				3271	xmlChar *currentNode;
				3272	int depth;
				3273
				3274	currentNode = xmlStrdup(ctxt->name);
				3275	depth = ctxt->nameNr;
				3276	while (1) {
				3277	long cons = ctxt->nbChars;
				3278
				3279	GROW;
				3280	/*
				3281	* Our tag or one of it's parent or children is ending.
				3282	*/
				3283	if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3284	if (htmlParseEndTag(ctxt) &&
				3285	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
				3286	if (currentNode != NULL)
				3287	xmlFree(currentNode);
				3288	return;
				3289	}
				3290	continue; /* while */
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3291	}
				3292
				3293	/*
				3294	* Has this node been popped out during parsing of
				3295	* the next element
				3296	*/
Daniel Veillard	f420ac5	2001-07-04 16:04:09 +0000	[diff] [blame]	3297	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
				3298	(!xmlStrEqual(currentNode, ctxt->name)))
				3299	{
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3300	if (currentNode != NULL) xmlFree(currentNode);
				3301	return;
				3302	}
				3303
Daniel Veillard	f9533d1	2001-03-03 10:04:57 +0000	[diff] [blame]	3304	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
				3305	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3306	/*
				3307	* Handle SCRIPT/STYLE separately
				3308	*/
				3309	htmlParseScript(ctxt);
				3310	} else {
				3311	/*
				3312	* Sometimes DOCTYPE arrives in the middle of the document
				3313	*/
				3314	if ((CUR == '<') && (NXT(1) == '!') &&
				3315	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3316	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3317	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3318	(UPP(8) == 'E')) {
				3319	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3320	ctxt->sax->error(ctxt->userData,
				3321	"Misplaced DOCTYPE declaration\n");
				3322	ctxt->wellFormed = 0;
				3323	htmlParseDocTypeDecl(ctxt);
				3324	}
				3325
				3326	/*
				3327	* First case : a comment
				3328	*/
				3329	if ((CUR == '<') && (NXT(1) == '!') &&
				3330	(NXT(2) == '-') && (NXT(3) == '-')) {
				3331	htmlParseComment(ctxt);
				3332	}
				3333
				3334	/*
				3335	* Second case : a sub-element.
				3336	*/
				3337	else if (CUR == '<') {
				3338	htmlParseElement(ctxt);
				3339	}
				3340
				3341	/*
				3342	* Third case : a reference. If if has not been resolved,
				3343	* parsing returns it's Name, create the node
				3344	*/
				3345	else if (CUR == '&') {
				3346	htmlParseReference(ctxt);
				3347	}
				3348
				3349	/*
				3350	* Fourth : end of the resource
				3351	*/
				3352	else if (CUR == 0) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3353	htmlAutoCloseOnEnd(ctxt);
				3354	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3355	}
				3356
				3357	/*
				3358	* Last case, text. Note that References are handled directly.
				3359	*/
				3360	else {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3361	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3362	}
				3363
				3364	if (cons == ctxt->nbChars) {
				3365	if (ctxt->node != NULL) {
				3366	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3367	ctxt->sax->error(ctxt->userData,
				3368	"detected an error in element content\n");
				3369	ctxt->wellFormed = 0;
				3370	}
				3371	break;
				3372	}
				3373	}
				3374	GROW;
				3375	}
				3376	if (currentNode != NULL) xmlFree(currentNode);
				3377	}
				3378
				3379	/**
				3380	* htmlParseElement:
				3381	* @ctxt: an HTML parser context
				3382	*
				3383	* parse an HTML element, this is highly recursive
				3384	*
				3385	* [39] element ::= EmptyElemTag \| STag content ETag
				3386	*
				3387	* [41] Attribute ::= Name Eq AttValue
				3388	*/
				3389
				3390	void
				3391	htmlParseElement(htmlParserCtxtPtr ctxt) {
				3392	xmlChar *name;
				3393	xmlChar *currentNode = NULL;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	3394	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3395	htmlParserNodeInfo node_info;
				3396	xmlChar *oldname;
				3397	int depth = ctxt->nameNr;
Daniel Veillard	3fbe8e3	2001-10-06 13:30:33 +0000	[diff] [blame^]	3398	const xmlChar *oldptr;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3399
				3400	/* Capture start position */
				3401	if (ctxt->record_info) {
				3402	node_info.begin_pos = ctxt->input->consumed +
				3403	(CUR_PTR - ctxt->input->base);
				3404	node_info.begin_line = ctxt->input->line;
				3405	}
				3406
				3407	oldname = xmlStrdup(ctxt->name);
				3408	htmlParseStartTag(ctxt);
				3409	name = ctxt->name;
				3410	#ifdef DEBUG
				3411	if (oldname == NULL)
				3412	xmlGenericError(xmlGenericErrorContext,
				3413	"Start of element %s\n", name);
				3414	else if (name == NULL)
				3415	xmlGenericError(xmlGenericErrorContext,
				3416	"Start of element failed, was %s\n", oldname);
				3417	else
				3418	xmlGenericError(xmlGenericErrorContext,
				3419	"Start of element %s, was %s\n", name, oldname);
				3420	#endif
				3421	if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) \|\|
				3422	(name == NULL)) {
				3423	if (CUR == '>')
				3424	NEXT;
				3425	if (oldname != NULL)
				3426	xmlFree(oldname);
				3427	return;
				3428	}
				3429	if (oldname != NULL)
				3430	xmlFree(oldname);
				3431
				3432	/*
				3433	* Lookup the info for that element.
				3434	*/
				3435	info = htmlTagLookup(name);
				3436	if (info == NULL) {
				3437	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3438	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				3439	name);
				3440	ctxt->wellFormed = 0;
				3441	} else if (info->depr) {
				3442	/***************************
				3443	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				3444	ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
				3445	name);
				3446	***************************/
				3447	}
				3448
				3449	/*
				3450	* Check for an Empty Element labelled the XML/SGML way
				3451	*/
				3452	if ((CUR == '/') && (NXT(1) == '>')) {
				3453	SKIP(2);
				3454	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3455	ctxt->sax->endElement(ctxt->userData, name);
				3456	oldname = htmlnamePop(ctxt);
				3457	#ifdef DEBUG
				3458	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
				3459	#endif
				3460	if (oldname != NULL)
				3461	xmlFree(oldname);
				3462	return;
				3463	}
				3464
				3465	if (CUR == '>') {
				3466	NEXT;
				3467	} else {
				3468	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3469	ctxt->sax->error(ctxt->userData,
				3470	"Couldn't find end of Start Tag %s\n",
				3471	name);
				3472	ctxt->wellFormed = 0;
				3473
				3474	/*
				3475	* end of parsing of this node.
				3476	*/
				3477	if (xmlStrEqual(name, ctxt->name)) {
				3478	nodePop(ctxt);
				3479	oldname = htmlnamePop(ctxt);
				3480	#ifdef DEBUG
				3481	xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
				3482	#endif
				3483	if (oldname != NULL)
				3484	xmlFree(oldname);
				3485	}
				3486
				3487	/*
				3488	* Capture end position and add node
				3489	*/
				3490	if ( currentNode != NULL && ctxt->record_info ) {
				3491	node_info.end_pos = ctxt->input->consumed +
				3492	(CUR_PTR - ctxt->input->base);
				3493	node_info.end_line = ctxt->input->line;
				3494	node_info.node = ctxt->node;
				3495	xmlParserAddNodeInfo(ctxt, &node_info);
				3496	}
				3497	return;
				3498	}
				3499
				3500	/*
				3501	* Check for an Empty Element from DTD definition
				3502	*/
				3503	if ((info != NULL) && (info->empty)) {
				3504	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				3505	ctxt->sax->endElement(ctxt->userData, name);
				3506	oldname = htmlnamePop(ctxt);
				3507	#ifdef DEBUG
				3508	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				3509	#endif
				3510	if (oldname != NULL)
				3511	xmlFree(oldname);
				3512	return;
				3513	}
				3514
				3515	/*
				3516	* Parse the content of the element:
				3517	*/
				3518	currentNode = xmlStrdup(ctxt->name);
				3519	depth = ctxt->nameNr;
				3520	while (IS_CHAR(CUR)) {
William M. Brack	d28e48a	2001-09-23 01:55:08 +0000	[diff] [blame]	3521	oldptr = ctxt->input->cur;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3522	htmlParseContent(ctxt);
William M. Brack	d28e48a	2001-09-23 01:55:08 +0000	[diff] [blame]	3523	if (oldptr==ctxt->input->cur) break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3524	if (ctxt->nameNr < depth) break;
				3525	}
				3526
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3527	/*
				3528	* Capture end position and add node
				3529	*/
				3530	if ( currentNode != NULL && ctxt->record_info ) {
				3531	node_info.end_pos = ctxt->input->consumed +
				3532	(CUR_PTR - ctxt->input->base);
				3533	node_info.end_line = ctxt->input->line;
				3534	node_info.node = ctxt->node;
				3535	xmlParserAddNodeInfo(ctxt, &node_info);
				3536	}
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3537	if (!IS_CHAR(CUR)) {
				3538	htmlAutoCloseOnEnd(ctxt);
				3539	}
				3540
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3541	if (currentNode != NULL)
				3542	xmlFree(currentNode);
				3543	}
				3544
				3545	/**
				3546	* htmlParseDocument :
				3547	* @ctxt: an HTML parser context
				3548	*
				3549	* parse an HTML document (and build a tree if using the standard SAX
				3550	* interface).
				3551	*
				3552	* Returns 0, -1 in case of error. the parser context is augmented
				3553	* as a result of the parsing.
				3554	*/
				3555
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3556	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3557	htmlParseDocument(htmlParserCtxtPtr ctxt) {
				3558	xmlDtdPtr dtd;
				3559
				3560	htmlDefaultSAXHandlerInit();
				3561	ctxt->html = 1;
				3562
				3563	GROW;
				3564	/*
				3565	* SAX: beginning of the document processing.
				3566	*/
				3567	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				3568	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
				3569
				3570	/*
				3571	* Wipe out everything which is before the first '<'
				3572	*/
				3573	SKIP_BLANKS;
				3574	if (CUR == 0) {
				3575	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				3576	ctxt->sax->error(ctxt->userData, "Document is empty\n");
				3577	ctxt->wellFormed = 0;
				3578	}
				3579
				3580	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
				3581	ctxt->sax->startDocument(ctxt->userData);
				3582
				3583
				3584	/*
				3585	* Parse possible comments before any content
				3586	*/
				3587	while ((CUR == '<') && (NXT(1) == '!') &&
				3588	(NXT(2) == '-') && (NXT(3) == '-')) {
				3589	htmlParseComment(ctxt);
				3590	SKIP_BLANKS;
				3591	}
				3592
				3593
				3594	/*
				3595	* Then possibly doc type declaration(s) and more Misc
				3596	* (doctypedecl Misc*)?
				3597	*/
				3598	if ((CUR == '<') && (NXT(1) == '!') &&
				3599	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				3600	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				3601	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				3602	(UPP(8) == 'E')) {
				3603	htmlParseDocTypeDecl(ctxt);
				3604	}
				3605	SKIP_BLANKS;
				3606
				3607	/*
				3608	* Parse possible comments before any content
				3609	*/
				3610	while ((CUR == '<') && (NXT(1) == '!') &&
				3611	(NXT(2) == '-') && (NXT(3) == '-')) {
				3612	htmlParseComment(ctxt);
				3613	SKIP_BLANKS;
				3614	}
				3615
				3616	/*
				3617	* Time to start parsing the tree itself
				3618	*/
				3619	htmlParseContent(ctxt);
				3620
				3621	/*
				3622	* autoclose
				3623	*/
				3624	if (CUR == 0)
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3625	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3626
				3627
				3628	/*
				3629	* SAX: end of the document processing.
				3630	*/
				3631	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3632	ctxt->sax->endDocument(ctxt->userData);
				3633
				3634	if (ctxt->myDoc != NULL) {
				3635	dtd = xmlGetIntSubset(ctxt->myDoc);
				3636	if (dtd == NULL)
				3637	ctxt->myDoc->intSubset =
				3638	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				3639	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				3640	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				3641	}
				3642	if (! ctxt->wellFormed) return(-1);
				3643	return(0);
				3644	}
				3645
				3646
				3647	/************************************************************************
				3648	* *
				3649	* Parser contexts handling *
				3650	* *
				3651	************************************************************************/
				3652
				3653	/**
				3654	* xmlInitParserCtxt:
				3655	* @ctxt: an HTML parser context
				3656	*
				3657	* Initialize a parser context
				3658	*/
				3659
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3660	static void
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3661	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
				3662	{
				3663	htmlSAXHandler *sax;
				3664
				3665	if (ctxt == NULL) return;
				3666	memset(ctxt, 0, sizeof(htmlParserCtxt));
				3667
				3668	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
				3669	if (sax == NULL) {
				3670	xmlGenericError(xmlGenericErrorContext,
				3671	"htmlInitParserCtxt: out of memory\n");
				3672	}
				3673	else
				3674	memset(sax, 0, sizeof(htmlSAXHandler));
				3675
				3676	/* Allocate the Input stack */
				3677	ctxt->inputTab = (htmlParserInputPtr *)
				3678	xmlMalloc(5 * sizeof(htmlParserInputPtr));
				3679	if (ctxt->inputTab == NULL) {
				3680	xmlGenericError(xmlGenericErrorContext,
				3681	"htmlInitParserCtxt: out of memory\n");
				3682	ctxt->inputNr = 0;
				3683	ctxt->inputMax = 0;
				3684	ctxt->input = NULL;
				3685	return;
				3686	}
				3687	ctxt->inputNr = 0;
				3688	ctxt->inputMax = 5;
				3689	ctxt->input = NULL;
				3690	ctxt->version = NULL;
				3691	ctxt->encoding = NULL;
				3692	ctxt->standalone = -1;
				3693	ctxt->instate = XML_PARSER_START;
				3694
				3695	/* Allocate the Node stack */
				3696	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
				3697	if (ctxt->nodeTab == NULL) {
				3698	xmlGenericError(xmlGenericErrorContext,
				3699	"htmlInitParserCtxt: out of memory\n");
				3700	ctxt->nodeNr = 0;
				3701	ctxt->nodeMax = 0;
				3702	ctxt->node = NULL;
				3703	ctxt->inputNr = 0;
				3704	ctxt->inputMax = 0;
				3705	ctxt->input = NULL;
				3706	return;
				3707	}
				3708	ctxt->nodeNr = 0;
				3709	ctxt->nodeMax = 10;
				3710	ctxt->node = NULL;
				3711
				3712	/* Allocate the Name stack */
				3713	ctxt->nameTab = (xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
				3714	if (ctxt->nameTab == NULL) {
				3715	xmlGenericError(xmlGenericErrorContext,
				3716	"htmlInitParserCtxt: out of memory\n");
				3717	ctxt->nameNr = 0;
				3718	ctxt->nameMax = 10;
				3719	ctxt->name = NULL;
				3720	ctxt->nodeNr = 0;
				3721	ctxt->nodeMax = 0;
				3722	ctxt->node = NULL;
				3723	ctxt->inputNr = 0;
				3724	ctxt->inputMax = 0;
				3725	ctxt->input = NULL;
				3726	return;
				3727	}
				3728	ctxt->nameNr = 0;
				3729	ctxt->nameMax = 10;
				3730	ctxt->name = NULL;
				3731
				3732	if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
				3733	else {
				3734	ctxt->sax = sax;
				3735	memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
				3736	}
				3737	ctxt->userData = ctxt;
				3738	ctxt->myDoc = NULL;
				3739	ctxt->wellFormed = 1;
				3740	ctxt->replaceEntities = 0;
				3741	ctxt->html = 1;
				3742	ctxt->record_info = 0;
				3743	ctxt->validate = 0;
				3744	ctxt->nbChars = 0;
				3745	ctxt->checkIndex = 0;
Daniel Veillard	dc2cee2	2001-08-22 16:30:37 +0000	[diff] [blame]	3746	ctxt->catalogs = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3747	xmlInitNodeInfoSeq(&ctxt->node_seq);
				3748	}
				3749
				3750	/**
				3751	* htmlFreeParserCtxt:
				3752	* @ctxt: an HTML parser context
				3753	*
				3754	* Free all the memory used by a parser context. However the parsed
				3755	* document in ctxt->myDoc is not freed.
				3756	*/
				3757
				3758	void
				3759	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
				3760	{
				3761	xmlFreeParserCtxt(ctxt);
				3762	}
				3763
				3764	/**
				3765	* htmlCreateDocParserCtxt :
				3766	* @cur: a pointer to an array of xmlChar
				3767	* @encoding: a free form C string describing the HTML document encoding, or NULL
				3768	*
				3769	* Create a parser context for an HTML document.
				3770	*
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3771	* TODO: check the need to add encoding handling there
				3772	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3773	* Returns the new parser context or NULL
				3774	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3775	static htmlParserCtxtPtr
Daniel Veillard	c86a4fa	2001-03-26 16:28:29 +0000	[diff] [blame]	3776	htmlCreateDocParserCtxt(xmlChar cur, const char encoding ATTRIBUTE_UNUSED) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3777	htmlParserCtxtPtr ctxt;
				3778	htmlParserInputPtr input;
				3779	/* htmlCharEncoding enc; */
				3780
				3781	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				3782	if (ctxt == NULL) {
				3783	perror("malloc");
				3784	return(NULL);
				3785	}
				3786	htmlInitParserCtxt(ctxt);
				3787	input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				3788	if (input == NULL) {
				3789	perror("malloc");
				3790	xmlFree(ctxt);
				3791	return(NULL);
				3792	}
				3793	memset(input, 0, sizeof(htmlParserInput));
				3794
				3795	input->line = 1;
				3796	input->col = 1;
				3797	input->base = cur;
				3798	input->cur = cur;
				3799
				3800	inputPush(ctxt, input);
				3801	return(ctxt);
				3802	}
				3803
				3804	/************************************************************************
				3805	* *
				3806	* Progressive parsing interfaces *
				3807	* *
				3808	************************************************************************/
				3809
				3810	/**
				3811	* htmlParseLookupSequence:
				3812	* @ctxt: an HTML parser context
				3813	* @first: the first char to lookup
				3814	* @next: the next char to lookup or zero
				3815	* @third: the next char to lookup or zero
				3816	*
				3817	* Try to find if a sequence (first, next, third) or just (first next) or
				3818	* (first) is available in the input stream.
				3819	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
				3820	* to avoid rescanning sequences of bytes, it DOES change the state of the
				3821	* parser, do not use liberally.
				3822	* This is basically similar to xmlParseLookupSequence()
				3823	*
				3824	* Returns the index to the current parsing point if the full sequence
				3825	* is available, -1 otherwise.
				3826	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3827	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3828	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
				3829	xmlChar next, xmlChar third) {
				3830	int base, len;
				3831	htmlParserInputPtr in;
				3832	const xmlChar *buf;
				3833
				3834	in = ctxt->input;
				3835	if (in == NULL) return(-1);
				3836	base = in->cur - in->base;
				3837	if (base < 0) return(-1);
				3838	if (ctxt->checkIndex > base)
				3839	base = ctxt->checkIndex;
				3840	if (in->buf == NULL) {
				3841	buf = in->base;
				3842	len = in->length;
				3843	} else {
				3844	buf = in->buf->buffer->content;
				3845	len = in->buf->buffer->use;
				3846	}
				3847	/* take into account the sequence length */
				3848	if (third) len -= 2;
				3849	else if (next) len --;
				3850	for (;base < len;base++) {
				3851	if (buf[base] == first) {
				3852	if (third != 0) {
				3853	if ((buf[base + 1] != next) \|\|
				3854	(buf[base + 2] != third)) continue;
				3855	} else if (next != 0) {
				3856	if (buf[base + 1] != next) continue;
				3857	}
				3858	ctxt->checkIndex = 0;
				3859	#ifdef DEBUG_PUSH
				3860	if (next == 0)
				3861	xmlGenericError(xmlGenericErrorContext,
				3862	"HPP: lookup '%c' found at %d\n",
				3863	first, base);
				3864	else if (third == 0)
				3865	xmlGenericError(xmlGenericErrorContext,
				3866	"HPP: lookup '%c%c' found at %d\n",
				3867	first, next, base);
				3868	else
				3869	xmlGenericError(xmlGenericErrorContext,
				3870	"HPP: lookup '%c%c%c' found at %d\n",
				3871	first, next, third, base);
				3872	#endif
				3873	return(base - (in->cur - in->base));
				3874	}
				3875	}
				3876	ctxt->checkIndex = base;
				3877	#ifdef DEBUG_PUSH
				3878	if (next == 0)
				3879	xmlGenericError(xmlGenericErrorContext,
				3880	"HPP: lookup '%c' failed\n", first);
				3881	else if (third == 0)
				3882	xmlGenericError(xmlGenericErrorContext,
				3883	"HPP: lookup '%c%c' failed\n", first, next);
				3884	else
				3885	xmlGenericError(xmlGenericErrorContext,
				3886	"HPP: lookup '%c%c%c' failed\n", first, next, third);
				3887	#endif
				3888	return(-1);
				3889	}
				3890
				3891	/**
				3892	* htmlParseTryOrFinish:
				3893	* @ctxt: an HTML parser context
				3894	* @terminate: last chunk indicator
				3895	*
				3896	* Try to progress on parsing
				3897	*
				3898	* Returns zero if no parsing was possible
				3899	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	3900	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3901	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
				3902	int ret = 0;
				3903	htmlParserInputPtr in;
				3904	int avail = 0;
				3905	xmlChar cur, next;
				3906
				3907	#ifdef DEBUG_PUSH
				3908	switch (ctxt->instate) {
				3909	case XML_PARSER_EOF:
				3910	xmlGenericError(xmlGenericErrorContext,
				3911	"HPP: try EOF\n"); break;
				3912	case XML_PARSER_START:
				3913	xmlGenericError(xmlGenericErrorContext,
				3914	"HPP: try START\n"); break;
				3915	case XML_PARSER_MISC:
				3916	xmlGenericError(xmlGenericErrorContext,
				3917	"HPP: try MISC\n");break;
				3918	case XML_PARSER_COMMENT:
				3919	xmlGenericError(xmlGenericErrorContext,
				3920	"HPP: try COMMENT\n");break;
				3921	case XML_PARSER_PROLOG:
				3922	xmlGenericError(xmlGenericErrorContext,
				3923	"HPP: try PROLOG\n");break;
				3924	case XML_PARSER_START_TAG:
				3925	xmlGenericError(xmlGenericErrorContext,
				3926	"HPP: try START_TAG\n");break;
				3927	case XML_PARSER_CONTENT:
				3928	xmlGenericError(xmlGenericErrorContext,
				3929	"HPP: try CONTENT\n");break;
				3930	case XML_PARSER_CDATA_SECTION:
				3931	xmlGenericError(xmlGenericErrorContext,
				3932	"HPP: try CDATA_SECTION\n");break;
				3933	case XML_PARSER_END_TAG:
				3934	xmlGenericError(xmlGenericErrorContext,
				3935	"HPP: try END_TAG\n");break;
				3936	case XML_PARSER_ENTITY_DECL:
				3937	xmlGenericError(xmlGenericErrorContext,
				3938	"HPP: try ENTITY_DECL\n");break;
				3939	case XML_PARSER_ENTITY_VALUE:
				3940	xmlGenericError(xmlGenericErrorContext,
				3941	"HPP: try ENTITY_VALUE\n");break;
				3942	case XML_PARSER_ATTRIBUTE_VALUE:
				3943	xmlGenericError(xmlGenericErrorContext,
				3944	"HPP: try ATTRIBUTE_VALUE\n");break;
				3945	case XML_PARSER_DTD:
				3946	xmlGenericError(xmlGenericErrorContext,
				3947	"HPP: try DTD\n");break;
				3948	case XML_PARSER_EPILOG:
				3949	xmlGenericError(xmlGenericErrorContext,
				3950	"HPP: try EPILOG\n");break;
				3951	case XML_PARSER_PI:
				3952	xmlGenericError(xmlGenericErrorContext,
				3953	"HPP: try PI\n");break;
				3954	case XML_PARSER_SYSTEM_LITERAL:
				3955	xmlGenericError(xmlGenericErrorContext,
				3956	"HPP: try SYSTEM_LITERAL\n");break;
				3957	}
				3958	#endif
				3959
				3960	while (1) {
				3961
				3962	in = ctxt->input;
				3963	if (in == NULL) break;
				3964	if (in->buf == NULL)
				3965	avail = in->length - (in->cur - in->base);
				3966	else
				3967	avail = in->buf->buffer->use - (in->cur - in->base);
				3968	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	3969	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	3970	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				3971	/*
				3972	* SAX: end of the document processing.
				3973	*/
				3974	ctxt->instate = XML_PARSER_EOF;
				3975	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				3976	ctxt->sax->endDocument(ctxt->userData);
				3977	}
				3978	}
				3979	if (avail < 1)
				3980	goto done;
				3981	switch (ctxt->instate) {
				3982	case XML_PARSER_EOF:
				3983	/*
				3984	* Document parsing is done !
				3985	*/
				3986	goto done;
				3987	case XML_PARSER_START:
				3988	/*
				3989	* Very first chars read from the document flow.
				3990	*/
				3991	cur = in->cur[0];
				3992	if (IS_BLANK(cur)) {
				3993	SKIP_BLANKS;
				3994	if (in->buf == NULL)
				3995	avail = in->length - (in->cur - in->base);
				3996	else
				3997	avail = in->buf->buffer->use - (in->cur - in->base);
				3998	}
				3999	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
				4000	ctxt->sax->setDocumentLocator(ctxt->userData,
				4001	&xmlDefaultSAXLocator);
				4002	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
				4003	(!ctxt->disableSAX))
				4004	ctxt->sax->startDocument(ctxt->userData);
				4005
				4006	cur = in->cur[0];
				4007	next = in->cur[1];
				4008	if ((cur == '<') && (next == '!') &&
				4009	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4010	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4011	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4012	(UPP(8) == 'E')) {
				4013	if ((!terminate) &&
				4014	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4015	goto done;
				4016	#ifdef DEBUG_PUSH
				4017	xmlGenericError(xmlGenericErrorContext,
				4018	"HPP: Parsing internal subset\n");
				4019	#endif
				4020	htmlParseDocTypeDecl(ctxt);
				4021	ctxt->instate = XML_PARSER_PROLOG;
				4022	#ifdef DEBUG_PUSH
				4023	xmlGenericError(xmlGenericErrorContext,
				4024	"HPP: entering PROLOG\n");
				4025	#endif
				4026	} else {
				4027	ctxt->instate = XML_PARSER_MISC;
				4028	}
				4029	#ifdef DEBUG_PUSH
				4030	xmlGenericError(xmlGenericErrorContext,
				4031	"HPP: entering MISC\n");
				4032	#endif
				4033	break;
				4034	case XML_PARSER_MISC:
				4035	SKIP_BLANKS;
				4036	if (in->buf == NULL)
				4037	avail = in->length - (in->cur - in->base);
				4038	else
				4039	avail = in->buf->buffer->use - (in->cur - in->base);
				4040	if (avail < 2)
				4041	goto done;
				4042	cur = in->cur[0];
				4043	next = in->cur[1];
				4044	if ((cur == '<') && (next == '!') &&
				4045	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4046	if ((!terminate) &&
				4047	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4048	goto done;
				4049	#ifdef DEBUG_PUSH
				4050	xmlGenericError(xmlGenericErrorContext,
				4051	"HPP: Parsing Comment\n");
				4052	#endif
				4053	htmlParseComment(ctxt);
				4054	ctxt->instate = XML_PARSER_MISC;
				4055	} else if ((cur == '<') && (next == '!') &&
				4056	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4057	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4058	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4059	(UPP(8) == 'E')) {
				4060	if ((!terminate) &&
				4061	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4062	goto done;
				4063	#ifdef DEBUG_PUSH
				4064	xmlGenericError(xmlGenericErrorContext,
				4065	"HPP: Parsing internal subset\n");
				4066	#endif
				4067	htmlParseDocTypeDecl(ctxt);
				4068	ctxt->instate = XML_PARSER_PROLOG;
				4069	#ifdef DEBUG_PUSH
				4070	xmlGenericError(xmlGenericErrorContext,
				4071	"HPP: entering PROLOG\n");
				4072	#endif
				4073	} else if ((cur == '<') && (next == '!') &&
				4074	(avail < 9)) {
				4075	goto done;
				4076	} else {
				4077	ctxt->instate = XML_PARSER_START_TAG;
				4078	#ifdef DEBUG_PUSH
				4079	xmlGenericError(xmlGenericErrorContext,
				4080	"HPP: entering START_TAG\n");
				4081	#endif
				4082	}
				4083	break;
				4084	case XML_PARSER_PROLOG:
				4085	SKIP_BLANKS;
				4086	if (in->buf == NULL)
				4087	avail = in->length - (in->cur - in->base);
				4088	else
				4089	avail = in->buf->buffer->use - (in->cur - in->base);
				4090	if (avail < 2)
				4091	goto done;
				4092	cur = in->cur[0];
				4093	next = in->cur[1];
				4094	if ((cur == '<') && (next == '!') &&
				4095	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4096	if ((!terminate) &&
				4097	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4098	goto done;
				4099	#ifdef DEBUG_PUSH
				4100	xmlGenericError(xmlGenericErrorContext,
				4101	"HPP: Parsing Comment\n");
				4102	#endif
				4103	htmlParseComment(ctxt);
				4104	ctxt->instate = XML_PARSER_PROLOG;
				4105	} else if ((cur == '<') && (next == '!') &&
				4106	(avail < 4)) {
				4107	goto done;
				4108	} else {
				4109	ctxt->instate = XML_PARSER_START_TAG;
				4110	#ifdef DEBUG_PUSH
				4111	xmlGenericError(xmlGenericErrorContext,
				4112	"HPP: entering START_TAG\n");
				4113	#endif
				4114	}
				4115	break;
				4116	case XML_PARSER_EPILOG:
				4117	if (in->buf == NULL)
				4118	avail = in->length - (in->cur - in->base);
				4119	else
				4120	avail = in->buf->buffer->use - (in->cur - in->base);
				4121	if (avail < 1)
				4122	goto done;
				4123	cur = in->cur[0];
				4124	if (IS_BLANK(cur)) {
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4125	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4126	goto done;
				4127	}
				4128	if (avail < 2)
				4129	goto done;
				4130	next = in->cur[1];
				4131	if ((cur == '<') && (next == '!') &&
				4132	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4133	if ((!terminate) &&
				4134	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4135	goto done;
				4136	#ifdef DEBUG_PUSH
				4137	xmlGenericError(xmlGenericErrorContext,
				4138	"HPP: Parsing Comment\n");
				4139	#endif
				4140	htmlParseComment(ctxt);
				4141	ctxt->instate = XML_PARSER_EPILOG;
				4142	} else if ((cur == '<') && (next == '!') &&
				4143	(avail < 4)) {
				4144	goto done;
				4145	} else {
				4146	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4147	ctxt->wellFormed = 0;
				4148	ctxt->instate = XML_PARSER_EOF;
				4149	#ifdef DEBUG_PUSH
				4150	xmlGenericError(xmlGenericErrorContext,
				4151	"HPP: entering EOF\n");
				4152	#endif
				4153	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4154	ctxt->sax->endDocument(ctxt->userData);
				4155	goto done;
				4156	}
				4157	break;
				4158	case XML_PARSER_START_TAG: {
				4159	xmlChar name, oldname;
				4160	int depth = ctxt->nameNr;
Daniel Veillard	bb37129	2001-08-16 23:26:59 +0000	[diff] [blame]	4161	const htmlElemDesc * info;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4162
				4163	if (avail < 2)
				4164	goto done;
				4165	cur = in->cur[0];
				4166	if (cur != '<') {
				4167	ctxt->instate = XML_PARSER_CONTENT;
				4168	#ifdef DEBUG_PUSH
				4169	xmlGenericError(xmlGenericErrorContext,
				4170	"HPP: entering CONTENT\n");
				4171	#endif
				4172	break;
				4173	}
Daniel Veillard	f69bb4b	2001-05-19 13:24:56 +0000	[diff] [blame]	4174	if (in->cur[1] == '/') {
				4175	ctxt->instate = XML_PARSER_END_TAG;
				4176	ctxt->checkIndex = 0;
				4177	#ifdef DEBUG_PUSH
				4178	xmlGenericError(xmlGenericErrorContext,
				4179	"HPP: entering END_TAG\n");
				4180	#endif
				4181	break;
				4182	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4183	if ((!terminate) &&
				4184	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4185	goto done;
				4186
				4187	oldname = xmlStrdup(ctxt->name);
				4188	htmlParseStartTag(ctxt);
				4189	name = ctxt->name;
				4190	#ifdef DEBUG
				4191	if (oldname == NULL)
				4192	xmlGenericError(xmlGenericErrorContext,
				4193	"Start of element %s\n", name);
				4194	else if (name == NULL)
				4195	xmlGenericError(xmlGenericErrorContext,
				4196	"Start of element failed, was %s\n",
				4197	oldname);
				4198	else
				4199	xmlGenericError(xmlGenericErrorContext,
				4200	"Start of element %s, was %s\n",
				4201	name, oldname);
				4202	#endif
				4203	if (((depth == ctxt->nameNr) &&
				4204	(xmlStrEqual(oldname, ctxt->name))) \|\|
				4205	(name == NULL)) {
				4206	if (CUR == '>')
				4207	NEXT;
				4208	if (oldname != NULL)
				4209	xmlFree(oldname);
				4210	break;
				4211	}
				4212	if (oldname != NULL)
				4213	xmlFree(oldname);
				4214
				4215	/*
				4216	* Lookup the info for that element.
				4217	*/
				4218	info = htmlTagLookup(name);
				4219	if (info == NULL) {
				4220	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4221	ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
				4222	name);
				4223	ctxt->wellFormed = 0;
				4224	} else if (info->depr) {
				4225	/***************************
				4226	if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
				4227	ctxt->sax->warning(ctxt->userData,
				4228	"Tag %s is deprecated\n",
				4229	name);
				4230	***************************/
				4231	}
				4232
				4233	/*
				4234	* Check for an Empty Element labelled the XML/SGML way
				4235	*/
				4236	if ((CUR == '/') && (NXT(1) == '>')) {
				4237	SKIP(2);
				4238	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4239	ctxt->sax->endElement(ctxt->userData, name);
				4240	oldname = htmlnamePop(ctxt);
				4241	#ifdef DEBUG
				4242	xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
				4243	oldname);
				4244	#endif
				4245	if (oldname != NULL)
				4246	xmlFree(oldname);
				4247	ctxt->instate = XML_PARSER_CONTENT;
				4248	#ifdef DEBUG_PUSH
				4249	xmlGenericError(xmlGenericErrorContext,
				4250	"HPP: entering CONTENT\n");
				4251	#endif
				4252	break;
				4253	}
				4254
				4255	if (CUR == '>') {
				4256	NEXT;
				4257	} else {
				4258	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4259	ctxt->sax->error(ctxt->userData,
				4260	"Couldn't find end of Start Tag %s\n",
				4261	name);
				4262	ctxt->wellFormed = 0;
				4263
				4264	/*
				4265	* end of parsing of this node.
				4266	*/
				4267	if (xmlStrEqual(name, ctxt->name)) {
				4268	nodePop(ctxt);
				4269	oldname = htmlnamePop(ctxt);
				4270	#ifdef DEBUG
				4271	xmlGenericError(xmlGenericErrorContext,
				4272	"End of start tag problem: popping out %s\n", oldname);
				4273	#endif
				4274	if (oldname != NULL)
				4275	xmlFree(oldname);
				4276	}
				4277
				4278	ctxt->instate = XML_PARSER_CONTENT;
				4279	#ifdef DEBUG_PUSH
				4280	xmlGenericError(xmlGenericErrorContext,
				4281	"HPP: entering CONTENT\n");
				4282	#endif
				4283	break;
				4284	}
				4285
				4286	/*
				4287	* Check for an Empty Element from DTD definition
				4288	*/
				4289	if ((info != NULL) && (info->empty)) {
				4290	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
				4291	ctxt->sax->endElement(ctxt->userData, name);
				4292	oldname = htmlnamePop(ctxt);
				4293	#ifdef DEBUG
				4294	xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
				4295	#endif
				4296	if (oldname != NULL)
				4297	xmlFree(oldname);
				4298	}
				4299	ctxt->instate = XML_PARSER_CONTENT;
				4300	#ifdef DEBUG_PUSH
				4301	xmlGenericError(xmlGenericErrorContext,
				4302	"HPP: entering CONTENT\n");
				4303	#endif
				4304	break;
				4305	}
				4306	case XML_PARSER_CONTENT: {
				4307	long cons;
				4308	/*
				4309	* Handle preparsed entities and charRef
				4310	*/
				4311	if (ctxt->token != 0) {
				4312	xmlChar chr[2] = { 0 , 0 } ;
				4313
				4314	chr[0] = (xmlChar) ctxt->token;
				4315	htmlCheckParagraph(ctxt);
				4316	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
				4317	ctxt->sax->characters(ctxt->userData, chr, 1);
				4318	ctxt->token = 0;
				4319	ctxt->checkIndex = 0;
				4320	}
				4321	if ((avail == 1) && (terminate)) {
				4322	cur = in->cur[0];
				4323	if ((cur != '<') && (cur != '&')) {
				4324	if (ctxt->sax != NULL) {
				4325	if (IS_BLANK(cur)) {
				4326	if (ctxt->sax->ignorableWhitespace != NULL)
				4327	ctxt->sax->ignorableWhitespace(
				4328	ctxt->userData, &cur, 1);
				4329	} else {
				4330	htmlCheckParagraph(ctxt);
				4331	if (ctxt->sax->characters != NULL)
				4332	ctxt->sax->characters(
				4333	ctxt->userData, &cur, 1);
				4334	}
				4335	}
				4336	ctxt->token = 0;
				4337	ctxt->checkIndex = 0;
				4338	NEXT;
William M. Brack	1633d18	2001-10-05 15:41:19 +0000	[diff] [blame]	4339	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4340	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4341	}
				4342	if (avail < 2)
				4343	goto done;
				4344	cur = in->cur[0];
				4345	next = in->cur[1];
				4346	cons = ctxt->nbChars;
				4347	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
				4348	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
				4349	/*
				4350	* Handle SCRIPT/STYLE separately
				4351	*/
				4352	if ((!terminate) &&
				4353	(htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
				4354	goto done;
				4355	htmlParseScript(ctxt);
				4356	if ((cur == '<') && (next == '/')) {
				4357	ctxt->instate = XML_PARSER_END_TAG;
				4358	ctxt->checkIndex = 0;
				4359	#ifdef DEBUG_PUSH
				4360	xmlGenericError(xmlGenericErrorContext,
				4361	"HPP: entering END_TAG\n");
				4362	#endif
				4363	break;
				4364	}
				4365	} else {
				4366	/*
				4367	* Sometimes DOCTYPE arrives in the middle of the document
				4368	*/
				4369	if ((cur == '<') && (next == '!') &&
				4370	(UPP(2) == 'D') && (UPP(3) == 'O') &&
				4371	(UPP(4) == 'C') && (UPP(5) == 'T') &&
				4372	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
				4373	(UPP(8) == 'E')) {
				4374	if ((!terminate) &&
				4375	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4376	goto done;
				4377	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4378	ctxt->sax->error(ctxt->userData,
				4379	"Misplaced DOCTYPE declaration\n");
				4380	ctxt->wellFormed = 0;
				4381	htmlParseDocTypeDecl(ctxt);
				4382	} else if ((cur == '<') && (next == '!') &&
				4383	(in->cur[2] == '-') && (in->cur[3] == '-')) {
				4384	if ((!terminate) &&
				4385	(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
				4386	goto done;
				4387	#ifdef DEBUG_PUSH
				4388	xmlGenericError(xmlGenericErrorContext,
				4389	"HPP: Parsing Comment\n");
				4390	#endif
				4391	htmlParseComment(ctxt);
				4392	ctxt->instate = XML_PARSER_CONTENT;
				4393	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
				4394	goto done;
				4395	} else if ((cur == '<') && (next == '/')) {
				4396	ctxt->instate = XML_PARSER_END_TAG;
				4397	ctxt->checkIndex = 0;
				4398	#ifdef DEBUG_PUSH
				4399	xmlGenericError(xmlGenericErrorContext,
				4400	"HPP: entering END_TAG\n");
				4401	#endif
				4402	break;
				4403	} else if (cur == '<') {
				4404	ctxt->instate = XML_PARSER_START_TAG;
				4405	ctxt->checkIndex = 0;
				4406	#ifdef DEBUG_PUSH
				4407	xmlGenericError(xmlGenericErrorContext,
				4408	"HPP: entering START_TAG\n");
				4409	#endif
				4410	break;
				4411	} else if (cur == '&') {
				4412	if ((!terminate) &&
				4413	(htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
				4414	goto done;
				4415	#ifdef DEBUG_PUSH
				4416	xmlGenericError(xmlGenericErrorContext,
				4417	"HPP: Parsing Reference\n");
				4418	#endif
				4419	/* TODO: check generation of subtrees if noent !!! */
				4420	htmlParseReference(ctxt);
				4421	} else {
				4422	/* TODO Avoid the extra copy, handle directly !!!!!! */
				4423	/*
				4424	* Goal of the following test is :
				4425	* - minimize calls to the SAX 'character' callback
				4426	* when they are mergeable
				4427	*/
				4428	if ((ctxt->inputNr == 1) &&
				4429	(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
				4430	if ((!terminate) &&
				4431	(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
				4432	goto done;
				4433	}
				4434	ctxt->checkIndex = 0;
				4435	#ifdef DEBUG_PUSH
				4436	xmlGenericError(xmlGenericErrorContext,
				4437	"HPP: Parsing char data\n");
				4438	#endif
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	4439	htmlParseCharData(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4440	}
				4441	}
				4442	if (cons == ctxt->nbChars) {
				4443	if (ctxt->node != NULL) {
				4444	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
				4445	ctxt->sax->error(ctxt->userData,
				4446	"detected an error in element content\n");
				4447	ctxt->wellFormed = 0;
				4448	}
				4449	NEXT;
				4450	break;
				4451	}
				4452
				4453	break;
				4454	}
				4455	case XML_PARSER_END_TAG:
				4456	if (avail < 2)
				4457	goto done;
				4458	if ((!terminate) &&
				4459	(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
				4460	goto done;
				4461	htmlParseEndTag(ctxt);
				4462	if (ctxt->nameNr == 0) {
				4463	ctxt->instate = XML_PARSER_EPILOG;
				4464	} else {
				4465	ctxt->instate = XML_PARSER_CONTENT;
				4466	}
				4467	ctxt->checkIndex = 0;
				4468	#ifdef DEBUG_PUSH
				4469	xmlGenericError(xmlGenericErrorContext,
				4470	"HPP: entering CONTENT\n");
				4471	#endif
				4472	break;
				4473	case XML_PARSER_CDATA_SECTION:
				4474	xmlGenericError(xmlGenericErrorContext,
				4475	"HPP: internal error, state == CDATA\n");
				4476	ctxt->instate = XML_PARSER_CONTENT;
				4477	ctxt->checkIndex = 0;
				4478	#ifdef DEBUG_PUSH
				4479	xmlGenericError(xmlGenericErrorContext,
				4480	"HPP: entering CONTENT\n");
				4481	#endif
				4482	break;
				4483	case XML_PARSER_DTD:
				4484	xmlGenericError(xmlGenericErrorContext,
				4485	"HPP: internal error, state == DTD\n");
				4486	ctxt->instate = XML_PARSER_CONTENT;
				4487	ctxt->checkIndex = 0;
				4488	#ifdef DEBUG_PUSH
				4489	xmlGenericError(xmlGenericErrorContext,
				4490	"HPP: entering CONTENT\n");
				4491	#endif
				4492	break;
				4493	case XML_PARSER_COMMENT:
				4494	xmlGenericError(xmlGenericErrorContext,
				4495	"HPP: internal error, state == COMMENT\n");
				4496	ctxt->instate = XML_PARSER_CONTENT;
				4497	ctxt->checkIndex = 0;
				4498	#ifdef DEBUG_PUSH
				4499	xmlGenericError(xmlGenericErrorContext,
				4500	"HPP: entering CONTENT\n");
				4501	#endif
				4502	break;
				4503	case XML_PARSER_PI:
				4504	xmlGenericError(xmlGenericErrorContext,
				4505	"HPP: internal error, state == PI\n");
				4506	ctxt->instate = XML_PARSER_CONTENT;
				4507	ctxt->checkIndex = 0;
				4508	#ifdef DEBUG_PUSH
				4509	xmlGenericError(xmlGenericErrorContext,
				4510	"HPP: entering CONTENT\n");
				4511	#endif
				4512	break;
				4513	case XML_PARSER_ENTITY_DECL:
				4514	xmlGenericError(xmlGenericErrorContext,
				4515	"HPP: internal error, state == ENTITY_DECL\n");
				4516	ctxt->instate = XML_PARSER_CONTENT;
				4517	ctxt->checkIndex = 0;
				4518	#ifdef DEBUG_PUSH
				4519	xmlGenericError(xmlGenericErrorContext,
				4520	"HPP: entering CONTENT\n");
				4521	#endif
				4522	break;
				4523	case XML_PARSER_ENTITY_VALUE:
				4524	xmlGenericError(xmlGenericErrorContext,
				4525	"HPP: internal error, state == ENTITY_VALUE\n");
				4526	ctxt->instate = XML_PARSER_CONTENT;
				4527	ctxt->checkIndex = 0;
				4528	#ifdef DEBUG_PUSH
				4529	xmlGenericError(xmlGenericErrorContext,
				4530	"HPP: entering DTD\n");
				4531	#endif
				4532	break;
				4533	case XML_PARSER_ATTRIBUTE_VALUE:
				4534	xmlGenericError(xmlGenericErrorContext,
				4535	"HPP: internal error, state == ATTRIBUTE_VALUE\n");
				4536	ctxt->instate = XML_PARSER_START_TAG;
				4537	ctxt->checkIndex = 0;
				4538	#ifdef DEBUG_PUSH
				4539	xmlGenericError(xmlGenericErrorContext,
				4540	"HPP: entering START_TAG\n");
				4541	#endif
				4542	break;
				4543	case XML_PARSER_SYSTEM_LITERAL:
				4544	xmlGenericError(xmlGenericErrorContext,
				4545	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
				4546	ctxt->instate = XML_PARSER_CONTENT;
				4547	ctxt->checkIndex = 0;
				4548	#ifdef DEBUG_PUSH
				4549	xmlGenericError(xmlGenericErrorContext,
				4550	"HPP: entering CONTENT\n");
				4551	#endif
				4552	break;
				4553	case XML_PARSER_IGNORE:
				4554	xmlGenericError(xmlGenericErrorContext,
				4555	"HPP: internal error, state == XML_PARSER_IGNORE\n");
				4556	ctxt->instate = XML_PARSER_CONTENT;
				4557	ctxt->checkIndex = 0;
				4558	#ifdef DEBUG_PUSH
				4559	xmlGenericError(xmlGenericErrorContext,
				4560	"HPP: entering CONTENT\n");
				4561	#endif
				4562	break;
				4563	}
				4564	}
				4565	done:
				4566	if ((avail == 0) && (terminate)) {
Daniel Veillard	a3bfca5	2001-04-12 15:42:58 +0000	[diff] [blame]	4567	htmlAutoCloseOnEnd(ctxt);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4568	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
				4569	/*
				4570	* SAX: end of the document processing.
				4571	*/
				4572	ctxt->instate = XML_PARSER_EOF;
				4573	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4574	ctxt->sax->endDocument(ctxt->userData);
				4575	}
				4576	}
				4577	if ((ctxt->myDoc != NULL) &&
				4578	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
				4579	(ctxt->instate == XML_PARSER_EPILOG))) {
				4580	xmlDtdPtr dtd;
				4581	dtd = xmlGetIntSubset(ctxt->myDoc);
				4582	if (dtd == NULL)
				4583	ctxt->myDoc->intSubset =
				4584	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
				4585	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
				4586	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
				4587	}
				4588	#ifdef DEBUG_PUSH
				4589	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
				4590	#endif
				4591	return(ret);
				4592	}
				4593
				4594	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4595	* htmlParseChunk:
				4596	* @ctxt: an XML parser context
				4597	* @chunk: an char array
				4598	* @size: the size in byte of the chunk
				4599	* @terminate: last chunk indicator
				4600	*
				4601	* Parse a Chunk of memory
				4602	*
				4603	* Returns zero if no error, the xmlParserErrors otherwise.
				4604	*/
				4605	int
				4606	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
				4607	int terminate) {
				4608	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4609	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
				4610	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
				4611	int cur = ctxt->input->cur - ctxt->input->base;
				4612
				4613	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4614	ctxt->input->base = ctxt->input->buf->buffer->content + base;
				4615	ctxt->input->cur = ctxt->input->base + cur;
				4616	#ifdef DEBUG_PUSH
				4617	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4618	#endif
				4619
				4620	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
				4621	htmlParseTryOrFinish(ctxt, terminate);
				4622	} else if (ctxt->instate != XML_PARSER_EOF) {
				4623	xmlParserInputBufferPush(ctxt->input->buf, 0, "");
				4624	htmlParseTryOrFinish(ctxt, terminate);
				4625	}
				4626	if (terminate) {
				4627	if ((ctxt->instate != XML_PARSER_EOF) &&
				4628	(ctxt->instate != XML_PARSER_EPILOG) &&
				4629	(ctxt->instate != XML_PARSER_MISC)) {
				4630	ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4631	ctxt->wellFormed = 0;
				4632	}
				4633	if (ctxt->instate != XML_PARSER_EOF) {
				4634	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
				4635	ctxt->sax->endDocument(ctxt->userData);
				4636	}
				4637	ctxt->instate = XML_PARSER_EOF;
				4638	}
				4639	return((xmlParserErrors) ctxt->errNo);
				4640	}
				4641
				4642	/************************************************************************
				4643	* *
				4644	* User entry points *
				4645	* *
				4646	************************************************************************/
				4647
				4648	/**
				4649	* htmlCreatePushParserCtxt :
				4650	* @sax: a SAX handler
				4651	* @user_data: The user data returned on SAX callbacks
				4652	* @chunk: a pointer to an array of chars
				4653	* @size: number of chars in the array
				4654	* @filename: an optional file name or URI
				4655	* @enc: an optional encoding
				4656	*
				4657	* Create a parser context for using the HTML parser in push mode
				4658	* To allow content encoding detection, @size should be >= 4
				4659	* The value of @filename is used for fetching external entities
				4660	* and error/warning reports.
				4661	*
				4662	* Returns the new parser context or NULL
				4663	*/
				4664	htmlParserCtxtPtr
				4665	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
				4666	const char chunk, int size, const char filename,
				4667	xmlCharEncoding enc) {
				4668	htmlParserCtxtPtr ctxt;
				4669	htmlParserInputPtr inputStream;
				4670	xmlParserInputBufferPtr buf;
				4671
				4672	buf = xmlAllocParserInputBuffer(enc);
				4673	if (buf == NULL) return(NULL);
				4674
				4675	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4676	if (ctxt == NULL) {
				4677	xmlFree(buf);
				4678	return(NULL);
				4679	}
				4680	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4681	htmlInitParserCtxt(ctxt);
				4682	if (sax != NULL) {
				4683	if (ctxt->sax != &htmlDefaultSAXHandler)
				4684	xmlFree(ctxt->sax);
				4685	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
				4686	if (ctxt->sax == NULL) {
				4687	xmlFree(buf);
				4688	xmlFree(ctxt);
				4689	return(NULL);
				4690	}
				4691	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
				4692	if (user_data != NULL)
				4693	ctxt->userData = user_data;
				4694	}
				4695	if (filename == NULL) {
				4696	ctxt->directory = NULL;
				4697	} else {
				4698	ctxt->directory = xmlParserGetDirectory(filename);
				4699	}
				4700
				4701	inputStream = htmlNewInputStream(ctxt);
				4702	if (inputStream == NULL) {
				4703	xmlFreeParserCtxt(ctxt);
				4704	return(NULL);
				4705	}
				4706
				4707	if (filename == NULL)
				4708	inputStream->filename = NULL;
				4709	else
				4710	inputStream->filename = xmlMemStrdup(filename);
				4711	inputStream->buf = buf;
				4712	inputStream->base = inputStream->buf->buffer->content;
				4713	inputStream->cur = inputStream->buf->buffer->content;
				4714
				4715	inputPush(ctxt, inputStream);
				4716
				4717	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
				4718	(ctxt->input->buf != NULL)) {
				4719	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
				4720	#ifdef DEBUG_PUSH
				4721	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
				4722	#endif
				4723	}
				4724
				4725	return(ctxt);
				4726	}
				4727
				4728	/**
				4729	* htmlSAXParseDoc :
				4730	* @cur: a pointer to an array of xmlChar
				4731	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4732	* @sax: the SAX handler block
				4733	* @userData: if using SAX, this pointer will be provided on callbacks.
				4734	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4735	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
				4736	* to handle parse events. If sax is NULL, fallback to the default DOM
				4737	* behavior and return a tree.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4738	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4739	* Returns the resulting document tree unless SAX is NULL or the document is
				4740	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4741	*/
				4742
				4743	htmlDocPtr
				4744	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
				4745	htmlDocPtr ret;
				4746	htmlParserCtxtPtr ctxt;
				4747
				4748	if (cur == NULL) return(NULL);
				4749
				4750
				4751	ctxt = htmlCreateDocParserCtxt(cur, encoding);
				4752	if (ctxt == NULL) return(NULL);
				4753	if (sax != NULL) {
				4754	ctxt->sax = sax;
				4755	ctxt->userData = userData;
				4756	}
				4757
				4758	htmlParseDocument(ctxt);
				4759	ret = ctxt->myDoc;
				4760	if (sax != NULL) {
				4761	ctxt->sax = NULL;
				4762	ctxt->userData = NULL;
				4763	}
				4764	htmlFreeParserCtxt(ctxt);
				4765
				4766	return(ret);
				4767	}
				4768
				4769	/**
				4770	* htmlParseDoc :
				4771	* @cur: a pointer to an array of xmlChar
				4772	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4773	*
				4774	* parse an HTML in-memory document and build a tree.
				4775	*
				4776	* Returns the resulting document tree
				4777	*/
				4778
				4779	htmlDocPtr
				4780	htmlParseDoc(xmlChar cur, const char encoding) {
				4781	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
				4782	}
				4783
				4784
				4785	/**
				4786	* htmlCreateFileParserCtxt :
				4787	* @filename: the filename
				4788	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4789	*
				4790	* Create a parser context for a file content.
				4791	* Automatic support for ZLIB/Compress compressed document is provided
				4792	* by default if found at compile-time.
				4793	*
				4794	* Returns the new parser context or NULL
				4795	*/
				4796	htmlParserCtxtPtr
				4797	htmlCreateFileParserCtxt(const char filename, const char encoding)
				4798	{
				4799	htmlParserCtxtPtr ctxt;
				4800	htmlParserInputPtr inputStream;
				4801	xmlParserInputBufferPtr buf;
				4802	/* htmlCharEncoding enc; */
				4803	xmlChar content, content_line = (xmlChar *) "charset=";
				4804
				4805	buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
				4806	if (buf == NULL) return(NULL);
				4807
				4808	ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
				4809	if (ctxt == NULL) {
				4810	perror("malloc");
				4811	return(NULL);
				4812	}
				4813	memset(ctxt, 0, sizeof(htmlParserCtxt));
				4814	htmlInitParserCtxt(ctxt);
				4815	inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
				4816	if (inputStream == NULL) {
				4817	perror("malloc");
				4818	xmlFree(ctxt);
				4819	return(NULL);
				4820	}
				4821	memset(inputStream, 0, sizeof(htmlParserInput));
				4822
				4823	inputStream->filename = xmlMemStrdup(filename);
				4824	inputStream->line = 1;
				4825	inputStream->col = 1;
				4826	inputStream->buf = buf;
				4827	inputStream->directory = NULL;
				4828
				4829	inputStream->base = inputStream->buf->buffer->content;
				4830	inputStream->cur = inputStream->buf->buffer->content;
				4831	inputStream->free = NULL;
				4832
				4833	inputPush(ctxt, inputStream);
				4834
				4835	/* set encoding */
				4836	if (encoding) {
				4837	content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
				4838	if (content) {
				4839	strcpy ((char )content, (char )content_line);
				4840	strcat ((char )content, (char )encoding);
				4841	htmlCheckEncoding (ctxt, content);
				4842	xmlFree (content);
				4843	}
				4844	}
				4845
				4846	return(ctxt);
				4847	}
				4848
				4849	/**
				4850	* htmlSAXParseFile :
				4851	* @filename: the filename
				4852	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4853	* @sax: the SAX handler block
				4854	* @userData: if using SAX, this pointer will be provided on callbacks.
				4855	*
				4856	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4857	* compressed document is provided by default if found at compile-time.
				4858	* It use the given SAX function block to handle the parsing callback.
				4859	* If sax is NULL, fallback to the default DOM tree building routines.
				4860	*
Daniel Veillard	4d65a1c	2001-07-04 22:06:23 +0000	[diff] [blame]	4861	* Returns the resulting document tree unless SAX is NULL or the document is
				4862	* not well formed.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	4863	*/
				4864
				4865	htmlDocPtr
				4866	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
				4867	void *userData) {
				4868	htmlDocPtr ret;
				4869	htmlParserCtxtPtr ctxt;
				4870	htmlSAXHandlerPtr oldsax = NULL;
				4871
				4872	ctxt = htmlCreateFileParserCtxt(filename, encoding);
				4873	if (ctxt == NULL) return(NULL);
				4874	if (sax != NULL) {
				4875	oldsax = ctxt->sax;
				4876	ctxt->sax = sax;
				4877	ctxt->userData = userData;
				4878	}
				4879
				4880	htmlParseDocument(ctxt);
				4881
				4882	ret = ctxt->myDoc;
				4883	if (sax != NULL) {
				4884	ctxt->sax = oldsax;
				4885	ctxt->userData = NULL;
				4886	}
				4887	htmlFreeParserCtxt(ctxt);
				4888
				4889	return(ret);
				4890	}
				4891
				4892	/**
				4893	* htmlParseFile :
				4894	* @filename: the filename
				4895	* @encoding: a free form C string describing the HTML document encoding, or NULL
				4896	*
				4897	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
				4898	* compressed document is provided by default if found at compile-time.
				4899	*
				4900	* Returns the resulting document tree
				4901	*/
				4902
				4903	htmlDocPtr
				4904	htmlParseFile(const char filename, const char encoding) {
				4905	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
				4906	}
				4907
				4908	/**
				4909	* htmlHandleOmittedElem:
				4910	* @val: int 0 or 1
				4911	*
				4912	* Set and return the previous value for handling HTML omitted tags.
				4913	*
				4914	* Returns the last value for 0 for no handling, 1 for auto insertion.
				4915	*/
				4916
				4917	int
				4918	htmlHandleOmittedElem(int val) {
				4919	int old = htmlOmittedDefaultValue;
				4920
				4921	htmlOmittedDefaultValue = val;
				4922	return(old);
				4923	}
				4924
				4925	#endif /* LIBXML_HTML_ENABLED */